# Preprocessing notebook for task 2
- The sector of the company has an influence on the type of tweet generated.
- In this notebook, we look up the sector of each company using Wikidata queries.

## Installing and importing the necessary libraries

In [None]:
!pip install pandas
!pip install requests
import pandas as pd
import requests

## Loading the training dataset to a dataframe
 Replace 'behaviour_simulation_train.xlsx' with the appropriate file path of the training dataset.

In [None]:
# Location of the training workbook; change this to the local path of the dataset.
train_path = 'behaviour_simulation_train.xlsx'
ds = pd.read_excel(train_path)

## Extracting the unique company names

In [None]:
# Collect every distinct value of the 'inferred company' column once.
company_column = ds['inferred company']
companies = company_column.unique()

## A dictionary is created which will map each company to its sector

In [None]:
# Start every company with its own empty list of sectors.
industry = {company: [] for company in companies}

## Function to return the list of sectors associated with a company
 This function searches Wikidata for the company by its name and passes the QID of each search result to **get_industries_for_qid()**, returning the first non-empty list of industries the company works in.

In [None]:
def get_entity_industries(entity_name, entity_type="item", timeout=10):
    """Search Wikidata for an entity by name and return its industries.

    The search results are tried in the order Wikidata returns them; the
    industries of the first result for which ``get_industries_for_qid``
    returns a non-empty list are returned.

    Parameters
    ----------
    entity_name : str
        Name to search for (e.g. a company name).
    entity_type : str, optional
        Wikidata entity type passed to the search (default ``"item"``).
    timeout : float, optional
        Seconds to wait for the Wikidata API before giving up (default 10).

    Returns
    -------
    list of str or None
        The industry labels of the first matching entity, or ``None`` when
        the name is blank / not a string, nothing matches, or the request
        fails. Failures are reported with a printed message instead of an
        exception so that a batch of lookups can keep going.
    """
    # A missing value (e.g. NaN from a dataframe) or a blank name cannot be
    # searched for; do not send it to the API.
    if not isinstance(entity_name, str) or not entity_name.strip():
        print(f"No matching entity found for {entity_name}")
        return None

    # Wikidata endpoint for entity search
    endpoint_url = "https://www.wikidata.org/w/api.php"

    # Wikidata query to search for an entity by name and type
    search_query = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "search": entity_name,
        "type": entity_type
    }

    try:
        # The request itself is inside the try block so that connection
        # errors and timeouts are handled like every other failure.
        response = requests.get(url=endpoint_url, params=search_query, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Loop through search results to find the first one with industries
        for result in data["search"]:
            industries = get_industries_for_qid(result["id"])
            if industries:
                return industries

        print(f"No matching entity found for {entity_name}")
        return None

    except (KeyError, IndexError, TypeError, ValueError, requests.RequestException) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error: {e}")
        return None


## Function to find the sectors of a company
 This function takes in the company QID and looks up the industries/sectors the company is associated with. The lookup returns the QID of each industry, and each of these QIDs is passed to **get_label_for_qid()**, which returns the name (label) of the industry/sector.

In [None]:

def get_industries_for_qid(qid, properties=("P452",), timeout=10):
    """Return the labels of the industries recorded for a Wikidata entity.

    Parameters
    ----------
    qid : str
        Wikidata identifier of the entity (e.g. a company).
    properties : sequence of str, optional
        Property ids to read the industries from, in order of preference.
        The statements of the first property that has any are used. The
        default is ``("P452",)``, the industry property.
    timeout : float, optional
        Seconds to wait for the Wikidata API before giving up (default 10).

    Returns
    -------
    list of str or None
        One label per statement whose value is an entity with a resolvable
        label (possibly an empty list), or ``None`` when the request fails
        or the response does not contain the entity's claims. Failures are
        reported with a printed ``Error: ...`` message.
    """
    # Wikidata endpoint for entity data
    endpoint_url = "https://www.wikidata.org/w/api.php"

    # Only the claims are needed here, so ask for nothing else.
    entity_query = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "claims",
        "languages": "en"
    }

    try:
        # The request itself is inside the try block so that connection
        # errors and timeouts are handled like every other failure.
        response = requests.get(url=endpoint_url, params=entity_query, timeout=timeout)
        response.raise_for_status()
        claims = response.json()["entities"][qid]["claims"]
    except (KeyError, IndexError, TypeError, ValueError, requests.RequestException) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error: {e}")
        return None

    # Guard against a claims value that is not a mapping (e.g. an empty list).
    if not isinstance(claims, dict):
        claims = {}

    # Use the statements of the first requested property that has any.
    industry_statements = []
    for property_id in properties:
        industry_statements = claims.get(property_id, [])
        if industry_statements:
            break

    # Collecting industry labels for all statements
    industries = []
    for statement in industry_statements:
        # Statements without a concrete entity value carry no "datavalue";
        # skip them instead of discarding every other industry.
        value = statement.get("mainsnak", {}).get("datavalue", {}).get("value")
        if not isinstance(value, dict) or "id" not in value:
            continue
        industry_label = get_label_for_qid(value["id"])
        if industry_label:
            industries.append(industry_label)

    return industries

## Function to return the label of a sector
This function takes in the qid and returns the label associated with it

In [None]:
def get_label_for_qid(qid, timeout=10):
    """Return the English label of a Wikidata entity.

    Parameters
    ----------
    qid : str
        Wikidata identifier of the entity (e.g. the QID of an industry).
    timeout : float, optional
        Seconds to wait for the Wikidata API before giving up (default 10).

    Returns
    -------
    str or None
        The English label, or ``None`` when the request fails, the response
        is not valid JSON, or the entity has no English label. Failures are
        reported with a printed ``Error: ...`` message.
    """
    # Wikidata endpoint for entity labels
    endpoint_url = "https://www.wikidata.org/w/api.php"

    # Only the labels are needed here, so ask for nothing else.
    label_query = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "labels",
        "languages": "en"
    }

    try:
        # The request itself is inside the try block so that connection
        # errors and timeouts are handled like every other failure.
        response = requests.get(url=endpoint_url, params=label_query, timeout=timeout)
        response.raise_for_status()
        return response.json()["entities"][qid]["labels"]["en"]["value"]

    except (KeyError, IndexError, TypeError, ValueError, requests.RequestException) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error: {e}")
        return None

## A company may be associated with multiple sectors

In [None]:
# Look up the sectors of every company that has not been resolved yet.
# Without this step `industry` still holds the empty lists it was initialised
# with, and the exported sheet would not contain a single sector.
for company, sectors in industry.items():
    if isinstance(sectors, str) or sectors:
        continue  # already resolved, or already flattened by an earlier run
    if not isinstance(company, str) or not company.strip():
        continue  # missing / blank names cannot be searched for
    try:
        found = get_entity_industries(company)
    except requests.RequestException as e:
        # One unreachable lookup must not abort the whole batch.
        print(f"Error: {e}")
        found = None
    industry[company] = found or []

# Flatten each list of sectors into one comma-separated string.
# Values that are already strings are left alone so the cell can be re-run,
# and companies without sectors get an empty string so the column has one type.
for company, sectors in industry.items():
    if isinstance(sectors, str):
        continue
    industry[company] = ', '.join(sectors or [])

## The industry dictionary is converted to a dataframe

In [None]:
# One (company, sector) row per dictionary entry.
company_sector_rows = [(name, sector) for name, sector in industry.items()]
df = pd.DataFrame(company_sector_rows, columns=['Company', 'Sector'])

## Converting the dataframe back to an excel sheet

In [None]:
# Write the company-to-sector table to a workbook in the working directory.
output_path = 'company2sector.xlsx'
df.to_excel(output_path)