# Consolidate NER Collection. 

In [1]:
# Python module. 
import os 
import numpy as np
import pandas as pd

# Change the current directory from (./notebook) to root directory. 
if os.getcwd().split("/")[-1] != "MADS-M2-estimating-news-impact-on-financial-market": 
	os.chdir("../..")

# For clearing safe warnings. Not important. 
from IPython.display import clear_output

# Custom configs. 
from source.config_py.config import DIR_DATASET, DIR_DATASET_COLLECTION, DIR_DATASET_LABELLINGS

# Preview. 
print(os.getcwd()) 

/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_694_695_milestone_2_Eric_Gilbert/submission/MADS-M2-financial-news-personalisation


## Configurations (general). 

In [None]:
# Pandas DF config. 
pd.set_option("display.max_rows", 50, "display.max_columns", 50, "display.max_colwidth", 200)

# For clearing the output. Not important. 
clear_output()

## Load & consolidate NER datasets. 

In [3]:
# Need to set (encoding_errors=ignore). Otherwise will raise error due to special characters. 
# Fine to ignore those characters. Will not affect the analysis later. 
df1 = pd.read_csv(f"{DIR_DATASET_COLLECTION}/NER_economics_and_corp_event.csv", encoding="utf-8", encoding_errors="ignore")
df2 = pd.read_csv(f"{DIR_DATASET_COLLECTION}/NER_observance_and_commodities.csv", encoding="utf-8", encoding_errors="ignore")
df3 = pd.read_csv(f"{DIR_DATASET_COLLECTION}/NER_ws_event_and_earnings_and_sector.csv", encoding="utf-8", encoding_errors="ignore") 

# Consolidate the dataframes. 
df_ner_collection = pd.concat([df1, df2, df3]) 

# Preview. 
df_ner_collection

Unnamed: 0,sentence_id,sentence,categories,entities,notes
0,,"The FOMC has committed to using rates, not ass...",fomc_conference,economics,
1,,He noted that the FOMC made exceptional progre...,fomc_conference,economics,
2,,"""Many participants noted that one or more 50 b...",fomc_conference,economics,
3,,The FOMC meets eight times a year todiscuss mo...,fomc_conference,economics,
4,,The 12 members of theFOMCmeet eight times a ye...,fomc_conference,economics,
...,...,...,...,...,...
167,,TradeStation data shows that the S&P 500 Commu...,sector_telecomm,sector,
168,,"Facebook, Netflix and Alphabet, parent of Goog...",sector_telecomm,sector,
169,,Analysts do not expect a big market move when ...,sector_telecomm,sector,
170,,The new communications services industry will ...,sector_telecomm,sector,


## Process the NER dataset. 

In [4]:
# Eliminate white space at both end of the sentence. 
df_ner_collection["sentence"] = df_ner_collection["sentence"].str.strip() 

# Tag each sentence with an ID for mapping later. 
df_ner_collection["sentence_id"] = df_ner_collection.reset_index(drop=True).index

# Preview. 
df_ner_collection

Unnamed: 0,sentence_id,sentence,categories,entities,notes
0,0,"The FOMC has committed to using rates, not ass...",fomc_conference,economics,
1,1,He noted that the FOMC made exceptional progre...,fomc_conference,economics,
2,2,"""Many participants noted that one or more 50 b...",fomc_conference,economics,
3,3,The FOMC meets eight times a year todiscuss mo...,fomc_conference,economics,
4,4,The 12 members of theFOMCmeet eight times a ye...,fomc_conference,economics,
...,...,...,...,...,...
167,564,TradeStation data shows that the S&P 500 Commu...,sector_telecomm,sector,
168,565,"Facebook, Netflix and Alphabet, parent of Goog...",sector_telecomm,sector,
169,566,Analysts do not expect a big market move when ...,sector_telecomm,sector,
170,567,The new communications services industry will ...,sector_telecomm,sector,


In [5]:
# Restructure the dataframe to save in (JSONL) format for labelling later. 
df_ner_jsonl = pd.DataFrame()
df_ner_jsonl["sentence"] = df_ner_collection["sentence"] 
df_ner_jsonl["label"] = [[[]] for _ in range(len(df_ner_jsonl))] 

# Preview. 
df_ner_jsonl

Unnamed: 0,sentence,label
0,"The FOMC has committed to using rates, not ass...",[[]]
1,He noted that the FOMC made exceptional progre...,[[]]
2,"""Many participants noted that one or more 50 b...",[[]]
3,The FOMC meets eight times a year todiscuss mo...,[[]]
4,The 12 members of theFOMCmeet eight times a ye...,[[]]
...,...,...
167,TradeStation data shows that the S&P 500 Commu...,[[]]
168,"Facebook, Netflix and Alphabet, parent of Goog...",[[]]
169,Analysts do not expect a big market move when ...,[[]]
170,The new communications services industry will ...,[[]]


## Save the consolidated NER dataset. 

In [6]:
# Save in (CSV) to keep the record. 
df_ner_collection.to_csv(f"{DIR_DATASET_LABELLINGS}/NER_entities_collection.csv", index=False) 

# Save in (JSONL) for labelling later. 
df_ner_jsonl.to_json(f"{DIR_DATASET_LABELLINGS}/NER_entities_collection.json", orient="records", lines=True) 