# Process NER Labels. 

In [1]:
# Python module. 
import os 
import numpy as np
import pandas as pd
from collections import defaultdict

# Change the current directory from (./notebook) to the project root directory. 
# The root is recognised by the (source/config_py) package imported below instead of 
# by its folder name, so the cell keeps working if the repository folder is renamed 
# and re-running the cell does not climb further up the directory tree. 
dir_root = os.getcwd() 
while not os.path.isdir(os.path.join(dir_root, "source", "config_py")): 
	dir_parent = os.path.dirname(dir_root) 

	# Reached the filesystem root without finding the project. 
	if dir_parent == dir_root: 
		raise FileNotFoundError("Cannot locate the project root: no 'source/config_py' directory found in any parent directory.") 
	dir_root = dir_parent 
os.chdir(dir_root) 

# For clearing safe warnings. Not important. 
from IPython.display import clear_output

# Custom configs. 
from source.config_py.config import DIR_DATASET, DIR_DATASET_COLLECTION, DIR_DATASET_LABELLINGS

# Preview. 
print(os.getcwd()) 

/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_694_695_milestone_2_Eric_Gilbert/submission/MADS-M2-financial-news-personalisation


## Configurations (general). 

In [2]:
# Pandas DF config: cap the rows, columns and characters per cell shown in previews. 
for option_name in ("display.max_rows", "display.max_columns", "display.max_colwidth"): 
	pd.set_option(option_name, 50) 

# Wipe any output produced by the cell so far. Not important. 
clear_output()

## Experimental preprocessing on sentence split and span. 

In [3]:
# Build a one-row toy dataframe to try the processing steps on. 
sentence_exm = "This is a random; 'sentence', just for exploration 'only', don't bother-about/the grammer mistake."
df_entities_exm = pd.DataFrame({"id": [1], "sentence": [sentence_exm]}) 

# Split at every regex word boundary. Nothing is dropped: the whitespace and 
# punctuation between the words are kept as tokens of their own. 
df_entities_exm = df_entities_exm.assign(token=df_entities_exm["sentence"].str.split(r"\b")) 

# Preview. 
df_entities_exm

Unnamed: 0,id,sentence,token
0,1,"This is a random; 'sentence', just for explora...","[, This, , is, , a, , random, ; ', sentence..."


In [4]:
# One row per token: unpack every token list into consecutive rows. 
# All rows coming from the same sentence keep that sentence's index label. 
df_token_exm = df_entities_exm.explode("token") 

# Preview. 
df_token_exm

Unnamed: 0,id,sentence,token
0,1,"This is a random; 'sentence', just for explora...",
0,1,"This is a random; 'sentence', just for explora...",This
0,1,"This is a random; 'sentence', just for explora...",
0,1,"This is a random; 'sentence', just for explora...",is
0,1,"This is a random; 'sentence', just for explora...",
0,1,"This is a random; 'sentence', just for explora...",a
0,1,"This is a random; 'sentence', just for explora...",
0,1,"This is a random; 'sentence', just for explora...",random
0,1,"This is a random; 'sentence', just for explora...",; '
0,1,"This is a random; 'sentence', just for explora...",sentence


In [5]:
# Get the str length to calculate span (str indices) at the start and end later. 
# We need span to extract the entities. Other third-party NLP pipelines 
# also require spans as input. 
df_token_exm["len"] = df_token_exm["token"].str.len()

# Get the span for each token for each sentence. 
# The end of a token is the running total of the token lengths within its sentence. 
# (cumsum) is a transform, not an aggregation, so call it on the groupby directly 
# instead of routing it through (agg), which relies on deprecated pandas behaviour. 
df_token_exm["span_end"] = df_token_exm.groupby("id")["len"].cumsum() 

# The start of a token is the end of the previous token in the same sentence. 
# The first token of each sentence has no predecessor and starts at 0. 
df_token_exm["span_beg"] = df_token_exm.groupby("id")["span_end"].shift(periods=1) 
df_token_exm["span_beg"] = df_token_exm["span_beg"].fillna(0.0).astype(int) 
df_token_exm["span"] = df_token_exm[["span_beg", "span_end"]].values.tolist() 

# Preview. 
df_token_exm 

Unnamed: 0,id,sentence,token,len,span_end,span_beg,span
0,1,"This is a random; 'sentence', just for explora...",,0,0,0,"[0, 0]"
0,1,"This is a random; 'sentence', just for explora...",This,4,4,0,"[0, 4]"
0,1,"This is a random; 'sentence', just for explora...",,1,5,4,"[4, 5]"
0,1,"This is a random; 'sentence', just for explora...",is,2,7,5,"[5, 7]"
0,1,"This is a random; 'sentence', just for explora...",,1,8,7,"[7, 8]"
0,1,"This is a random; 'sentence', just for explora...",a,1,9,8,"[8, 9]"
0,1,"This is a random; 'sentence', just for explora...",,1,10,9,"[9, 10]"
0,1,"This is a random; 'sentence', just for explora...",random,6,16,10,"[10, 16]"
0,1,"This is a random; 'sentence', just for explora...",; ',3,19,16,"[16, 19]"
0,1,"This is a random; 'sentence', just for explora...",sentence,8,27,19,"[19, 27]"


## Preprocessing on sentence split and span. 

In [6]:
# Load the labelled sentences: one JSON record per line. 
# Bytes that cannot be decoded as UTF-8 are dropped silently. 
path_entities = f"{DIR_DATASET}/NER_entities_labelled.jsonl"
df_entities = pd.read_json(
	path_entities, 
	orient="records", 
	lines=True, 
	encoding="utf-8", 
	encoding_errors="ignore", 
)

# Preview. 
df_entities

Unnamed: 0,id,text,relations,entities,label
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]]
1,2,He noted that the FOMC made exceptional progre...,"[{'id': 23, 'from_id': 3, 'to_id': 4951, 'type...","[{'id': 3, 'start_offset': 18, 'end_offset': 2...",[[]]
2,3,"""Many participants noted that one or more 50 b...","[{'id': 31, 'from_id': 5933, 'to_id': 13, 'typ...","[{'id': 13, 'start_offset': 268, 'end_offset':...",[[]]
3,4,The FOMC meets eight times a year todiscuss mo...,"[{'id': 15, 'from_id': 23, 'to_id': 36, 'type'...","[{'id': 23, 'start_offset': 4, 'end_offset': 8...",[[]]
4,5,The 12 members of theFOMCmeet eight times a ye...,"[{'id': 19, 'from_id': 38, 'to_id': 40, 'type'...","[{'id': 38, 'start_offset': 21, 'end_offset': ...",[[]]
...,...,...,...,...,...
564,565,TradeStation data shows that the S&P 500 Commu...,[],"[{'id': 5864, 'start_offset': 0, 'end_offset':...",[[]]
565,566,"Facebook, Netflix and Alphabet, parent of Goog...",[],"[{'id': 5876, 'start_offset': 0, 'end_offset':...",[[]]
566,567,Analysts do not expect a big market move when ...,[],"[{'id': 5887, 'start_offset': 0, 'end_offset':...",[[]]
567,568,The new communications services industry will ...,[],"[{'id': 5901, 'start_offset': 8, 'end_offset':...",[[]]


### Keep the labels separately for processing later. 

In [7]:
# Pull the labels out as arrays of [sentence id, list of label dicts] pairs 
# so they can be processed later independently of the dataframe. 
label_relation = df_entities.loc[:, ["id", "relations"]].to_numpy() 
label_entities = df_entities.loc[:, ["id", "entities"]].to_numpy() 

# Preview. 
label_relation[:5], label_entities[:5]

(array([[1,
         list([{'id': 21, 'from_id': 1, 'to_id': 5914, 'type': 'CAUSE'}, {'id': 22, 'from_id': 1, 'to_id': 5918, 'type': 'CAUSE'}])],
        [2,
         list([{'id': 23, 'from_id': 3, 'to_id': 4951, 'type': 'STATUS'}, {'id': 24, 'from_id': 3, 'to_id': 5919, 'type': 'STATUS'}, {'id': 29, 'from_id': 5932, 'to_id': 4952, 'type': 'OCCUR_AT'}, {'id': 30, 'from_id': 5921, 'to_id': 5932, 'type': 'OCCUR_AT'}])],
        [3,
         list([{'id': 31, 'from_id': 5933, 'to_id': 13, 'type': 'OCCUR_AT'}, {'id': 32, 'from_id': 5926, 'to_id': 5927, 'type': 'EFFECT'}, {'id': 33, 'from_id': 5936, 'to_id': 18, 'type': 'EFFECT'}, {'id': 34, 'from_id': 5936, 'to_id': 5934, 'type': 'OCCUR_AT'}])],
        [4,
         list([{'id': 15, 'from_id': 23, 'to_id': 36, 'type': 'OCCUR_AT'}, {'id': 39, 'from_id': 27, 'to_id': 28, 'type': 'ACT_ON'}, {'id': 40, 'from_id': 29, 'to_id': 5938, 'type': 'ACT_ON'}, {'id': 41, 'from_id': 30, 'to_id': 31, 'type': 'ACT_ON'}, {'id': 42, 'from_id': 30, 'to_id': 32

### Process sentence split and span. 

In [8]:
# Split at every regex word boundary (\b), i.e. wherever a word character 
# (letter, digit or underscore) meets a non-word character. 
# Example (random; token) will become [random, ; , token]. 
# White spaces are not discarded: they are kept as tokens of their own. 
df_entities = df_entities.assign(token=df_entities["text"].str.split(r"\b")) 

# Preview. 
df_entities

Unnamed: 0,id,text,relations,entities,label,token
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],"[, The, , FOMC, , has, , committed, , to, ..."
1,2,He noted that the FOMC made exceptional progre...,"[{'id': 23, 'from_id': 3, 'to_id': 4951, 'type...","[{'id': 3, 'start_offset': 18, 'end_offset': 2...",[[]],"[, He, , noted, , that, , the, , FOMC, , ..."
2,3,"""Many participants noted that one or more 50 b...","[{'id': 31, 'from_id': 5933, 'to_id': 13, 'typ...","[{'id': 13, 'start_offset': 268, 'end_offset':...",[[]],"["", Many, , participants, , noted, , that, ..."
3,4,The FOMC meets eight times a year todiscuss mo...,"[{'id': 15, 'from_id': 23, 'to_id': 36, 'type'...","[{'id': 23, 'start_offset': 4, 'end_offset': 8...",[[]],"[, The, , FOMC, , meets, , eight, , times,..."
4,5,The 12 members of theFOMCmeet eight times a ye...,"[{'id': 19, 'from_id': 38, 'to_id': 40, 'type'...","[{'id': 38, 'start_offset': 21, 'end_offset': ...",[[]],"[, The, , 12, , members, , of, , theFOMCme..."
...,...,...,...,...,...,...
564,565,TradeStation data shows that the S&P 500 Commu...,[],"[{'id': 5864, 'start_offset': 0, 'end_offset':...",[[]],"[, TradeStation, , data, , shows, , that, ..."
565,566,"Facebook, Netflix and Alphabet, parent of Goog...",[],"[{'id': 5876, 'start_offset': 0, 'end_offset':...",[[]],"[, Facebook, , , Netflix, , and, , Alphabet,..."
566,567,Analysts do not expect a big market move when ...,[],"[{'id': 5887, 'start_offset': 0, 'end_offset':...",[[]],"[, Analysts, , do, , not, , expect, , a, ..."
567,568,The new communications services industry will ...,[],"[{'id': 5901, 'start_offset': 8, 'end_offset':...",[[]],"[, The, , new, , communications, , services..."


In [9]:
# One row per token: unpack every token list into consecutive rows. 
# All rows coming from the same sentence keep that sentence's index label. 
df_token = df_entities.explode("token") 

# Preview. 
df_token

Unnamed: 0,id,text,relations,entities,label,token
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],The
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],FOMC
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],
...,...,...,...,...,...,...
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],at
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],CFRA


In [10]:
# Get the str length to calculate span (str indices) at the start and end later. 
# We need span to extract the entities. Other third-party NLP pipelines 
# also require spans as input. 
df_token["len"] = df_token["token"].str.len() 

# Get the span for each token for each sentence. 
# The end of a token is the running total of the token lengths within its sentence. 
# (cumsum) is a transform, not an aggregation, so call it on the groupby directly 
# instead of routing it through (agg), which relies on deprecated pandas behaviour. 
df_token["span_end"] = df_token.groupby("id")["len"].cumsum() 

# The start of a token is the end of the previous token in the same sentence. 
# The first token of each sentence has no predecessor and starts at 0. 
df_token["span_beg"] = df_token.groupby("id")["span_end"].shift(periods=1) 
df_token["span_beg"] = df_token["span_beg"].fillna(0.0).astype(int) 
df_token["span"] = df_token[["span_beg", "span_end"]].values.tolist() 

# Preview. 
df_token

Unnamed: 0,id,text,relations,entities,label,token,len,span_end,span_beg,span
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,0,0,0,"[0, 0]"
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],The,3,3,0,"[0, 3]"
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,4,3,"[3, 4]"
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],FOMC,4,8,4,"[4, 8]"
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,9,8,"[8, 9]"
...,...,...,...,...,...,...,...,...,...,...
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,214,213,"[213, 214]"
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],at,2,216,214,"[214, 216]"
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,217,216,"[216, 217]"
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],CFRA,4,221,217,"[217, 221]"


## Preprocess tokens and add in the entities. 

### Process the tokens. 

In [11]:
# Add new column to store the entities label. Every token starts as "O" (outside any entity). 
df_token["tag_entity"] = "O" 

# To track errors for attached tokens when processing the labels. 
# Keyed by sentence id: the offending entities, and how many there were. 
# The built-in factories (list, int) replace the lambdas: same defaults, but the 
# dictionaries can be pickled and their repr shows the default type. 
dict_sent_id_error = defaultdict(list) 
dict_sent_id_error_count = defaultdict(int) 

In [12]:
# Split "attached" tokens (words glued together in the raw text, e.g. theFOMCmeet) 
# so that every labelled entity lines up with a token of its own. The result is 
# stored in a new dataframe (df_processed_token) to avoid overwriting (df_token). 

# Start from clean error trackers so that re-running this cell does not 
# double count the errors recorded by a previous run. 
dict_sent_id_error.clear() 
dict_sent_id_error_count.clear() 

# Collect one processed dataframe per sentence and concatenate them once at the end. 
# Growing a dataframe with (pd.concat) inside the loop copies all earlier rows on 
# every pass and starts from an empty frame, whose handling pandas is changing. 
list_processed_sent = [] 

# Each (label_entities) contain [ sent_id: Int, entities:List[ Dict[Text, Any] ] ]. 
for label in label_entities: 
	sent_id, entities = tuple(label) 

	# Partition the sentence from other sentences to process 1 sentence at a time. 
	boo_sent_id = df_token["id"] == sent_id 
	df_process_sent = df_token.loc[boo_sent_id, :].copy() 

	# Character positions covered by attached tokens that have already been split. 
	span_tracking = set() 

	for entity in entities: 
		start_offset, end_offset = entity["start_offset"], entity["end_offset"]

		# Skip an entity whose first and last characters both fall inside an attached 
		# token that has already been split. It is still recorded as an error. 
		if (start_offset in span_tracking) and ((end_offset - 1) in span_tracking): 
			dict_sent_id_error[sent_id].append({"sent_id": sent_id, "entity": entity}) 
			dict_sent_id_error_count[sent_id] += 1 
			continue 

		# Set conditions for filter. 
		# 1) Filter spacing. 
		# 2) Find the token(s) that start inside the span range [start_offset, end_offset). 
		# The masks are rebuilt for every entity because a split below adds rows. 
		boo_spacing = df_process_sent["token"].str.strip() != "" 
		boo_span = df_process_sent["span_beg"].between(start_offset, end_offset, inclusive="left") 
		boo_conditions = boo_spacing & boo_span 

		# Get the # of token(s) after the filtering. 
		nrows = boo_conditions.sum() 

		# If can't detect the token(s), split the attached token into separate rows. 
		# Also record the error for debugging later. 
		if nrows == 0: 
			dict_sent_id_error[sent_id].append({"sent_id": sent_id, "entity": entity}) 
			dict_sent_id_error_count[sent_id] += 1
			
			# No token starts inside the entity span, so look for the non-blank 
			# token(s) that overlap the span instead and process them. 
			boo_upper_bound = df_process_sent["span_beg"] < end_offset
			boo_lower_bound = df_process_sent["span_end"] > start_offset 
			boo_conditions = boo_spacing & boo_upper_bound & boo_lower_bound 

			# For debugging. 
			print("SENT:", sent_id)
			print("TOKN:", df_process_sent.loc[boo_conditions, "token"]) 

			# Nothing overlaps the span (e.g. the entity only covers white space or 
			# its offsets lie outside the text). There is no token to split; the 
			# error has been recorded above, so move on to the next entity. 
			if not boo_conditions.any(): 
				continue 

			# The following algorithm only assumes 1 attached token. It only process 1 token or row. 
			# Example (TheFOMCmember). Not (TheFOMCmember, makeFFR), which will be 2 tokens or rows. 

			# Split attached token into separate tokens. Extract the key token within the 
			# attached token. The key token should be in the middle. Key token refers to 
			# the one not being labelled as "O". 
			span_beg, span_end = (
				df_process_sent.loc[boo_conditions, "span_beg"].values[0], 
				df_process_sent.loc[boo_conditions, "span_end"].values[0], 
			)
			token_attached = df_process_sent.loc[boo_conditions, "token"].values[0] 

			# Track the overlapping span. 
			span_tracking.update(range(span_beg, span_end)) 

			# Recalculate the span, relative to the start of the attached token. 
			span_token_1st_end = start_offset - span_beg 
			span_token_2nd_end = span_token_1st_end + (end_offset - start_offset) 

			# Reassign the tokens: text before the entity, the entity, text after the entity. 
			# The first or last piece is an empty string when the entity sits at an edge. 
			token_attached_sep = [
				token_attached[:span_token_1st_end], 
				token_attached[span_token_1st_end:span_token_2nd_end], 
				token_attached[span_token_2nd_end:], 
			]

			# Reassign the start of the span (the indices). 
			token_span_beg_sep = [
				span_beg, 
				span_beg + span_token_1st_end, 
				span_beg + span_token_1st_end + (end_offset - start_offset), 
			]

			# Reassign the end of the span (the indices). 
			token_span_end_sep = [
				span_beg + span_token_1st_end, 
				span_beg + span_token_1st_end + (end_offset - start_offset), 
				span_end, 
			]

			# Need to do this so that you can add a list object together with the integers. 
			# The original dtype is integer. 
			df_process_sent.loc[boo_conditions, "span_beg"] = df_process_sent.loc[boo_conditions, "span_beg"].astype("object")
			df_process_sent.loc[boo_conditions, "span_end"] = df_process_sent.loc[boo_conditions, "span_end"].astype("object")

			# Need double square brackets due to (pandas) error. 
			# The outcome will be [[item, item, item]]. 
			df_process_sent.loc[boo_conditions, "token"] = [[token_attached_sep]] 
			df_process_sent.loc[boo_conditions, "span_beg"] = [[token_span_beg_sep]] 
			df_process_sent.loc[boo_conditions, "span_end"] = [[token_span_end_sep]] 

			# Need to do the (explode) twice since we have 2 square brackets. 
			cols_explode = ["token", "span_beg", "span_end"]
			df_process_sent = df_process_sent.explode(cols_explode).explode(cols_explode) 

		# Sort it to ensure the order is correct. 
		df_process_sent = df_process_sent.sort_values(by=["id", "span_beg", "span_end"], ascending=True) 

	# Keep the processed sentence for the final consolidation. 
	list_processed_sent.append(df_process_sent) 

# Consolidate all the processed sentences in one go. 
# (pd.concat) rejects an empty list, so fall back to an empty dataframe. 
df_processed_token = pd.concat(list_processed_sent, axis="index") if list_processed_sent else pd.DataFrame() 

SENT: 5
TOKN: 4    theFOMCmeet
Name: token, dtype: object
SENT: 37
TOKN: 36    bondclimbed
Name: token, dtype: object
SENT: 38
TOKN: 37    noterose
Name: token, dtype: object
SENT: 57
TOKN: 56    latestinflationary
Name: token, dtype: object
SENT: 60
TOKN: 59    arecent
Name: token, dtype: object
SENT: 67
TOKN: 66    ofinflationthat
Name: token, dtype: object
SENT: 70
TOKN: 69    TheISM
Name: token, dtype: object
SENT: 70
TOKN: 69    andServices
Name: token, dtype: object
SENT: 70
TOKN: 69    BusinessManufacturing
Name: token, dtype: object
SENT: 72
TOKN: 71    themanufacturing
Name: token, dtype: object
SENT: 72
TOKN: 71    ISMReport
Name: token, dtype: object
SENT: 72
TOKN: 71    latestManufacturing
Name: token, dtype: object
SENT: 77
TOKN: 76    andinventories
Name: token, dtype: object
SENT: 80
TOKN: 79    BusinessManufacturing
Name: token, dtype: object
SENT: 87
TOKN: 86    latestServices
Name: token, dtype: object
SENT: 87
TOKN: 86    ISMReport
Name: token, dtype: object
SENT: 87

In [13]:
# Preview the consolidated tokens of all sentences. 
df_processed_token

Unnamed: 0,id,text,relations,entities,label,token,len,span_end,span_beg,span,tag_entity
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,0,0,0,"[0, 0]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],The,3,3,0,"[0, 3]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,4,3,"[3, 4]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],FOMC,4,8,4,"[4, 8]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,9,8,"[8, 9]",O
...,...,...,...,...,...,...,...,...,...,...,...
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,214,213,"[213, 214]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],at,2,216,214,"[214, 216]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,217,216,"[216, 217]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],CFRA,4,221,217,"[217, 221]",O


### Update the token span. 

In [14]:
# Splitting the attached tokens changed some tokens and their begin/end indices, 
# so the derived (len) and (span) columns are rebuilt from the current values. 
# We probably will not use the (len) and (span) columns so we can skip this step also. 
df_processed_token["len"] = df_processed_token["token"].str.len() 

span_pairs = df_processed_token.loc[:, ["span_beg", "span_end"]].to_numpy().tolist() 
df_processed_token["span"] = span_pairs 

# Preview. 
df_processed_token

Unnamed: 0,id,text,relations,entities,label,token,len,span_end,span_beg,span,tag_entity
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,0,0,0,"[0, 0]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],The,3,3,0,"[0, 3]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,4,3,"[3, 4]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],FOMC,4,8,4,"[4, 8]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,9,8,"[8, 9]",O
...,...,...,...,...,...,...,...,...,...,...,...
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,214,213,"[213, 214]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],at,2,216,214,"[214, 216]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,217,216,"[216, 217]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],CFRA,4,221,217,"[217, 221]",O


In [15]:
# Check which sentences contain attached tokens for the ones 
# not labelled as "O". Maps sentence id -> number of entities that were 
# recorded as errors while processing the tokens. 
dict_sent_id_error_count 

defaultdict(<function __main__.<lambda>()>,
            {5: 2,
             37: 1,
             38: 1,
             57: 1,
             60: 1,
             67: 1,
             70: 3,
             72: 3,
             77: 1,
             80: 1,
             87: 4,
             106: 1,
             115: 1,
             124: 1,
             128: 1,
             131: 2,
             135: 1,
             136: 1,
             143: 1,
             154: 1,
             155: 1,
             161: 2,
             178: 1,
             180: 1,
             182: 2,
             187: 1,
             193: 1,
             195: 1,
             202: 1,
             207: 4,
             210: 2,
             223: 1,
             228: 1,
             229: 1,
             234: 1,
             300: 2,
             451: 1})

In [16]:
# For debugging only. Change the ID to check specific sentence. 
cols_debug = ["entities", "token", "len", "span_beg", "span_end", "span", "tag_entity"]
boo_debug_sent = df_processed_token["id"] == 5 
df_processed_token.loc[boo_debug_sent, cols_debug].head(30) 

Unnamed: 0,entities,token,len,span_beg,span_end,span,tag_entity
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,0,0,0,"[0, 0]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",The,3,0,3,"[0, 3]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,3,4,"[3, 4]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",12,2,4,6,"[4, 6]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,6,7,"[6, 7]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",members,7,7,14,"[7, 14]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,14,15,"[14, 15]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",of,2,15,17,"[15, 17]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,17,18,"[17, 18]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",the,3,18,21,"[18, 21]",O


### Add in the entities. 

In [17]:
# BIO prefixes keyed by the position of a token inside its entity: 
# position 0 gets "B-" (beginning), every other position gets "I-" (inside). 
dict_bio = defaultdict(lambda: "I-") 
dict_bio[0] = "B-" 

# White-space-only tokens never receive a tag. The (token) column is not modified 
# in this cell, so the mask is built once instead of once per entity. 
boo_spacing = df_processed_token["token"].str.strip() != "" 

# Each (label_entities) contain [ sent_id: Int, entities:List[ Dict[Text, Any] ] ]. 
for label in label_entities: 
	sent_id, entities = tuple(label) 

	# Non-blank tokens of this sentence. Identical for all entities of the 
	# sentence, so it is built once per sentence. 
	boo_sent_id = df_processed_token["id"] == sent_id 
	boo_sent_tokens = boo_sent_id & boo_spacing 

	for entity in entities: 
		start_offset, end_offset = entity["start_offset"], entity["end_offset"]

		# Find the token(s) of the sentence that start inside the span range 
		# [start_offset, end_offset). 
		boo_span = df_processed_token["span_beg"].between(start_offset, end_offset, inclusive="left") 
		boo_conditions = boo_sent_tokens & boo_span 

		# Get the # of token(s). 
		nrows = boo_conditions.sum() 

		# No token starts inside the span: nothing to tag for this entity. 
		if nrows == 0: 
			continue 

		# Tag the token(s) in order: "B-<label>" for the first one and "I-<label>" 
		# for the following ones. An entity processed later overwrites the tags 
		# written by an earlier entity on the same tokens. 
		df_processed_token.loc[boo_conditions, "tag_entity"] = [
			f'''{dict_bio[i]}{entity["label"]}''' for i in range(nrows) 
		]


# Preview. 
df_processed_token

Unnamed: 0,id,text,relations,entities,label,token,len,span_end,span_beg,span,tag_entity
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,0,0,0,"[0, 0]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],The,3,3,0,"[0, 3]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,4,3,"[3, 4]",O
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],FOMC,4,8,4,"[4, 8]",ORG
0,1,"The FOMC has committed to using rates, not ass...","[{'id': 21, 'from_id': 1, 'to_id': 5914, 'type...","[{'id': 1, 'start_offset': 4, 'end_offset': 8,...",[[]],,1,9,8,"[8, 9]",O
...,...,...,...,...,...,...,...,...,...,...,...
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,214,213,"[213, 214]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],at,2,216,214,"[214, 216]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],,1,217,216,"[216, 217]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],CFRA,4,221,217,"[217, 221]",ORG


In [18]:
# For debugging only. Change the ID to check specific sentence. 
cols_debug = ["entities", "token", "len", "span_beg", "span_end", "span", "tag_entity"]
boo_debug_sent = df_processed_token["id"] == 5 
df_processed_token.loc[boo_debug_sent, cols_debug].head(30) 

Unnamed: 0,entities,token,len,span_beg,span_end,span,tag_entity
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,0,0,0,"[0, 0]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",The,3,0,3,"[0, 3]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,3,4,"[3, 4]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",12,2,4,6,"[4, 6]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,6,7,"[6, 7]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",members,7,7,14,"[7, 14]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,14,15,"[14, 15]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",of,2,15,17,"[15, 17]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",,1,17,18,"[17, 18]",O
4,"[{'id': 38, 'start_offset': 21, 'end_offset': ...",the,3,18,21,"[18, 21]",O


### Remove white spaces. 

In [19]:
# Drop the white-space tokens. We don't need them for model training. 
# They had to stay until now because their lengths count towards the begin and 
# end indices of the other tokens. The spans are left untouched here, so they 
# still refer to positions in the original text. 
boo_not_spacing = df_processed_token["token"].str.strip() != "" 
df_final_output = df_processed_token.loc[boo_not_spacing] 

# Preview. 
df_final_output.tail(30)

Unnamed: 0,id,text,relations,entities,label,token,len,span_end,span_beg,span,tag_entity
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],",",2,66,64,"[64, 66]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],up,2,68,66,"[66, 68]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],sharply,7,76,69,"[69, 76]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],from,4,81,77,"[77, 81]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],the,3,85,82,"[82, 85]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],lower,5,91,86,"[86, 91]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],growth,6,98,92,"[92, 98]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],",",2,100,98,"[98, 100]",O
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],value,5,105,100,"[100, 105]",STAT
568,569,The forward multiple on the new sector could b...,[],"[{'id': 4306, 'start_offset': 100, 'end_offset...",[[]],-,1,106,105,"[105, 106]",STAT


In [20]:
# Export only the columns needed downstream, without the dataframe index. 
cols = ["id", "text", "token", "span_beg", "span_end", "span", "tag_entity"]
path_output = f"{DIR_DATASET}/NER_entities_labelled.csv"

df_final_output.loc[:, cols].to_csv(path_output, index=False) 