Leung Wai Liu <br>
JPMorgan AI Research <br>
August 8, 2022 <br>
Twitter Relabelling WORD Val Data Processing 

In [1]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from reading_datasets import read_ud_dataset, reading_tb_ner

In [2]:
# Load the train/dev/test splits of the Tweebank POS-tagging and NER data.
# The 'dev' split is what the rest of the notebook calls "val".
pos_dir = '../Datasets/POSTagging/Tweebank/'
ner_dir = '../Datasets/NER/Tweebank/'

pos_train, pos_val, pos_test = (
    read_ud_dataset(dataset='tb', location=pos_dir, split=split_name)
    for split_name in ('train', 'dev', 'test')
)
ner_train, ner_val, ner_test = (
    reading_tb_ner(location=ner_dir, split=split_name)
    for split_name in ('train', 'dev', 'test')
)

# Pair each validation tweet's POS labels with its NER labels.
# The POS and NER splits are loaded separately, so tweets are matched on
# their lower-cased, space-joined text.

# Index the NER validation tweets once, so each POS tweet is a dictionary
# lookup instead of a full rescan (and re-lowercasing) of ner_val.
ner_val_by_text = {}
for ner_tweet, ner_labels in ner_val:
    ner_tokens = [token.lower() for token in ner_tweet]
    ner_text = ' '.join(ner_tokens)
    # Keep only the first occurrence of a given text: that is the entry a
    # linear scan stopping at the first hit would have returned.
    if ner_text not in ner_val_by_text:
        ner_val_by_text[ner_text] = (ner_tokens, ner_labels)

# Each entry is [lower-cased tokens, POS labels, NER labels].
combined_dataset_val = []
for tweet, pos_labels in pos_val:
    tweet = [token.lower() for token in tweet]
    match = ner_val_by_text.get(' '.join(tweet))
    if match is None:
        # No NER tweet with the same text: the POS tweet is left out.
        continue

    ner_tokens, ner_labels = match
    # Identical joined text does not guarantee identical tokenisation
    # (a token could itself contain a space), so also require the token
    # lists to be equal before pairing the label sequences.
    if tweet == ner_tokens:
        combined_dataset_val.append([tweet, pos_labels, ner_labels])


In [3]:
# One flat list of tokens per NER category: the four entity types plus "O".
combined_val_ner_per = []
combined_val_ner_org = []
combined_val_ner_loc = []
combined_val_ner_misc = []
combined_val_ner_o = []

# Entity type (the tag with its first two characters removed) -> target list.
bucket_for_entity = {
    "PER": combined_val_ner_per,
    "ORG": combined_val_ner_org,
    "LOC": combined_val_ner_loc,
    "MISC": combined_val_ner_misc,
}

for tweet, pos_labels, ner_labels in combined_dataset_val:
    # The POS label is carried through the zip but not used for bucketing.
    for word, _pos_tag, ner_tag in zip(tweet, pos_labels, ner_labels):
        if ner_tag == "O":
            combined_val_ner_o.append(word)
            continue

        # Strip the two-character prefix (presumably BIO "B-"/"I-" -- confirm
        # against the dataset reader) to get the bare entity type.
        bucket = bucket_for_entity.get(ner_tag[2:])
        # Tokens whose entity type is none of the four above are skipped.
        if bucket is not None:
            bucket.append(word)


In [4]:
# Frequency of every token collected in the PER bucket.
val_ner_per_counter = Counter(combined_val_ner_per)

# Report how many distinct tokens that is.
distinct_per_tokens = len(val_ner_per_counter)
print(distinct_per_tokens)


187


In [5]:
# Table of distinct PER tokens and their frequencies, most frequent first.
# The frame is built from (token, count) pairs with explicit column names, so
# it does not depend on auto-generated column labels and still works (as an
# empty table) when no PER tokens were found.
per_counter_df = pd.DataFrame(
    list(val_ner_per_counter.items()), columns=['PER', 'Count']
)
# NOTE: the default sort kind is not stable, so the relative order of tokens
# with equal counts is not guaranteed.
per_counter_df = per_counter_df.sort_values(by="Count", ascending=False, ignore_index=True)
per_counter_df

Unnamed: 0,PER,Count
0,clinton,4
1,daniel,3
2,bieber,3
3,justin,3
4,rose,3
...,...,...
182,twins,1
183,vinny,1
184,curry,1
185,trumps,1


In [6]:
# Frequency of every token collected in the ORG bucket.
val_ner_org_counter = Counter(combined_val_ner_org)

# Report how many distinct tokens that is.
distinct_org_tokens = len(val_ner_org_counter)
print(distinct_org_tokens)

135


In [7]:
# Table of distinct ORG tokens and their frequencies, most frequent first.
# The frame is built from (token, count) pairs with explicit column names, so
# it does not depend on auto-generated column labels and still works (as an
# empty table) when no ORG tokens were found.
org_counter_df = pd.DataFrame(
    list(val_ner_org_counter.items()), columns=['ORG', 'Count']
)
# NOTE: the default sort kind is not stable, so the relative order of tokens
# with equal counts is not guaranteed.
org_counter_df = org_counter_df.sort_values(by="Count", ascending=False, ignore_index=True)
org_counter_df

Unnamed: 0,ORG,Count
0,the,3
1,nus,3
2,snapchat,3
3,twitter,3
4,eu,3
...,...,...
130,bbc,1
131,eurovision,1
132,nba,1
133,eagles,1


In [8]:
# Frequency of every token collected in the LOC bucket.
val_ner_loc_counter = Counter(combined_val_ner_loc)

# Report how many distinct tokens that is.
distinct_loc_tokens = len(val_ner_loc_counter)
print(distinct_loc_tokens)

80


In [9]:
# Table of distinct LOC tokens and their frequencies, most frequent first.
# The frame is built from (token, count) pairs with explicit column names, so
# it does not depend on auto-generated column labels and still works (as an
# empty table) when no LOC tokens were found.
loc_counter_df = pd.DataFrame(
    list(val_ner_loc_counter.items()), columns=['LOC', 'Count']
)
# NOTE: the default sort kind is not stable, so the relative order of tokens
# with equal counts is not guaranteed.
loc_counter_df = loc_counter_df.sort_values(by="Count", ascending=False, ignore_index=True)
loc_counter_df

Unnamed: 0,LOC,Count
0,new,4
1,uk,3
2,york,2
3,hollywood,2
4,nc,2
...,...,...
75,cal,1
76,nor,1
77,dublin,1
78,monts,1


In [10]:
# Frequency of every token collected in the MISC bucket.
val_ner_misc_counter = Counter(combined_val_ner_misc)

# Report how many distinct tokens that is.
distinct_misc_tokens = len(val_ner_misc_counter)
print(distinct_misc_tokens)

175


In [11]:
# Table of distinct MISC tokens and their frequencies, most frequent first.
# The frame is built from (token, count) pairs with explicit column names, so
# it does not depend on auto-generated column labels and still works (as an
# empty table) when no MISC tokens were found.
misc_counter_df = pd.DataFrame(
    list(val_ner_misc_counter.items()), columns=['MISC', 'Count']
)
# NOTE: the default sort kind is not stable, so the relative order of tokens
# with equal counts is not guaranteed.
misc_counter_df = misc_counter_df.sort_values(by="Count", ascending=False, ignore_index=True)
misc_counter_df

Unnamed: 0,MISC,Count
0,the,10
1,canadian,3
2,of,3
3,itunes,2
4,easter,2
...,...,...
170,iranian,1
171,islamic,1
172,8sideways,1
173,toxic,1


In [12]:
# Lay the four per-entity frequency tables side by side, aligned on row index.
# Shorter tables are padded with NaN below their last row, and every table
# contributes its own column named 'Count'.
entity_frames = [per_counter_df, org_counter_df, loc_counter_df, misc_counter_df]
val_merged_together = pd.concat(entity_frames, axis="columns")
val_merged_together

Unnamed: 0,PER,Count,ORG,Count.1,LOC,Count.2,MISC,Count.3
0,clinton,4,the,3.0,new,4.0,the,10.0
1,daniel,3,nus,3.0,uk,3.0,canadian,3.0
2,bieber,3,snapchat,3.0,york,2.0,of,3.0
3,justin,3,twitter,3.0,hollywood,2.0,itunes,2.0
4,rose,3,eu,3.0,nc,2.0,easter,2.0
...,...,...,...,...,...,...,...,...
182,twins,1,,,,,,
183,vinny,1,,,,,,
184,curry,1,,,,,,
185,trumps,1,,,,,,


In [13]:
# Export the side-by-side table as a tab-separated file without the row index.
# The output directory is created first so a fresh checkout without a logs/
# folder does not fail on the very last cell.
output_path = Path('logs/val_merged_together.tsv')
output_path.parent.mkdir(parents=True, exist_ok=True)
val_merged_together.to_csv(output_path, sep='\t', index=False)