Leung Wai Liu <br>
JPMC <br>
July 25, 2022 <br>
Twitter Relabelling ALL AT ONCE

In [22]:
import pandas as pd
import numpy as np
import re
import wordninja
import spacy
from transformers import pipeline
from transformers import AutoTokenizer
from collections import Counter

In [23]:
# Load the `en_core_web_lg` spaCy pipeline once. `nlp2` is kept as a second name
# bound to the same pipeline object, so code that refers to either name keeps
# working without loading the model a second time.
nlp = spacy.load('en_core_web_lg')
nlp2 = nlp

In [24]:
# Read the tab-separated word-level table from the logs directory and show it.
combined_df = pd.read_csv(
    filepath_or_buffer='logs/combined_final.tsv',
    sep='\t',
)
combined_df

Unnamed: 0,Word,POS Label,NER Label,Tweet
0,poutine,PROPN,O,rt @user1787 : i asked siri to remind me about...
1,friday,PROPN,O,rt @user912 : you know it 's friday when the p...
2,@user444,PROPN,O,mikel obi & his girl olga diyachenko spend hol...
3,@user1598,PROPN,O,""" @user1598 : super junior ’s siwon transforms..."
4,gospel,PROPN,O,""" do n't move on from the gospel , move into t..."
...,...,...,...,...
1013,@user70,PROPN,O,rt @user2041 : @user70 happy birthday man ! le...
1014,@user660,PROPN,O,rt @user1157 : happy birthday @user660 ! url1209
1015,@user2046,PROPN,O,@user2046 further you do the swedish social se...
1016,@user2242,PROPN,O,@user2046 further you do the swedish social se...


In [25]:
# Relabel every word in `combined_df` with spaCy-derived NER / POS tags.
# The results are collected in `new_label` / `new_pos`, which start as copies of
# the existing 'NER Label' / 'POS Label' columns and are overwritten per row.

# spaCy entity types that translate directly to a Tweebank-style NER tag.
# NOTE(review): GPE / LOC are not listed here and therefore fall through to
# B-MISC, as in the original logic - confirm that this is intended.
SPACY_TO_TWEEBANK = {
    'PERSON': 'B-PER',
    'FAC': 'B-LOC',
    'NORP': 'B-ORG',
    'ORG': 'B-ORG',
}

# spaCy entity types that must not be turned into B-MISC; the existing label is kept.
SPACY_TYPES_LEFT_UNCHANGED = ('MONEY', 'QUANTITY')


def _relabel_word(word, current_label, current_pos):
    """Return the ``(ner_label, pos_label)`` pair for a single word.

    Parameters
    ----------
    word : object
        Value from the 'Word' column. Anything that is not a ``str`` (for
        example a NaN produced while reading the file) is left untouched.
    current_label, current_pos : str
        The labels the row currently carries; they are returned unchanged
        whenever no better label can be derived.

    Returns
    -------
    tuple
        ``(ner_label, pos_label)`` for the row.
    """
    # Non-string cells cannot be inspected - keep whatever the row already has.
    if not isinstance(word, str):
        return current_label, current_pos

    # User mentions are always labelled as a person; the POS tag is a marker string.
    if '@user' in word:
        return 'B-PER', 'User: No change'

    # Only strip a *leading* hashtag; a '#' elsewhere must not cost the word its
    # first character.
    if word.startswith('#'):
        word = word[1:]

    # Run spaCy on the wordninja-split phrase and on the original combined word.
    split_doc = nlp(' '.join(wordninja.split(word)))
    combined_doc = nlp2(word)

    # Collect entity types: first token of the combined word (if it has one),
    # followed by every split token that carries an entity type. The order is
    # kept because Counter.most_common breaks ties by first occurrence.
    labels = []
    if len(combined_doc) > 0 and combined_doc[0].ent_type_ != '':
        labels.append(combined_doc[0].ent_type_)
    labels.extend(token.ent_type_ for token in split_doc if token.ent_type_ != '')

    # POS tags come from the split phrase only; this list is empty when the
    # split produced no tokens.
    pos_tags = [token.pos_ for token in split_doc]

    if labels:
        top_label = Counter(labels).most_common(1)[0][0]
        if top_label in SPACY_TO_TWEEBANK:
            current_label = SPACY_TO_TWEEBANK[top_label]
        elif top_label not in SPACY_TYPES_LEFT_UNCHANGED:
            # The original `!= 'MONEY' or != 'QUANTITY'` test was always true;
            # MONEY / QUANTITY now really keep the existing label.
            current_label = 'B-MISC'
        if pos_tags:
            current_pos = Counter(pos_tags).most_common(1)[0][0]
        return current_label, current_pos

    # No entity found: fall back to the POS tags. A proper noun or noun is
    # still classified as miscellaneous.
    if 'PROPN' in pos_tags:
        return 'B-MISC', 'PROPN'
    if 'NOUN' in pos_tags:
        return 'B-MISC', 'NOUN'

    # Otherwise only record what spaCy considers the most common POS tag.
    if pos_tags:
        current_pos = Counter(pos_tags).most_common(1)[0][0]
    return current_label, current_pos


new_label = combined_df['NER Label'].tolist()
new_pos = combined_df['POS Label'].tolist()

# enumerate() yields list positions, so the loop stays correct even when the
# DataFrame index is not a default 0..n-1 range.
for position, word in enumerate(combined_df['Word']):
    new_label[position], new_pos[position] = _relabel_word(
        word, new_label[position], new_pos[position]
    )
 


In [26]:
# Build the comparison table on a copy: plain assignment would only alias
# `combined_df`, so adding the spaCy columns would silently modify the frame
# that was loaded and displayed earlier.
new_df = combined_df.copy()
new_df['Spacy POS Label'] = new_pos
new_df['Spacy NER Label'] = new_label

# Keep only the listed columns, in this order.
new_df = new_df.reindex(columns = ['Word', 'POS Label', 'NER Label', 'Spacy POS Label', 'Spacy NER Label', 'Tweet'])
new_df

Unnamed: 0,Word,POS Label,NER Label,Spacy POS Label,Spacy NER Label,Tweet
0,poutine,PROPN,O,NOUN,B-MISC,rt @user1787 : i asked siri to remind me about...
1,friday,PROPN,O,PROPN,B-MISC,rt @user912 : you know it 's friday when the p...
2,@user444,PROPN,O,User: No change,B-PER,mikel obi & his girl olga diyachenko spend hol...
3,@user1598,PROPN,O,User: No change,B-PER,""" @user1598 : super junior ’s siwon transforms..."
4,gospel,PROPN,O,PROPN,B-MISC,""" do n't move on from the gospel , move into t..."
...,...,...,...,...,...,...
1013,@user70,PROPN,O,User: No change,B-PER,rt @user2041 : @user70 happy birthday man ! le...
1014,@user660,PROPN,O,User: No change,B-PER,rt @user1157 : happy birthday @user660 ! url1209
1015,@user2046,PROPN,O,User: No change,B-PER,@user2046 further you do the swedish social se...
1016,@user2242,PROPN,O,User: No change,B-PER,@user2046 further you do the swedish social se...


In [27]:
# Write the relabelled table as a tab-separated file, without the row index.
new_df.to_csv(
    path_or_buf='logs/relabelled_data_spacy_pass.tsv',
    sep='\t',
    index=False,
)