In [1]:
import json

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import LeetMining as lm


In [4]:
# Load the leet dictionary: for each letter, the substitute characters that
# were observed for it together with how often each one occurred.
# The encoding is given explicitly because the dictionary contains non-ASCII
# substitute characters; without it the file would be decoded with the
# platform's locale encoding, which is not UTF-8 everywhere.
with open('twitter_leetDict_trial2.json', encoding='utf-8') as json_file:
    leet_dict = json.load(json_file)
leet_dict

{'A': {':': 969,
  '1': 3934,
  '/': 401,
  '2': 2548,
  '0': 7179,
  '(': 571,
  '5': 2972,
  ')': 540,
  '$': 981,
  '8': 1091,
  '3': 1889,
  '6': 620,
  '*': 481,
  '9': 1480,
  '-': 447,
  '7': 728,
  '4': 1154,
  '.': 706,
  ';': 805,
  '£': 13,
  '[': 31,
  '«': 136,
  '\xa0': 207,
  '=': 27,
  '{': 2,
  '–': 1,
  '!': 130,
  '+': 62,
  '¿': 12,
  '²': 52,
  '⁷': 18,
  '⁸': 18,
  '_': 11,
  '☭': 7,
  '—': 51,
  '»': 3,
  '#': 19,
  '\u202f': 1,
  '„': 4,
  '￼': 1,
  '^': 14,
  '🖒': 3,
  '%': 8,
  '→': 1,
  '？': 1,
  '~': 17,
  ']': 19,
  '。': 1,
  '¡': 1,
  '°': 3,
  '⁵': 18,
  '⁶': 18,
  '\u3000': 1,
  '↓': 1,
  '•': 1,
  '\u200b': 12},
 'B': {'2': 695,
  '0': 576,
  '(': 540,
  '/': 16,
  '$': 828,
  '1': 917,
  ':': 195,
  '3': 552,
  '5': 346,
  '9': 404,
  '7': 222,
  '=': 5,
  '*': 68,
  '6': 146,
  ')': 150,
  '8': 317,
  ';': 201,
  '⁵': 29,
  '[': 14,
  '4': 287,
  '.': 250,
  '\xa0': 39,
  '-': 56,
  '£': 12,
  '+': 20,
  '￼': 1,
  '`': 1,
  '!': 25,
  '⁷': 6,
  '²': 2

In [5]:
# Read the evaluation set and print a column summary.
# The columns used further down are `keyword` and `reply`.
test = pd.read_csv(filepath_or_buffer="test.csv")

print("Test Dataset:")
test.info()

Test Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51077 entries, 0 to 51076
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      51077 non-null  object
 1   main_tweet   51077 non-null  object
 2   main_likes   51077 non-null  int64 
 3   reply        51077 non-null  object
 4   reply_likes  51077 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


In [6]:
# Pre-trained Twitter sentiment checkpoint, see
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
# The same identifier is used for the tokenizer and the classifier so that
# the two always come from the same checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Run for Roberta Model
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the module-level RoBERTa sentiment model.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, default 512
        Upper bound on the number of tokens passed to the model; longer
        inputs are truncated, so their scores describe only the leading
        part of the text. 512 is the usual limit for RoBERTa-base
        checkpoints -- NOTE(review): confirm against
        ``model.config.max_position_embeddings`` if the checkpoint changes.

    Returns
    -------
    dict
        ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'`` mapped to
        the softmax of the model's three output scores (indices 0, 1, 2).
    """
    # Without truncation an over-long input raises inside the model instead
    # of producing a score.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skipping autograd bookkeeping saves memory and time on
    # every call, and the result can be converted to numpy directly.
    with torch.no_grad():
        output = model(**encoded_text)
    # First output element, first (and only) row of the batch.
    scores = softmax(output[0][0].numpy())
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

print(polarity_scores_roberta("I am happy"))

{'roberta_neg': 0.0077943704, 'roberta_neu': 0.030709133, 'roberta_pos': 0.96149653}


# LeetSpeak processing

In [14]:
# For every reply: collect its leetspeak candidates, look each one up in the
# existing leet dictionary, substitute the matches that were found, and score
# the rewritten reply with the RoBERTa sentiment model.

test2_result = {
    'keyword': [],
    'reply': [],
    'leetWords': [],
    'BestMatches': [],
    'NewReply': [],
    'Neg': [],
    'Neu': [],
    'Pos': [],
}

# Output column -> key in the dict returned by polarity_scores_roberta.
score_columns = (('Neg', 'roberta_neg'), ('Neu', 'roberta_neu'), ('Pos', 'roberta_pos'))

keywords = test['keyword'].values
replies = test['reply'].values

for row in trange(len(replies)):
    original_reply = replies[row]
    test2_result['keyword'].append(keywords[row])
    test2_result['reply'].append(original_reply)

    leet_words = lm.getLeetWordList(original_reply)
    test2_result['leetWords'].append(leet_words)

    best_matches = []
    new_reply = original_reply
    for leet_word in leet_words:
        match = lm.getBestMatch(leet_word, leet_dict)
        best_matches.append(match)  # None is recorded too, so the lists stay aligned
        if match is None:
            continue  # nothing to substitute for this candidate
        new_reply = lm.replaceLeet(new_reply, leet_word, match)
    test2_result['BestMatches'].append(best_matches)

    # NOTE(review): replaceLeet's return type is not visible here; the join
    # leaves a plain string unchanged and concatenates a sequence of pieces.
    new_reply = "".join(new_reply)
    test2_result['NewReply'].append(new_reply)

    scores = polarity_scores_roberta(new_reply)
    for column, score_key in score_columns:
        test2_result[column].append(scores[score_key])

test2_result_df = pd.DataFrame(test2_result)
test2_result_df.head()

100%|██████████| 51077/51077 [3:50:27<00:00,  3.69it/s]   


Unnamed: 0,keyword,reply,leetWords,BestMatches,NewReply,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,[],[],Hodling like there is no tomorrow!,0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,[],[],As they tell us to not worry. 👀,0.078209,0.717959,0.203832
2,World Cup,I love this,[],[],I love this,0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[None],"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[],[],Nuremberg doctors' trial was an important mile...,0.005117,0.10639,0.888493


In [15]:
# Summarise just the rows with leetspeak, i.e. a non-empty `leetWords` list.
has_leet_words = test2_result_df['leetWords'].str.len() > 0
test2_result_df[has_leet_words].info()

<class 'pandas.core.frame.DataFrame'>
Index: 11356 entries, 3 to 51076
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   keyword      11356 non-null  object 
 1   reply        11356 non-null  object 
 2   leetWords    11356 non-null  object 
 3   BestMatches  11356 non-null  object 
 4   NewReply     11356 non-null  object 
 5   Neg          11356 non-null  float32
 6   Neu          11356 non-null  float32
 7   Pos          11356 non-null  float32
dtypes: float32(3), object(5)
memory usage: 665.4+ KB


In [16]:
# Display the rows whose `leetWords` list is non-empty.
test2_result_df.loc[test2_result_df['leetWords'].str.len().gt(0)]

Unnamed: 0,keyword,reply,leetWords,BestMatches,NewReply,Neg,Neu,Pos
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[None],"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
5,Elon Musk,"As J-Lo says, ""I'm real...""","[J-Lo, real...]","[solo, None]","As solo says, ""I'm real...""",0.008493,0.577800,0.413707
8,Vaccine,"It was the State colluding with Healthcare, La...",[force/coerce],[None],"It was the State colluding with Healthcare, La...",0.817865,0.170145,0.011990
10,Queen Elizabeth,1. They are still teaching about African Ameri...,"[that.2, agenda.3, slavery:]","[thatCH, agenda, slavery]",1. They are still teaching about African Ameri...,0.798799,0.194550,0.006651
11,Dogecoin,"With the respective eco-chambers, it both stre...",[eco-chambers],[None],"With the respective eco-chambers, it both stre...",0.053483,0.812268,0.134249
...,...,...,...,...,...,...,...,...
51063,Dogecoin,Ayeeee its my coin $shib #trending #SHIB,[$shib],[sahib],Ayeeee its my coin sahib #trending #SHIB,0.002712,0.093619,0.903670
51070,TikTok,meanwhile usher:,[usher:],[usherS],meanwhile usherS,0.077866,0.775107,0.147027
51071,World Cup,"Listen, I'd rather get wins and points However...",[track.Next],[None],"Listen, I'd rather get wins and points However...",0.013980,0.064325,0.921695
51073,Bitcoin,Reach out to the good guys at &amp;,[amp;],[ampS],Reach out to the good guys at ampS,0.021109,0.388510,0.590382


In [None]:
# Persist the per-reply results; the positional index is left out of the file.
test2_result_df.to_csv(path_or_buf='test2_result_trial2.csv', index=False)