In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
orig_df = pd.read_json('tweet_reply.json', lines=True)
orig_df = orig_df.sample(frac=1).reset_index(drop=True) # shuffle records
orig_df.info()
orig_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170255 entries, 0 to 170254
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   keyword      170255 non-null  object
 1   main_tweet   170255 non-null  object
 2   main_likes   170255 non-null  int64 
 3   reply        170255 non-null  object
 4   reply_likes  170255 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 6.5+ MB


Unnamed: 0,keyword,main_tweet,main_likes,reply,reply_likes
0,Bitcoin,Whales and elites stepped in on this day to sc...,29,Freedom✊🏼 #wearesatoshi,1
1,nba,"Hang on, you are the queen of cancel culture t...",1,In the same hour…this wretched troll posted this…,1
2,Tesla,It just smells like dead people burning in his...,0,Why do I have a feeling this smells like fire&...,0
3,Tesla,Sooooo many that I’m having a hard time findin...,0,"chick fil a ,home depot anyone associated with...",0
4,Tesla,"I will, if you guarantee me a job at tesla 😉",0,What does it matter if you buy Twitter?,0


In [6]:
# SPLIT DATASET[commenting this out as I have saved the split datasets as csv. Will use those from here on.]
# train, test = train_test_split(orig_df, test_size=0.3)
# test.reset_index(drop=True)
# train.reset_index(drop=True)

test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

print("Test Dataset:")
test.info()
print("\nTraining Dataset:")
train.info()
#test["reply"].values.size

#test["keyword"].values[0] -> 'weather'

Test Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51077 entries, 0 to 51076
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      51077 non-null  object
 1   main_tweet   51077 non-null  object
 2   main_likes   51077 non-null  int64 
 3   reply        51077 non-null  object
 4   reply_likes  51077 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB

Training Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119178 entries, 0 to 119177
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   keyword      119178 non-null  object
 1   main_tweet   119178 non-null  object
 2   main_likes   119178 non-null  int64 
 3   reply        119178 non-null  object
 4   reply_likes  119178 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.5+ MB


In [7]:
#setup pre-trained roberta model from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest 
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Run for Roberta Model
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

print(polarity_scores_roberta("I am happy"))

{'roberta_neg': 0.0077943704, 'roberta_neu': 0.030709133, 'roberta_pos': 0.96149653}


In [6]:
# Run Roberta on test data to get analysis prior leet substitution
test1_result = {'keyword':[], 'reply':[], 'Neg':[], 'Neu':[], 'Pos':[]}

for i in trange(len(test['reply'].values)):
    test1_result['keyword'].append(test['keyword'].values[i])
    test1_result['reply'].append(test['reply'].values[i])

    polarity_scores = polarity_scores_roberta(test['reply'].values[i])
    test1_result['Neg'].append(polarity_scores['roberta_neg'])
    test1_result['Neu'].append(polarity_scores['roberta_neu'])
    test1_result['Pos'].append(polarity_scores['roberta_pos'])

test1_result_df = pd.DataFrame(test1_result)
test1_result_df.head()

100%|██████████| 51077/51077 [1:57:32<00:00,  7.24it/s]      


Unnamed: 0,keyword,reply,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,0.078209,0.717959,0.203832
2,World Cup,I love this,0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,0.005117,0.10639,0.888493


In [10]:
# test.to_csv('test.csv', index=False)
# train.to_csv('train.csv', index=False)
# test1_result_df.to_csv('test1_result.csv', index=False)


# LeetSpeak processing

In [12]:
#leetWord processing
#from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import pandas as pd
import re
import emoji

MAX_LEET_WORD_SIZE = 15 # word length
# load common words
commonWords_df = pd.read_csv('commonWords.csv')

# dictionary for possible leet substitutions with frequency/count {'A': {}, 'B': {},...,'Z': {}}
leetDict = {} 

def setupLeetDict():

    for asc in range(ord('A'), ord('Z')+1):
        leetDict[chr(asc)] = {}

def getWordFrequencyPercentile(percentile):
    return commonWords_df.quantile(percentile, numeric_only=True).values[0]

def getWordFrequency(word):
    wordFrequency = 0
    if word in commonWords_df['word'].values:
        wordRowIndex = commonWords_df.index[commonWords_df['word']==word].tolist()[0]
        wordFrequency = commonWords_df['count'].loc[commonWords_df.index[wordRowIndex]]
    return wordFrequency
    
def getLeetWordList(inputStr):
    regex = ".*[a-zA-Z].*" # regex for string containing at least 1 alphabet, to filter just numbers/special character/emojis tokens
    # tokenList = word_tokenize(inputStr)
    demojized = emoji.replace_emoji(inputStr, replace='') # remove emojis
    tokenList = demojized.split(" ")
    leetList = []
    #print(emoji.replace_emoji('hi🤔.', replace=''))
    for word in tokenList:
        word = word.strip(".,?!#@") # strip common leading/trailing punctuations, hashtags, tag usernames
        if not word.isalpha() and re.search(regex, word) and len(word) > 1 and len(word) <= MAX_LEET_WORD_SIZE:
            leetList.append(word)

    return leetList # returns the list of leetwords in a given text body

def getMatchList(leetWord):
    # get candidates of equal length
    spell = SpellChecker()
    candidateList = spell.candidates(leetWord)
    commonCands = []
    if candidateList is not None:
        for word in candidateList:
            if getWordFrequency(word) >= getWordFrequencyPercentile(0.70):# and len(word) == len(leetWord):
                commonCands.append(word)

    return commonCands # e.g: ['hello', 'hella', 'hells'] for hell0


def getPossibleSubstitutions(leetWord, candidates):
    # candidates = getMatchList(leetWord)
    # print(leetWord)
    # print(candidates)
    substitutions = {}
    
    for word in candidates:
        charPos = 0
        if len(word) == len(leetWord):       #count substitutions only for same length words/cands 
            for char in leetWord:
                if not char.isalpha():
                    subAlpha = word[charPos].upper()
                    if subAlpha not in substitutions:
                        substitutions[subAlpha] = {char:1}
                    else:
                        if char not in substitutions[subAlpha]:
                            substitutions[subAlpha][char] = 1
                        else:
                            substitutions[subAlpha][char] += 1

                charPos += 1

    return substitutions # e.g: {'A': {'3',1}, 'E': {'3',1}} 

# for a given leet word , add the possible leet substituions to the global leet dictionary
def updateLeetDict(subs):
    # subs = getPossibleSubstitutions(leetWord)
    for key in subs:
        for leetChar in subs[key]:
            if leetChar in leetDict[key]: # if '3' is in list of 'E'
                leetDict[key][leetChar] += subs[key][leetChar]
            else:
                leetDict[key][leetChar] = subs[key][leetChar]

def processTextInput(textInput): # per row processing: get leetWords, for each leetWord get matches, for each match get context and update leetDict
    leetWordList = getLeetWordList(textInput)
    print("LeetWord List:" + str(leetWordList))
    for lword in leetWordList:
        possibleMatches = getMatchList(lword)
        possibleSubs = getPossibleSubstitutions(lword, possibleMatches)
        updateLeetDict(possibleSubs)
        print("LeetWord: " + lword)
        print("Candidate list: " + str(possibleMatches))
        print("Possible Substitutions and counts: " + str(possibleSubs))


# processTextInput("Wow. Mr. Bezos replied to the founder of $doge. Much wow!#crypto #doge")
# print("Final Leet Dictionary: " + str(leetDict))

testStr = "He11o W0rld !!"

setupLeetDict()     

processTextInput(testStr)
print("Final Leet Dictionary: " + str(leetDict))

LeetWord List:['He11o', 'W0rld']
LeetWord: He11o
Candidate list: ['hello', 'hero', 'hecho']
Possible Substitutions and counts: {'L': {'1': 2}, 'C': {'1': 1}, 'H': {'1': 1}}
LeetWord: W0rld
Candidate list: ['world']
Possible Substitutions and counts: {'O': {'0': 1}}
Final Leet Dictionary: {'A': {}, 'B': {}, 'C': {'1': 1}, 'D': {}, 'E': {}, 'F': {}, 'G': {}, 'H': {'1': 1}, 'I': {}, 'J': {}, 'K': {}, 'L': {'1': 2}, 'M': {}, 'N': {}, 'O': {'0': 1}, 'P': {}, 'Q': {}, 'R': {}, 'S': {}, 'T': {}, 'U': {}, 'V': {}, 'W': {}, 'X': {}, 'Y': {}, 'Z': {}}


In [128]:
# SMALL TEST WITH LEET CODE: Run Roberta and leet decode algorithm on train data to get analysis post leet substitution
testLeet = {'keyword':[], 'reply':[], 'leetWords':[], 'candidates':[], 'Neg':[], 'Neu':[], 'Pos':[]}

for i in trange(1000):
    testLeet['keyword'].append(test['keyword'].values[i])
    testLeet['reply'].append(test['reply'].values[i])

    leetWords = getLeetWordList(test['reply'].values[i])
    testLeet['leetWords'].append(leetWords)
    candidates = []
    for lword in leetWords:
        possibleMatches = getMatchList(lword)
        candidates.append(possibleMatches)
        possibleSubs = getPossibleSubstitutions(lword, possibleMatches)
        updateLeetDict(possibleSubs)
    testLeet['candidates'].append(candidates)


    polarity_scores = polarity_scores_roberta(test['reply'].values[i])
    testLeet['Neg'].append(polarity_scores['roberta_neg'])
    testLeet['Neu'].append(polarity_scores['roberta_neu'])
    testLeet['Pos'].append(polarity_scores['roberta_pos'])



100%|██████████| 1000/1000 [06:05<00:00,  2.73it/s]


In [129]:
testLeet_df = pd.DataFrame(testLeet)
testLeet_df.head()

Unnamed: 0,keyword,reply,leetWords,candidates,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,[],[],0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,[],[],0.078209,0.717959,0.203832
2,World Cup,I love this,[],[],0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[[]],0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[doctors'],[[doctors]],0.005117,0.10639,0.888493


In [130]:
testLeet_df['reply'].values[3]

'I hope so, but i have no need of immediate information, thank you Elon, you know the value of exploration.May the force be with you, Monsieur Cohagen Marsk😊.'

In [131]:
# testLeet_df.query('leetWords ')
# testLeet_df[testLeet_df['leetWords'].size >0]
# len(testLeet_df['leetWords'].values)

#df['CreationDate'].str.len() -> gives length of arrays stored in cell https://stackoverflow.com/questions/41340341/how-to-determine-the-length-of-lists-in-a-pandas-dataframe-column
testLeet_df[testLeet_df['leetWords'].str.len() >0] #just the rows with leetspeak

Unnamed: 0,keyword,reply,leetWords,candidates,Neg,Neu,Pos
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[[]],0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[doctors'],[[doctors]],0.005117,0.106390,0.888493
5,Elon Musk,"As J-Lo says, ""I'm real...""","[J-Lo, ""I'm, real...""]","[[silo, jell, joo, jal, jello, jojo, julio, cu...",0.006225,0.546022,0.447753
7,nba,"Let’s goooo, Kid Kessler!",[Let’s],[[lets]],0.003371,0.049731,0.946898
8,Vaccine,"It was the State colluding with Healthcare, La...",[force/coerce],[[]],0.817865,0.170145,0.011990
...,...,...,...,...,...,...,...
994,Queen Elizabeth,"Shit, you had to be tough back then in today's...",[today's],[[]],0.659633,0.275425,0.064942
995,nba,2nd Stint Cavs LeBron,[2nd],"[[und, ond, end, ind, and]]",0.017023,0.867033,0.115944
996,Dogecoin,Wow. Mr. Bezos replied to the founder of $doge...,"[$doge, wow!#crypto]","[[doge], []]",0.002435,0.026569,0.970996
997,World Cup,A part of me wishes for the web shooters to st...,"[I'd, Man's, Cap's]","[[], [], []]",0.075297,0.514504,0.410199


In [132]:
leetDict["E"]

{'-': 2,
 ':': 4,
 '(': 3,
 '”': 5,
 ',': 1,
 '3': 10,
 '…': 18,
 '9': 2,
 '5': 4,
 ';': 4,
 '6': 35,
 '"': 4,
 '/': 20,
 '*': 3,
 '?': 5,
 '7': 11,
 '’': 22,
 '!': 1,
 '1': 32,
 '.': 1,
 '2': 2,
 ')': 1,
 '£': 11,
 '0': 17,
 '“': 1}

In [14]:
train1, train2 = train_test_split(train, test_size=0.50)
train1.reset_index(drop=True)
train2.reset_index(drop=True)
print("Training Dataset1:")
train1.info()
print("\nTraining Dataset2:")
train2.info()

Training Dataset1:
<class 'pandas.core.frame.DataFrame'>
Index: 59589 entries, 98369 to 41792
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      59589 non-null  object
 1   main_tweet   59589 non-null  object
 2   main_likes   59589 non-null  int64 
 3   reply        59589 non-null  object
 4   reply_likes  59589 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.7+ MB

Training Dataset2:
<class 'pandas.core.frame.DataFrame'>
Index: 59589 entries, 116618 to 26427
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      59589 non-null  object
 1   main_tweet   59589 non-null  object
 2   main_likes   59589 non-null  int64 
 3   reply        59589 non-null  object
 4   reply_likes  59589 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.7+ MB


In [18]:
# Run leet algorithm on train data to extract leet words and their substitutions
setupLeetDict() # initiate FP list
trainLeet1 = {'keyword':[], 'reply':[], 'leetWords':[], 'candidates':[], 'Neg':[], 'Neu':[], 'Pos':[]}

for i in trange(len(train1['reply'].values)):
    trainLeet1['keyword'].append(train1['keyword'].values[i])
    trainLeet1['reply'].append(train1['reply'].values[i])

    leetWords = getLeetWordList(train1['reply'].values[i])
    trainLeet1['leetWords'].append(leetWords)
    candidates = []
    for lword in leetWords:
        possibleMatches = getMatchList(lword)
        candidates.append(possibleMatches)
        possibleSubs = getPossibleSubstitutions(lword, possibleMatches)
        updateLeetDict(possibleSubs)
    trainLeet1['candidates'].append(candidates)


    polarity_scores = polarity_scores_roberta(train1['reply'].values[i])
    trainLeet1['Neg'].append(polarity_scores['roberta_neg'])
    trainLeet1['Neu'].append(polarity_scores['roberta_neu'])
    trainLeet1['Pos'].append(polarity_scores['roberta_pos'])

train1_result_df = pd.DataFrame(trainLeet1)
train1_result_df.head()

100%|██████████| 59589/59589 [4:41:39<00:00,  3.53it/s]   


Unnamed: 0,keyword,reply,leetWords,candidates,Neg,Neu,Pos
0,NFT,I’m just saying - this right here is so fuckin...,"[I’m, early.Currently, 420.69tz, 8PM, 1ktz?Do,...","[[iam, im, ihm, ism, ibm], [], [], [um, kam, g...",0.614322,0.245399,0.14028
1,Ukraine,Find the millions/ billions the pentagon lost ...,[millions/],[[millions]],0.827582,0.159389,0.013029
2,Elon Musk,I’m real.,[I’m],"[[iam, im, ihm, ism, ibm]]",0.023762,0.491757,0.484482
3,Bitcoin,What is everyone pitching for Thanksgiving thi...,"[$COLD2019:, $FB2020:, $IAC2021:, $KRBN2022:, ...","[[], [], [], [], []]",0.00813,0.747924,0.243946
4,Dogecoin,Word.,[],[],0.084244,0.762295,0.153461


In [20]:
#collect training results - leet dict and the result table
import json
with open("train1_leetDict.json", "w") as outfile: 
    json.dump(leetDict, outfile)

In [35]:
import numpy as np
leetDict['B']

allKeysCounts = []
allKeys = []
for key in leetDict["B"].keys():
    allKeys.append(key)
    allKeysCounts.append(leetDict["B"][key])
    
    #print(key, leetDict["F"][key])
f_set = np.array(leetDict["B"])
print(f_set)

# f_set_df = pd.DataFrame()

{'’': 1037, '8': 132, '?': 2, '*': 49, '(': 259, '7': 72, '0': 163, '3': 282, '$': 443, '"': 201, '2': 317, '‘': 41, ')': 78, '“': 151, '1': 468, '5': 167, '-': 38, ':': 88, '…': 208, '.': 60, ',': 1, '⁵': 11, '9': 95, '4': 108, '6': 60, "'": 45, '{': 1, '£': 8, ';': 6, '!': 2, '»': 2, '+': 18, '\xa0': 3, '/': 14, '”': 5, '=': 3, '²': 10, '⁷': 6, '⁸': 6, '`': 1, '[': 12, '_': 1, '—': 10, '^': 7, ']': 1}


In [38]:
# train1_result_df.to_csv('train1_result.csv', index=False)
# train1.to_csv('train1.csv', index=False)
train1_result_df.head()


Unnamed: 0,keyword,reply,leetWords,candidates,Neg,Neu,Pos
0,NFT,I’m just saying - this right here is so fuckin...,"[I’m, early.Currently, 420.69tz, 8PM, 1ktz?Do,...","[[iam, im, ihm, ism, ibm], [], [], [um, kam, g...",0.614322,0.245399,0.14028
1,Ukraine,Find the millions/ billions the pentagon lost ...,[millions/],[[millions]],0.827582,0.159389,0.013029
2,Elon Musk,I’m real.,[I’m],"[[iam, im, ihm, ism, ibm]]",0.023762,0.491757,0.484482
3,Bitcoin,What is everyone pitching for Thanksgiving thi...,"[$COLD2019:, $FB2020:, $IAC2021:, $KRBN2022:, ...","[[], [], [], [], []]",0.00813,0.747924,0.243946
4,Dogecoin,Word.,[],[],0.084244,0.762295,0.153461
