In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange
import numpy as np
import json

In [4]:
# Load leet dictionary/frequent pattern with frequency
leet_dict = {}
 
# Opening JSON file
with open('train1_leetDict.json') as json_file:
    leet_dict = json.load(json_file)

leet_dict


{'A': {'’': 1678,
  '8': 507,
  "'": 199,
  '2': 1261,
  '3': 1020,
  ';': 1967,
  '(': 303,
  '“': 245,
  '…': 538,
  '1': 1947,
  '7': 260,
  '0': 2476,
  '"': 473,
  '”': 50,
  '4': 538,
  ')': 308,
  '‘': 66,
  '5': 1408,
  '.': 322,
  ':': 465,
  '6': 306,
  '$': 554,
  '9': 480,
  '=': 22,
  '\xa0': 24,
  '/': 206,
  '*': 241,
  ',': 31,
  '-': 215,
  '\u3000': 1,
  '£': 2,
  '&': 42,
  '→': 1,
  '☭': 5,
  '+': 60,
  '!': 41,
  '»': 2,
  '↓': 1,
  '%': 3,
  '^': 12,
  '?': 3,
  '{': 2,
  '¿': 6,
  '_': 2,
  ']': 13,
  '²': 26,
  '⁷': 18,
  '⁸': 18,
  '#': 6,
  '[': 27,
  '。': 1,
  '—': 34,
  '°': 2,
  '\u202f': 1},
 'B': {'’': 1037,
  '8': 132,
  '?': 2,
  '*': 49,
  '(': 259,
  '7': 72,
  '0': 163,
  '3': 282,
  '$': 443,
  '"': 201,
  '2': 317,
  '‘': 41,
  ')': 78,
  '“': 151,
  '1': 468,
  '5': 167,
  '-': 38,
  ':': 88,
  '…': 208,
  '.': 60,
  ',': 1,
  '⁵': 11,
  '9': 95,
  '4': 108,
  '6': 60,
  "'": 45,
  '{': 1,
  '£': 8,
  ';': 6,
  '!': 2,
  '»': 2,
  '+': 18,
  '\xa0

In [5]:
# load test data

test = pd.read_csv("test.csv")
print("Test Dataset:")
test.info()

#test["reply"].values.size

#test["keyword"].values[0] -> 'weather'

Test Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51077 entries, 0 to 51076
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      51077 non-null  object
 1   main_tweet   51077 non-null  object
 2   main_likes   51077 non-null  int64 
 3   reply        51077 non-null  object
 4   reply_likes  51077 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


In [6]:
#setup pre-trained roberta model from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest 
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Run for Roberta Model
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

print(polarity_scores_roberta("I am happy"))

{'roberta_neg': 0.0077943704, 'roberta_neu': 0.030709133, 'roberta_pos': 0.96149653}


# LeetSpeak processing

In [8]:
#leetWord processing
#from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import pandas as pd
import re
import emoji

MAX_LEET_WORD_SIZE = 15 # word length
# load common words
commonWords_df = pd.read_csv('commonWords.csv')

# dictionary for possible leet substitutions with frequency/count {'A': {}, 'B': {},...,'Z': {}}
leetDict = {} 

def setupLeetDict():

    for asc in range(ord('A'), ord('Z')+1):
        leetDict[chr(asc)] = {}

def getWordFrequencyPercentile(percentile):
    return commonWords_df.quantile(percentile, numeric_only=True).values[0]

def getWordFrequency(word):
    wordFrequency = 0
    if word in commonWords_df['word'].values:
        wordRowIndex = commonWords_df.index[commonWords_df['word']==word].tolist()[0]
        wordFrequency = commonWords_df['count'].loc[commonWords_df.index[wordRowIndex]]
    return wordFrequency
    
def getLeetWordList(inputStr):
    regex = ".*[a-zA-Z].*" # regex for string containing at least 1 alphabet, to filter just numbers/special character/emojis tokens
    # tokenList = word_tokenize(inputStr)
    demojized = emoji.replace_emoji(inputStr, replace='') # remove emojis
    tokenList = demojized.split(" ")
    leetList = []
    #print(emoji.replace_emoji('hi🤔.', replace=''))
    for word in tokenList:
        word = word.strip(".,?!#@") # strip common leading/trailing punctuations, hashtags, tag usernames
        if not word.isalpha() and re.search(regex, word) and len(word) > 1 and len(word) <= MAX_LEET_WORD_SIZE:
            leetList.append(word)

    return leetList # returns the list of leetwords in a given text body

def getMatchList(leetWord):
    # get candidates of equal length
    spell = SpellChecker()
    candidateList = spell.candidates(leetWord)
    commonCands = []
    if candidateList is not None:
        for word in candidateList:
            if getWordFrequency(word) >= getWordFrequencyPercentile(0.70):# and len(word) == len(leetWord):
                commonCands.append(word)

    return commonCands # e.g: ['hello', 'hella', 'hells'] for hell0


def getPossibleSubstitutions(leetWord, candidates):
    # candidates = getMatchList(leetWord)
    # print(leetWord)
    # print(candidates)
    substitutions = {}
    
    for word in candidates:
        charPos = 0
        if len(word) == len(leetWord):       #count substitutions only for same length words/cands 
            for char in leetWord:
                if not char.isalpha():
                    subAlpha = word[charPos].upper()
                    if subAlpha not in substitutions:
                        substitutions[subAlpha] = {char:1}
                    else:
                        if char not in substitutions[subAlpha]:
                            substitutions[subAlpha][char] = 1
                        else:
                            substitutions[subAlpha][char] += 1

                charPos += 1

    return substitutions # e.g: {'A': {'3',1}, 'E': {'3',1}} 

# for a given leet word , add the possible leet substituions to the global leet dictionary
def updateLeetDict(subs):
    # subs = getPossibleSubstitutions(leetWord)
    for key in subs:
        for leetChar in subs[key]:
            if leetChar in leetDict[key]: # if '3' is in list of 'E'
                leetDict[key][leetChar] += subs[key][leetChar]
            else:
                leetDict[key][leetChar] = subs[key][leetChar]

def processTextInput(textInput): # per row processing: get leetWords, for each leetWord get matches, for each match get context and update leetDict
    leetWordList = getLeetWordList(textInput)
    print("LeetWord List:" + str(leetWordList))
    for lword in leetWordList:
        possibleMatches = getMatchList(lword)
        possibleSubs = getPossibleSubstitutions(lword, possibleMatches)
        updateLeetDict(possibleSubs)
        print("LeetWord: " + lword)
        print("Candidate list: " + str(possibleMatches))
        print("Possible Substitutions and counts: " + str(possibleSubs))


# processTextInput("Wow. Mr. Bezos replied to the founder of $doge. Much wow!#crypto #doge")
# print("Final Leet Dictionary: " + str(leetDict))

testStr = "He11o W0rld !!"

setupLeetDict()     

processTextInput(testStr)
print("Final Leet Dictionary: " + str(leetDict))

LeetWord List:['He11o', 'W0rld']
LeetWord: He11o
Candidate list: ['hero', 'hecho', 'hello']
Possible Substitutions and counts: {'C': {'1': 1}, 'H': {'1': 1}, 'L': {'1': 2}}
LeetWord: W0rld
Candidate list: ['world']
Possible Substitutions and counts: {'O': {'0': 1}}
Final Leet Dictionary: {'A': {}, 'B': {}, 'C': {'1': 1}, 'D': {}, 'E': {}, 'F': {}, 'G': {}, 'H': {'1': 1}, 'I': {}, 'J': {}, 'K': {}, 'L': {'1': 2}, 'M': {}, 'N': {}, 'O': {'0': 1}, 'P': {}, 'Q': {}, 'R': {}, 'S': {}, 'T': {}, 'U': {}, 'V': {}, 'W': {}, 'X': {}, 'Y': {}, 'Z': {}}


In [9]:
#get the best match using the leet FP list to replace the leetword
def getBestMatch(leetWord):
    matchedWord = list(leetWord)
    #trim the leet word first, get rid of all the unnecessary punctuation 
    candList = getMatchList(leetWord)
    
    if len(candList) > 0:
        charPos = 0
        for char in leetWord:
            subAlphas = []
            subAlphaScores = []
            if not char.isalpha():
                for cand in candList:
                    if len(cand) == len(leetWord):
                        subAlphas.append(cand[charPos].upper())
                        if char in leet_dict[cand[charPos].upper()]:
                            subAlphaScores.append(leet_dict[cand[charPos].upper()][char])
                        else:
                            subAlphaScores.append(0)
                if len(subAlphaScores) > 0:
                    maxConf = max(subAlphaScores)
                    index = subAlphaScores.index(maxConf)
                    matchedWord[charPos] = subAlphas[index]
            charPos += 1
    
    matchedWord = "".join(matchedWord)

    spell = SpellChecker()
    finalMatch = spell.correction(matchedWord)
    
    return finalMatch

print(getBestMatch("$Helly"))

SHelly


In [10]:
def replaceLeet(reply,leetWord,bestMatch):
    #nreply = getLeetWordList(reply)
    tokenize = reply.split(" ")
    nreply = []
    for word in tokenize:
        if leetWord in word:
            nreply.append(bestMatch)
        else:
            nreply.append(word)
    nreply = " ".join(nreply)

    return nreply

replaceLeet("Wow. Mr. Bezos replied to the founder of $doge. Much wow!#crypto #doge", "$doge", "doge")
#getLeetWordList("Wow. Mr. Bezos replied to the founder of $doge. Much wow!#crypto #doge")

'Wow. Mr. Bezos replied to the founder of doge Much wow!#crypto #doge'

In [11]:
test['reply'].values[996]

'Wow. Mr. Bezos replied to the founder of $doge. Much wow!#crypto #doge'

In [12]:
# Run leet algorithm on train data to extract leet words and their substitutions

test2_result = {'keyword':[], 'reply':[], 'leetWords':[], 'BestMatches':[], 'NewReply':[],'Neg':[], 'Neu':[], 'Pos':[]}

for i in trange(10):#len(test['reply'].values)):
    test2_result['keyword'].append(test['keyword'].values[i])
    test2_result['reply'].append(test['reply'].values[i])

    leetWords = getLeetWordList(test['reply'].values[i])
    test2_result['leetWords'].append(leetWords)
    bestMatches = []
    newReply = test['reply'].values[i]

    for lword in leetWords:
        bestMatch = getBestMatch(lword)
        bestMatches.append(bestMatch)
        if bestMatch is not None:
            newReply = replaceLeet(newReply, lword, bestMatch)
    test2_result['BestMatches'].append(bestMatches)


    newReply = "".join(newReply)
    test2_result['NewReply'].append(newReply)

    polarity_scores = polarity_scores_roberta(newReply)
    test2_result['Neg'].append(polarity_scores['roberta_neg'])
    test2_result['Neu'].append(polarity_scores['roberta_neu'])
    test2_result['Pos'].append(polarity_scores['roberta_pos'])

test2_result_df = pd.DataFrame(test2_result)
test2_result_df.head()

100%|██████████| 10/10 [00:08<00:00,  1.14it/s]


Unnamed: 0,keyword,reply,leetWords,BestMatches,NewReply,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,[],[],Hodling like there is no tomorrow!,0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,[],[],As they tell us to not worry. 👀,0.078209,0.717959,0.203832
2,World Cup,I love this,[],[],I love this,0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[None],"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[doctors'],[doctors],Nuremberg doctors trial was an important miles...,0.005688,0.115113,0.8792


In [13]:
test2_result_df["NewReply"].values[5]
test2_result_df[test2_result_df['leetWords'].str.len() >0] #just the rows with leetspeak

Unnamed: 0,keyword,reply,leetWords,BestMatches,NewReply,Neg,Neu,Pos
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[None],"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[doctors'],[doctors],Nuremberg doctors trial was an important miles...,0.005688,0.115113,0.8792
5,Elon Musk,"As J-Lo says, ""I'm real...""","[J-Lo, ""I'm, real...""]","[solo, i'm, None]","As solo says, i'm real...""",0.011842,0.539687,0.448471
7,nba,"Let’s goooo, Kid Kessler!",[Let’s],[let's],"let's goooo, Kid Kessler!",0.00441,0.059691,0.935899
8,Vaccine,"It was the State colluding with Healthcare, La...",[force/coerce],[None],"It was the State colluding with Healthcare, La...",0.817865,0.170145,0.01199


In [109]:
#single reply test
# test2_result = {'keyword':[], 'reply':[], 'leetWords':[], 'BestMatches':[], 'NewReply':[],'Neg':[], 'Neu':[], 'Pos':[]}


# test2_result['keyword'].append(test['keyword'].values[996])
# test2_result['reply'].append(test['reply'].values[996])

# leetWords = getLeetWordList(test['reply'].values[996])
# test2_result['leetWords'].append(leetWords)
# bestMatches = []
# newReply = test['reply'].values[996]

# for lword in leetWords:
#     bestMatch = getBestMatch(lword)
#     bestMatches.append(bestMatch)
#     if bestMatch is not None:
#         newReply = replaceLeet(newReply, lword, bestMatch)
# test2_result['BestMatches'].append(bestMatches)


# newReply = "".join(newReply)
# test2_result['NewReply'].append(newReply)

# polarity_scores = polarity_scores_roberta(newReply)
# test2_result['Neg'].append(polarity_scores['roberta_neg'])
# test2_result['Neu'].append(polarity_scores['roberta_neu'])
# test2_result['Pos'].append(polarity_scores['roberta_pos'])

# test2_result_df = pd.DataFrame(test2_result)
# test2_result_df.head()
# test2_result_df['NewReply'].values[0]


In [None]:
# test1, test2 = train_test_split(test, test_size=0.50)
# test1.reset_index(drop=True)
# test2.reset_index(drop=True)
# test1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25538 entries, 40012 to 2916
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      25538 non-null  object
 1   main_tweet   25538 non-null  object
 2   main_likes   25538 non-null  int64 
 3   reply        25538 non-null  object
 4   reply_likes  25538 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.2+ MB


In [18]:
# Run leet algorithm on train data to extract leet words and their substitutions

# test2_result1 = {'keyword':[], 'reply':[], 'leetWords':[], 'BestMatches':[], 'NewReply':[],'Neg':[], 'Neu':[], 'Pos':[]}
# test2_result2 = {'keyword':[], 'reply':[], 'leetWords':[], 'BestMatches':[], 'NewReply':[],'Neg':[], 'Neu':[], 'Pos':[]}

# def runTest2(data):
#     test2_result1 = {'keyword':[], 'reply':[], 'leetWords':[], 'BestMatches':[], 'NewReply':[],'Neg':[], 'Neu':[], 'Pos':[]}
#     for i in trange(500):#len(test['reply'].values)):
#         test2_result1['keyword'].append(data['keyword'].values[i])
#         test2_result1['reply'].append(data['reply'].values[i])

#         leetWords = getLeetWordList(data['reply'].values[i])
#         test2_result1['leetWords'].append(leetWords)
#         bestMatches = []
#         newReply = data['reply'].values[i]

#         for lword in leetWords:
#             bestMatch = getBestMatch(lword)
#             bestMatches.append(bestMatch)
#             if bestMatch is not None:
#                 newReply = replaceLeet(newReply, lword, bestMatch)
#         test2_result['BestMatches'].append(bestMatches)


#         newReply = "".join(newReply)
#         test2_result1['NewReply'].append(newReply)

#         polarity_scores = polarity_scores_roberta(newReply)
#         test2_result1['Neg'].append(polarity_scores['roberta_neg'])
#         test2_result1['Neu'].append(polarity_scores['roberta_neu'])
#         test2_result1['Pos'].append(polarity_scores['roberta_pos'])
#     return test2_result1


# test2_result1_df = pd.DataFrame(test2_result)
# test2_result1_df.head()


Unnamed: 0,keyword,reply,leetWords,BestMatches,NewReply,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,[],[],Hodling like there is no tomorrow!,0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,[],[],As they tell us to not worry. 👀,0.078209,0.717959,0.203832
2,World Cup,I love this,[],[],I love this,0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",[exploration.May],[None],"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,[doctors'],[doctors],Nuremberg doctors trial was an important miles...,0.005688,0.115113,0.8792


In [19]:
# import multiprocessing as mp

# #print(mp.cpu_count()) #8 cpu

# p1 = mp.Process(target=runTest2(test1))

8
