In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange

In [2]:
train = pd.read_csv("train_yt.csv")
train = train.sample(frac = 0.75) # limiting train size due to time limit 
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80290 entries, 47884 to 98114
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    80290 non-null  object
dtypes: object(1)
memory usage: 1.2+ MB


In [3]:
#load existing leet dictionary to initiate
import json
leet_fp = {}
with open('../accuracy/pyleetspeak_leetDict.json') as json_file:
    leet_fp = json.load(json_file)

leet_fp

{'A': {'0': 372, '3': 402, '4': 266, '@': 284, '_': 278, '1': 348},
 'B': {'0': 40, '4': 20, '3': 38, '1': 36, '_': 40, '@': 20},
 'C': {'3': 35, '@': 26, '4': 12, '_': 24, '1': 29, '0': 38},
 'D': {'0': 46, '3': 47, '4': 33, '@': 33, '_': 51, '1': 34},
 'E': {'3': 514, '0': 299, '1': 245, '_': 216, '@': 145, '4': 158},
 'F': {'0': 29, '@': 23, '_': 27, '1': 21, '3': 23, '4': 13},
 'G': {'3': 53, '_': 41, '0': 45, '1': 39, '@': 30, '4': 18},
 'H': {'3': 34, '0': 44, '@': 24, '_': 28, '4': 23, '1': 29},
 'I': {'1': 358, '0': 254, '@': 120, '3': 246, '4': 107, '_': 190},
 'J': {'3': 11, '_': 10, '1': 8, '@': 7, '4': 3, '0': 9},
 'K': {'0': 26, '3': 23, '@': 22, '_': 18, '4': 11, '1': 31},
 'L': {'0': 69, '3': 61, '@': 44, '_': 38, '1': 53, '4': 29},
 'M': {'4': 31, '3': 41, '@': 33, '_': 45, '1': 41, '0': 48},
 'N': {'0': 74, '3': 83, '@': 37, '4': 42, '_': 61, '1': 57},
 'O': {'0': 397, '3': 289, '@': 134, '1': 265, '4': 146, '_': 211},
 'P': {'4': 30, '3': 36, '_': 38, '1': 33, '@': 48

In [4]:
import LeetMining as lm
lm.setupLeetDict()
lm.initiateLeetDict(leet_dict=leet_fp)

# LeetSpeak processing

In [5]:
# Run leet algorithm on train data to extract leet words and their substitutions
trainLeet = {'text':[], 'leetWords':[], 'candidates':[]}

for i in trange(len(train['text'].values)):
    trainLeet['text'].append(train['text'].values[i])

    leetWords = lm.getLeetWordList(train['text'].values[i])
    trainLeet['leetWords'].append(leetWords)
    candidates = []
    for lword in leetWords:
        possibleMatches = lm.getMatchList(lword)
        candidates.append(possibleMatches)
        possibleSubs = lm.getPossibleSubstitutions(lword, possibleMatches)
        lm.updateLeetDict(possibleSubs)
    trainLeet['candidates'].append(candidates)

train1_result_df = pd.DataFrame(trainLeet)
train1_result_df.head()

100%|██████████| 80290/80290 [2:52:39<00:00,  7.75it/s]   


Unnamed: 0,text,leetWords,candidates
0,Never gonna give you up!! Specially made for y...,[],[]
1,"but I'm waiting on my 2060, does that mean not...",[],[]
2,"After 3 months they get a pension, what a life...",[],[]
3,Yes.,[],[]
4,What trump had a yt channel?! Would have never...,[],[]


In [7]:
# print rows with leetspeak
train1_result_df[train1_result_df['leetWords'].str.len() >0] #just the rows with leetspeak

Unnamed: 0,text,leetWords,candidates
7,"In Yuma, Arizona, USA they used to use a batht...","[(saw, up)]","[[saw], [upa, ups, upp, up]]"
9,"""<a href=""""https://www.youtube.com/watch?v=0Sz...","[com/watch, v=0Sz55gmNUaI]","[[], []]"
18,18th,[18th],"[[rath, moth, seth, path, myth, lith, ith, cat..."
19,Tom: I'm about twice your age and don't hear m...,"[Tom:, 10KHz]","[[tomo, tome, tomb, tom, toms], []]"
22,Estimated waiting time:<br>4m 37s<br>4m 36s<br...,"[time:, 4m, 37s, 4m, 36s, 4m, 35s, 4m, 32s, 4m...","[[timed, times, time, timer], [om, um, em, am,..."
...,...,...,...
80272,"""@<a href=""""https://www.youtube.com/watch?v=pK...","[com/watch, v=pKkFaeig1zo]","[[], []]"
80284,"""<a href=""""https://www.youtube.com/watch?v=98H...","[com/watch, v=98HZanvAJ8Y]","[[], []]"
80285,James: Update your iphones!<br><br>Me on jail ...,[James:],[[james]]
80287,27 years ago I paid $250 to get a massive 40MB...,"[40MB, 40MB]","[[comb, lamb, limb, dumb, jamb, womb, bomb, to..."


In [12]:
#go through the result df to see what causes challenges in detecting leetspeak? -> measurement units with numbers, href, etc
train1_result_df.to_csv('train_result.csv')

In [10]:
print(lm.getLeetDict())

{'A': {'0': 6784, '3': 1670, '4': 1476, '@': 293, '_': 315, '1': 2268, ')': 338, '8': 1176, '7': 740, '6': 601, '5': 1394, '2': 1796, ':': 1466, ';': 3004, '(': 349, '/': 1002, '9': 1737, '^': 11, '*': 219, '-': 778, '+': 484, '%': 21, '`': 14, '~': 36, '$': 173, '#': 47, '=': 30, '°': 21, '!': 16, '´': 4, '²': 1, '„': 4, ']': 13, '\xa0': 65, '[': 5, '—': 1, '\\': 23, '€': 21, '˹': 1, '˺': 2, '―': 3, '£': 4, '¹': 15, '|': 2, '«': 13, '☐': 13, '《': 1}, 'B': {'0': 619, '4': 363, '3': 396, '1': 840, '_': 51, '@': 24, ':': 321, '7': 156, '6': 131, '5': 221, '2': 614, ';': 764, ')': 114, '(': 420, '9': 557, '*': 75, '8': 293, '+': 98, '[': 9, '`': 7, '-': 88, '$': 108, '/': 204, '#': 3, '°': 2, '»': 1, '´': 4, '„': 3, '=': 1, '^': 1, '~': 7, '\xa0': 24, ']': 9, '\\': 32, '€': 2, '˹': 4, '―': 3, '£': 3, '«': 5, '☐': 7, '《': 1}, 'C': {'3': 386, '@': 27, '4': 307, '_': 33, '1': 688, '0': 209, '5': 251, ':': 210, ';': 526, '(': 416, '7': 175, '/': 174, '9': 767, '6': 118, '2': 506, '8': 362, '-

In [9]:
#collect training results - leet dict and the result table
import json
with open("youtube_leetDict.json", "w") as outfile: 
    json.dump(lm.getLeetDict(), outfile)