In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange

In [2]:
# Create the dataset builder for the Reddit jokes dataset; the next cell reads
# `builder.info.features` from it to inspect the schema.
from datasets import load_dataset_builder
builder = load_dataset_builder("SocialGrep/one-million-reddit-jokes")

Downloading readme:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

In [4]:
# Show the feature schema recorded in the builder's dataset info.
builder.info.features

In [5]:
# List the split names the dataset exposes; this is the cell's displayed value.
from datasets import get_dataset_split_names


get_dataset_split_names("SocialGrep/one-million-reddit-jokes")

['train']

In [7]:
# Load only a slice of the data: split="train[:10%]" requests the first 10% of
# the 'train' split instead of the whole dataset.
from datasets import load_dataset

dataset = load_dataset("SocialGrep/one-million-reddit-jokes", split="train[:10%]")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/300M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
# Display the Dataset summary (feature names and row count).
dataset

Dataset({
    features: ['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw', 'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title', 'score'],
    num_rows: 100000
})

In [16]:
# Read the jokes dump from a local CSV (path is relative to the working
# directory) and summarise its columns, non-null counts and dtypes.
# NOTE(review): the `dataset` object loaded from the Hub in the earlier cell is
# not used here; the frame comes from the CSV file instead.
df = pd.read_csv('../reddit/one-million-reddit-jokes.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   type            1000000 non-null  object
 1   id              1000000 non-null  object
 2   subreddit.id    1000000 non-null  object
 3   subreddit.name  1000000 non-null  object
 4   subreddit.nsfw  1000000 non-null  bool  
 5   created_utc     1000000 non-null  int64 
 6   permalink       1000000 non-null  object
 7   domain          1000000 non-null  object
 8   url             4472 non-null     object
 9   selftext        995485 non-null   object
 10  title           1000000 non-null  object
 11  score           1000000 non-null  int64 
dtypes: bool(1), int64(2), object(9)
memory usage: 84.9+ MB


In [21]:
# Keep only the joke body (`selftext`), its `title` and its `score`.
# Selecting the columns to keep, instead of dropping the nine unwanted ones,
# yields the same three-column frame but stays safe to re-run: a second
# `drop(columns=...)` raises KeyError because the columns are already gone.
df = df[['selftext', 'title', 'score']]

In [27]:
# Discard rows whose `selftext` is one of the placeholders '[removed]' or
# '[deleted]'. Rows with a missing (NaN) body are kept, exactly as with the
# previous pair of `!=` comparisons.
# The former `type(df['selftext'][3])` probe was dropped: its value was never
# displayed, and it looked up index *label* 3 -- a row this filter removes --
# so running the cell a second time raised KeyError.
df = df[~df['selftext'].isin(['[removed]', '[deleted]'])]
df

Unnamed: 0,selftext,title,score
0,My corona is covered with foreskin so it is no...,I am soooo glad I'm not circumcised!,2
1,It's called Google Sheets.,Did you know Google now has a platform for rec...,9
2,The vacuum doesn't snore after sex.\n\n&amp;#x...,What is the difference between my wife and my ...,15
7,Oo..lala...,What did the French man say to the attractive ...,2
10,"Yo momma's so fat, that when she went to the z...",Yo Mama,0
...,...,...,...
999992,Q: What do you call a lawyer who has gone bad?...,BAD LAWYER,3
999994,Supposedly she had to rush the delivery!,Did you hear about the FedEx lady who had a ba...,2
999995,*zyan malik or whatever leaves 1d. \n*Kayne W...,With Zyan Malik leaving 1D..,0
999997,I'll be Bach,What did Arnold Schwarzenegger say when invite...,0


In [28]:
# Save the filtered frame; index=False keeps the row labels (no longer
# contiguous after filtering) out of the file.
df.to_csv('../reddit/reddit-jokes.csv', index=False)

In [29]:
# Re-check the frame after filtering: row count and remaining nulls per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578637 entries, 0 to 999998
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   selftext  574122 non-null  object
 1   title     578637 non-null  object
 2   score     578637 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 17.7+ MB


In [2]:
# Load the keyword / tweet / reply training data from train.csv (resolved
# against the working directory) and summarise it.
# NOTE(review): the execution counter restarts at this cell, so it was run in a
# different kernel session than the cells above; it still needs `pd` from the
# import cell.
train = pd.read_csv("train.csv")
print("\nTraining Dataset:")
train.info()


Training Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119178 entries, 0 to 119177
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   keyword      119178 non-null  object
 1   main_tweet   119178 non-null  object
 2   main_likes   119178 non-null  int64 
 3   reply        119178 non-null  object
 4   reply_likes  119178 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.5+ MB


# LeetSpeak processing

In [5]:
# LeetMining is not defined in this notebook (it appears to be a project-local
# module). setupLeetDict() presumably (re)initialises the substitution
# dictionary that getLeetDict() returns -- TODO confirm against LeetMining.
import LeetMining as lm
lm.setupLeetDict()
# Manual smoke test, left disabled:
# lm.processTextInput("He11o W0rld !!")
# print(lm.getLeetDict())

LeetWord List:['He11o', 'W0rld']
LeetWord: He11o
Candidate list: ['hello', 'hero', 'hecho']
Possible Substitutions and counts: {'L': {'1': 2}, 'C': {'1': 1}, 'H': {'1': 1}}
LeetWord: W0rld
Candidate list: ['world']
Possible Substitutions and counts: {'O': {'0': 1}}


In [14]:
# Small trial run: apply the leet-word extraction to the first 100 replies of
# `train`, recording for each reply the leet words found and, per word, the
# list of candidate matches. Each word's possible substitutions are also fed
# into LeetMining's dictionary via updateLeetDict().
testLeet = {column: [] for column in ('keyword', 'reply', 'leetWords', 'candidates')}

sample_keywords = train['keyword'].values
sample_replies = train['reply'].values

for row in trange(100):
    reply_text = sample_replies[row]
    testLeet['keyword'].append(sample_keywords[row])
    testLeet['reply'].append(reply_text)

    found_words = lm.getLeetWordList(reply_text)
    testLeet['leetWords'].append(found_words)

    # One list of matches per detected word, in detection order.
    matches_per_word = []
    for word in found_words:
        matches = lm.getMatchList(word)
        matches_per_word.append(matches)
        lm.updateLeetDict(lm.getPossibleSubstitutions(word, matches))
    testLeet['candidates'].append(matches_per_word)




100%|██████████| 100/100 [00:08<00:00, 11.16it/s]


In [15]:
# Turn the per-column lists into a DataFrame and preview the first rows.
testLeet_df = pd.DataFrame(testLeet)
testLeet_df.head()

Unnamed: 0,keyword,reply,leetWords,candidates
0,Dogecoin,You'll be degrading your empowerment and dissi...,[],[]
1,Ukraine,Russia is the one with a uniquely bad NAZI pro...,[],[]
2,weather,"be thankful thats all you got , we got tornado...",[],[]
3,Queen Elizabeth,What a surprise - not,[],[]
4,Vaccine,"No, actually the doctors hope that “you’ll “ b...",[],[]


In [8]:
# Spot-check the full, untruncated text of the reply at position 3.
testLeet_df['reply'].values[3]

'What a surprise - not'

In [16]:
# Show only the rows in which at least one leet word was detected.
# `.str.len()` gives the length of the list stored in each cell, see
# https://stackoverflow.com/questions/41340341/how-to-determine-the-length-of-lists-in-a-pandas-dataframe-column
testLeet_df.loc[testLeet_df['leetWords'].str.len().gt(0)]

Unnamed: 0,keyword,reply,leetWords,candidates
7,Bitcoin,what the fuck b*tc**n is legal tender in el sa...,[b*tc**n],[[]]
9,World Cup,So what about the 1000s England has killed and...,[1000s],[[]]
14,COVID-19,Canada need to Pay attention to weak peoples a...,[160Vanderhoof],[[]]
21,COVID-19,I’m retweeting this again due to variants XBB....,"[XBB.1.5, (origin, York), BF.7, (origin, China...","[[], [origin], [york], [], [origin], [china], ..."
60,nba,The only rookie &gt;&gt; Keegan Murray is Paol...,[gt;gt;],[[]]
67,Dogecoin,"Wow, this might be the most valuable remaining...","[FTX(Though, that)]","[[], [that, thats]]"
70,Elon Musk,In case you missed it:😂,[it:],"[[its, itu, ito, itt, ity, itl, ita, iti, it, ..."
73,Dogecoin,While Twitter cooks...you could commercializet...,"[cooks...you, of:1, BusinessNews2]","[[], [ofa, offa, of, ofr, offs, offi, off, oft..."
76,Dogecoin,"This didn’t age well , right? How many followe...",[loose#SHIB],[[]]
78,TikTok,Calling early-stage founders! Awesome opportun...,"[early-stage, amp;]","[[], [amp, amps]]"


In [10]:
# Print the substitution counts LeetMining has accumulated so far.
print(lm.getLeetDict())

{'A': {':': 1, '1': 1, '/': 1, '2': 3, '0': 10}, 'B': {'2': 3, '0': 1}, 'C': {}, 'D': {'2': 3}, 'E': {':': 1, '/': 1, '0': 6, '2': 1}, 'F': {':': 3, '2': 1}, 'G': {}, 'H': {':': 1, '2': 3}, 'I': {':': 1, '1': 1, '/': 1, '0': 10, '2': 1}, 'J': {}, 'K': {}, 'L': {':': 1, '2': 3}, 'M': {'2': 3}, 'N': {'2': 2, '0': 2}, 'O': {':': 1, ';': 1, '2': 1, '0': 6}, 'P': {'2': 3}, 'Q': {}, 'R': {'0': 2, '2': 3}, 'S': {';': 5, ')': 2, ':': 1, '1': 1, '0': 1, '2': 4}, 'T': {':': 1, '$': 2, '0': 2, '2': 3}, 'U': {':': 1, '/': 2, '2': 2, '0': 2}, 'V': {'2': 2}, 'W': {}, 'X': {}, 'Y': {':': 1}, 'Z': {'2': 1}}


In [17]:
# train1, train2 = train_test_split(train, test_size=0.30)
# train1.reset_index(drop=True)
# train2.reset_index(drop=True)
# print("Training Dataset1:")
# train1.info()
# print("\nTraining Dataset2:")
# train2.info()

In [18]:
# Full run: extract leet words and their candidate matches for every reply in
# `train`, feeding each word's possible substitutions into LeetMining's
# dictionary. `trainLeet` is filled in place, so whatever has been processed
# so far remains available if the (multi-hour) loop is interrupted.
lm.setupLeetDict()  # presumably resets the dictionary before the full pass -- TODO confirm in LeetMining
trainLeet = {column: [] for column in ('keyword', 'reply', 'leetWords', 'candidates')}

all_keywords = train['keyword'].values
all_replies = train['reply'].values

for keyword, reply_text in tqdm(zip(all_keywords, all_replies), total=len(all_replies)):
    trainLeet['keyword'].append(keyword)
    trainLeet['reply'].append(reply_text)

    found_words = lm.getLeetWordList(reply_text)
    trainLeet['leetWords'].append(found_words)

    # One list of matches per detected word, in detection order.
    matches_per_word = []
    for word in found_words:
        matches = lm.getMatchList(word)
        matches_per_word.append(matches)
        lm.updateLeetDict(lm.getPossibleSubstitutions(word, matches))
    trainLeet['candidates'].append(matches_per_word)

train1_result_df = pd.DataFrame(trainLeet)
train1_result_df.head()

100%|██████████| 119178/119178 [3:51:32<00:00,  8.58it/s]  


Unnamed: 0,keyword,reply,leetWords,candidates
0,Dogecoin,You'll be degrading your empowerment and dissi...,[],[]
1,Ukraine,Russia is the one with a uniquely bad NAZI pro...,[],[]
2,weather,"be thankful thats all you got , we got tornado...",[],[]
3,Queen Elizabeth,What a surprise - not,[],[]
4,Vaccine,"No, actually the doctors hope that “you’ll “ b...",[],[]


In [19]:
# Persist the substitution dictionary built during the training run as JSON.
import json
with open("twitter_leetDict.json", "w") as outfile:
    outfile.write(json.dumps(lm.getLeetDict()))

In [35]:
# import numpy as np
# leetDict['B']

# allKeysCounts = []
# allKeys = []
# for key in leetDict["B"].keys():
#     allKeys.append(key)
#     allKeysCounts.append(leetDict["B"][key])
    
#     #print(key, leetDict["F"][key])
# f_set = np.array(leetDict["B"])
# print(f_set)

# f_set_df = pd.DataFrame()

{'’': 1037, '8': 132, '?': 2, '*': 49, '(': 259, '7': 72, '0': 163, '3': 282, '$': 443, '"': 201, '2': 317, '‘': 41, ')': 78, '“': 151, '1': 468, '5': 167, '-': 38, ':': 88, '…': 208, '.': 60, ',': 1, '⁵': 11, '9': 95, '4': 108, '6': 60, "'": 45, '{': 1, '£': 8, ';': 6, '!': 2, '»': 2, '+': 18, '\xa0': 3, '/': 14, '”': 5, '=': 3, '²': 10, '⁷': 6, '⁸': 6, '`': 1, '[': 12, '_': 1, '—': 10, '^': 7, ']': 1}


In [20]:
# Optional CSV exports, left disabled. `train1` is only defined if the
# commented-out train/test split cell is re-enabled.
# train1_result_df.to_csv('train1_result.csv', index=False)
# train1.to_csv('train1.csv', index=False)
# Summarise the result frame: row count, columns and dtypes.
train1_result_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119178 entries, 0 to 119177
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   keyword     119178 non-null  object
 1   reply       119178 non-null  object
 2   leetWords   119178 non-null  object
 3   candidates  119178 non-null  object
dtypes: object(4)
memory usage: 3.6+ MB


In [21]:
# Rows of the full result in which at least one leet word was found
# (`.str.len()` measures the list stored in each `leetWords` cell).
train1_result_df.loc[train1_result_df['leetWords'].str.len().gt(0)]

Unnamed: 0,keyword,reply,leetWords,candidates
7,Bitcoin,what the fuck b*tc**n is legal tender in el sa...,[b*tc**n],[[]]
9,World Cup,So what about the 1000s England has killed and...,[1000s],[[]]
14,COVID-19,Canada need to Pay attention to weak peoples a...,[160Vanderhoof],[[]]
21,COVID-19,I’m retweeting this again due to variants XBB....,"[XBB.1.5, (origin, York), BF.7, (origin, China...","[[], [origin], [york], [], [origin], [china], ..."
60,nba,The only rookie &gt;&gt; Keegan Murray is Paol...,[gt;gt;],[[]]
...,...,...,...,...
119160,Dogecoin,It will get a top-notch care,[top-notch],[[]]
119163,Vaccine,"#ErasingWomenWomen, pregnant women.People incl...",[women.People],[[]]
119167,nba,$mommie0198 I absolutely love and adore my 2 d...,[$mommie0198],[[]]
119168,weather,At least A&amp;W gets it. I quit buying Coke c...,"[Aamp;W, Aamp;W]","[[], []]"


In [22]:
# Print the substitution counts LeetMining holds after the full training run.
print(lm.getLeetDict())

{'A': {':': 969, '1': 3934, '/': 401, '2': 2548, '0': 7179, '(': 571, '5': 2972, ')': 540, '$': 981, '8': 1091, '3': 1889, '6': 620, '*': 481, '9': 1480, '-': 447, '7': 728, '4': 1154, '.': 706, ';': 805, '£': 13, '[': 31, '«': 136, '\xa0': 207, '=': 27, '{': 2, '–': 1, '!': 130, '+': 62, '¿': 12, '²': 52, '⁷': 18, '⁸': 18, '_': 11, '☭': 7, '—': 51, '»': 3, '#': 19, '\u202f': 1, '„': 4, '￼': 1, '^': 14, '🖒': 3, '%': 8, '→': 1, '？': 1, '~': 17, ']': 19, '。': 1, '¡': 1, '°': 3, '⁵': 18, '⁶': 18, '\u3000': 1, '↓': 1, '•': 1, '\u200b': 12}, 'B': {'2': 695, '0': 576, '(': 540, '/': 16, '$': 828, '1': 917, ':': 195, '3': 552, '5': 346, '9': 404, '7': 222, '=': 5, '*': 68, '6': 146, ')': 150, '8': 317, ';': 201, '⁵': 29, '[': 14, '4': 287, '.': 250, '\xa0': 39, '-': 56, '£': 12, '+': 20, '￼': 1, '`': 1, '!': 25, '⁷': 6, '²': 20, '⁸': 6, '—': 16, '»': 3, '#': 5, '{': 1, '🖒': 3, '♡': 1, '~': 4, ']': 1, '¡': 2, '«': 1, '⁶': 6, '_': 1, '^': 7, '\u200b': 2}, 'C': {'(': 568, '/': 13, '2': 441, '$':