In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange
# from pyleetspeak import LeetSpeaker


In [8]:
# Load the previously mined leet substitution frequencies from disk.
# The JSON maps each canonical letter to {substitute character: count}.
import LeetMining as lm
import json

# Decode explicitly as UTF-8 so the load does not depend on the platform's
# default locale encoding.
with open('pyleetspeak_leetDict.json', encoding='utf-8') as json_file:
    leet_fp = json.load(json_file)

In [9]:
#mined frequent patterns
leet_fp

{'A': {'0': 372, '3': 402, '4': 266, '@': 284, '_': 278, '1': 348},
 'B': {'0': 40, '4': 20, '3': 38, '1': 36, '_': 40, '@': 20},
 'C': {'3': 35, '@': 26, '4': 12, '_': 24, '1': 29, '0': 38},
 'D': {'0': 46, '3': 47, '4': 33, '@': 33, '_': 51, '1': 34},
 'E': {'3': 514, '0': 299, '1': 245, '_': 216, '@': 145, '4': 158},
 'F': {'0': 29, '@': 23, '_': 27, '1': 21, '3': 23, '4': 13},
 'G': {'3': 53, '_': 41, '0': 45, '1': 39, '@': 30, '4': 18},
 'H': {'3': 34, '0': 44, '@': 24, '_': 28, '4': 23, '1': 29},
 'I': {'1': 358, '0': 254, '@': 120, '3': 246, '4': 107, '_': 190},
 'J': {'3': 11, '_': 10, '1': 8, '@': 7, '4': 3, '0': 9},
 'K': {'0': 26, '3': 23, '@': 22, '_': 18, '4': 11, '1': 31},
 'L': {'0': 69, '3': 61, '@': 44, '_': 38, '1': 53, '4': 29},
 'M': {'4': 31, '3': 41, '@': 33, '_': 45, '1': 41, '0': 48},
 'N': {'0': 74, '3': 83, '@': 37, '4': 42, '_': 61, '1': 57},
 'O': {'0': 397, '3': 289, '@': 134, '1': 265, '4': 146, '_': 211},
 'P': {'4': 30, '3': 36, '_': 38, '1': 33, '@': 48

In [60]:
# Load the evaluation data: a profanity list whose 'text' entries are paired
# with up to three canonical forms. No pre-processing is applied in this cell.
profanity_df = pd.read_csv('profanity_en.csv')
profanity_df.head(10)

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
0,69,69,,,sexual anatomy / sexual acts,,,1.0,Mild
1,@55,ass,,,sexual anatomy / sexual acts,,,1.0,Mild
2,@ssfcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
3,@ssfucker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
4,@ssfvcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.4,Strong
5,@sshole,ass,,,sexual anatomy / sexual acts,,,1.6,Strong
6,0ral seks,sex,,,sexual anatomy / sexual acts,,,1.0,Mild
7,0ral sex,sex,,,sexual anatomy / sexual acts,,,1.8,Strong
8,0rg@sm,orgasm,,,sexual anatomy / sexual acts,,,1.0,Mild
9,0rgasms,orgasm,,,sexual anatomy / sexual acts,,,1.0,Mild


In [61]:
type(profanity_df['text'][2])

str

In [62]:
# Use the frequent pattern list to detect and de-leetify the leet words from profanity csv
import re
# Use the existing leetDict to get substitutions and leet transformations for new reply
def valid_answer(txt1, txt2, txt3, ans):
    """Return True when any canonical form occurs inside the candidate word.

    Args:
        txt1, txt2, txt3: Canonical forms to look for. Non-string values
            (for example the NaN pandas uses for an empty cell) and empty
            strings are ignored.
        ans: The de-leeted word to search in. A non-string value never
            matches.

    Returns:
        bool: True if at least one usable canonical form is a
        case-insensitive substring of ``ans``; otherwise False.
    """
    if not isinstance(ans, str):
        # Nothing to search in. Without this guard the result depended on the
        # other arguments: AttributeError if any form was a string, else False.
        return False

    haystack = ans.upper()  # upper-case once instead of once per form
    return any(
        # An empty string is a substring of everything, so it must not count
        # as a successful match.
        isinstance(form, str) and form != "" and form.upper() in haystack
        for form in (txt1, txt2, txt3)
    )

# Evaluate the mining helpers on every entry of the profanity list: collect the
# words returned by lm.getLeetWordList for each text, map each one through
# lm.getBestMatch, and mark it "yes"/"no" depending on whether the match
# contains one of the entry's canonical forms.
texts = profanity_df['text'].values
forms_1 = profanity_df['canonical_form_1'].values
forms_2 = profanity_df['canonical_form_2'].values
forms_3 = profanity_df['canonical_form_3'].values

# One list per derived column; each gets exactly one element per input row.
mined_column, match_column, verdict_column = [], [], []

for text, form_1, form_2, form_3 in tqdm(zip(texts, forms_1, forms_2, forms_3), total=len(texts)):
    leet_words = lm.getLeetWordList(text)
    matches = []
    verdicts = []
    for leet_word in leet_words:
        match = lm.getBestMatch(leet_word, leet_fp)
        matches.append(match)
        verdicts.append("yes" if valid_answer(form_1, form_2, form_3, match) else "no")

    # Rows without any detected word simply carry empty lists.
    mined_column.append(leet_words)
    match_column.append(matches)
    verdict_column.append(verdicts)

test_result = {
    'text': list(texts),
    'canonical_form_1': list(forms_1),
    'canonical_form_2': list(forms_2),
    'canonical_form_3': list(forms_3),
    'minedLeet': mined_column,
    'BestMatches': match_column,
    'correct': verdict_column,
}

test_result_df = pd.DataFrame(test_result)
test_result_df.head()

100%|██████████| 1598/1598 [01:56<00:00, 13.67it/s]


Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,minedLeet,BestMatches,correct
0,69,69,,,[],[],[]
1,@55,ass,,,[],[],[]
2,@ssfcker,fuck,ass,,[@ssfcker],[Assfcker],[yes]
3,@ssfucker,fuck,ass,,[@ssfucker],[Assfucker],[yes]
4,@ssfvcker,fuck,ass,,[@ssfvcker],[Assfvcker],[yes]


In [102]:
# Entries for which no leet word was detected (empty 'minedLeet' list).
no_leet_found = test_result_df['minedLeet'].str.len().eq(0)
test_result_df[no_leet_found]


Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,minedLeet,BestMatches,correct
0,69,69,,,[],[],[]
1,@55,ass,,,[],[],[]
19,abbie,abraham,,,[],[],[]
20,abeed,abeed,,,[],[],[]
21,aboe,abo,,,[],[],[]
...,...,...,...,...,...,...,...
1593,wnker,wank,,,[],[],[]
1594,wop,wop,,,[],[],[]
1595,wophead,wop,,,[],[],[]
1596,zip in the wire,zipperhead,,,[],[],[]


In [103]:
# Entries with at least one scored word (non-empty 'correct' list).
has_verdict = test_result_df['correct'].str.len().gt(0)
test_result_df[has_verdict]

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,minedLeet,BestMatches,correct
2,@ssfcker,fuck,ass,,[@ssfcker],[Assfcker],[yes]
3,@ssfucker,fuck,ass,,[@ssfucker],[Assfucker],[yes]
4,@ssfvcker,fuck,ass,,[@ssfvcker],[Assfvcker],[yes]
5,@sshole,ass,,,[@sshole],[Asshole],[yes]
6,0ral seks,sex,,,[0ral],[Oral],[no]
...,...,...,...,...,...,...,...
1574,we1back,wetback,,,[we1back],[wetback],[yes]
1580,wh0r3,whore,,,[wh0r3],[whOrE],[yes]
1581,wh0re,whore,,,[wh0re],[whOre],[yes]
1586,whor3,whore,,,[whor3],[whorE],[yes]


In [104]:
# Tally the verdicts over the whole result frame. "yes"/"no" are counted per
# scored word, so one row can contribute several; a row with no scored word
# adds one to the "None" count.
count_yes = count_no = count_none = 0

for verdicts in tqdm(test_result_df['correct'].values):
    count_yes += verdicts.count('yes')
    count_no += verdicts.count('no')
    if len(verdicts) == 0:
        count_none += 1

print("Yes:" , count_yes)
print("No:" , count_no)
print("None:" , count_none)

100%|██████████| 1598/1598 [00:00<00:00, 91327.13it/s]

Yes: 200
No: 111
None: 1288





In [None]:
# Save the per-entry evaluation results to disk. With pandas defaults the row
# index is written as the first, unnamed column.
test_result_df.to_csv('profanity_test.csv')

In [132]:
# The profanity list is a mix of leet and non-leet words. The "None" results
# come from non-leet words. The "No" results come from phrases that contain
# leet but whose canonical form is incomplete. Some words also mix several
# camouflaging techniques: misspelling, leet, punctuation, etc.

'A'