In [1]:
from nltk.lm.preprocessing import padded_everygram_pipeline
from sklearn.linear_model import SGDClassifier
from nltk.util import bigrams
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated
import nltk 
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexandernielsen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
FILE_PATH = 'data/hate/train_text.txt'

In [3]:
def tokenize_lines_list(file_path):
    line_list = []
    with open(file_path, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()

    for line in lines:
        tokens = word_tokenize(line)
        line_list.append(tokens)
    
    return line_list

In [4]:
tokenized_lines = tokenize_lines_list(FILE_PATH)
training_size = (len(tokenized_lines) // 5) * 4 #80% of the original dataset

training_data = tokenized_lines[:training_size] #slice to training data
validation_data = tokenized_lines[- (len(tokenized_lines) - training_size):]
# for line in tokenized_lines[:training_size]:
#     training_data.extend(line)

In [5]:
all_tokens = training_data
all_test_tokens = validation_data

list_of_ones = set()
my_dict = {}
tokens = []

for i in all_tokens:
    for j in i:
        tokens.append(j)

for i in tokens:
    my_dict[i] = 0

for i in tokens:
    if i in my_dict:
        my_dict[i] += 1

for i in my_dict:
    if my_dict[i] == 1:
        list_of_ones.add(i)

for i in all_tokens:
    for j in i:
        if j in list_of_ones:
            j = '<UNK>'

In [6]:
print("list of ones: ",len(list_of_ones))
print("my dictionary: ",len(my_dict))
print("regular tokens: ",len(tokens))
print("all tokens: ", len(all_tokens))
print("all test tokens: ", len(all_test_tokens))

list of ones:  12548
my dictionary:  20674
regular tokens:  180683
all tokens:  7200
all test tokens:  1800


In [7]:
print(tokens[0])
print(all_tokens[0])

@
['@', 'user', 'nice', 'new', 'signage', '.', 'Are', 'you', 'not', 'concerned', 'by', 'Beatlemania', '-style', 'hysterical', 'crowds', 'crongregating', 'on', 'you…']


In [8]:
# training corpus 
all_strings = '\n'.join([' '.join(e) for e in all_tokens])
corpus = all_strings.lower().split('\n')
vocabulary = word_tokenize(all_strings)

In [9]:
corpus

['@ user nice new signage . are you not concerned by beatlemania -style hysterical crowds crongregating on you…',
 'a woman who you fucked multiple times saying yo dick small is a compliment you know u hit that spot 😎',
 '@ user @ user real talk do you have eyes or were they gouged out by a rapefugee ?',
 'your girlfriend lookin at me like a groupie in this bitch !',
 'hysterical woman like @ user',
 'me flirting- so tell me about your father ...',
 "the philippine catholic bishops ' work for migrant workers should focus on families who are `` paying the great ...",
 "i am not going after your ex bf you lieing sack of shit ! i 'm done with you dude that 's why i dumped your ass cause your a lieing 😂😡 bitch",
 'when cuffin season is finally over',
 'send home migrants not in need of protection , peter dutton tells un , hey dutton how about the ones that have stayed and not left the country when they should overstayers ? why dont you go and round all them up ?',
 'cory booker and kamala 

In [10]:
vocabulary

['@',
 'user',
 'nice',
 'new',
 'signage',
 '.',
 'Are',
 'you',
 'not',
 'concerned',
 'by',
 'Beatlemania',
 '-style',
 'hysterical',
 'crowds',
 'crongregating',
 'on',
 'you…',
 'A',
 'woman',
 'who',
 'you',
 'fucked',
 'multiple',
 'times',
 'saying',
 'yo',
 'dick',
 'small',
 'is',
 'a',
 'compliment',
 'you',
 'know',
 'u',
 'hit',
 'that',
 'spot',
 '😎',
 '@',
 'user',
 '@',
 'user',
 'real',
 'talk',
 'do',
 'you',
 'have',
 'eyes',
 'or',
 'were',
 'they',
 'gouged',
 'out',
 'by',
 'a',
 'rapefugee',
 '?',
 'your',
 'girlfriend',
 'lookin',
 'at',
 'me',
 'like',
 'a',
 'groupie',
 'in',
 'this',
 'bitch',
 '!',
 'Hysterical',
 'woman',
 'like',
 '@',
 'user',
 'Me',
 'flirting-',
 'So',
 'tell',
 'me',
 'about',
 'your',
 'father',
 '...',
 'The',
 'Philippine',
 'Catholic',
 'bishops',
 "'",
 'work',
 'for',
 'migrant',
 'workers',
 'should',
 'focus',
 'on',
 'families',
 'who',
 'are',
 '``',
 'paying',
 'the',
 'great',
 '...',
 'I',
 'AM',
 'NOT',
 'GOING',
 'AFTER'

In [11]:
train, vocab = padded_everygram_pipeline(2, all_tokens)
lm = Laplace(2)
lm.fit(train, vocab) # will result in 'ZeroDivisionError' if removed

In [15]:
from tqdm import tqdm
list_of_perplexity = []
list_of_unigrams = []
lower_tokens = []

for sentence in all_tokens: 
    for word in sentence: 
        lower_tokens.append(word.lower())

# creating ngram list from training tokens
for i in all_test_tokens:
    list_of_unigrams.append(list(bigrams(i)))

# removing empty strings from unigrams
for i in list_of_unigrams: 
    if len(i) == 0:
        list_of_unigrams.remove(i)

# creating a list of perplexities
for i in tqdm(list_of_unigrams): 
    list_of_perplexity.append(lm.perplexity(i))

results = pd.DataFrame(list(zip(list_of_unigrams, list_of_perplexity)),columns = ['bigram grouped sentences','perplexity'])

100%|██████████| 1799/1799 [00:00<00:00, 7381.78it/s]


In [20]:
results['percentage'] = results['perplexity'] / len(my_dict)

In [21]:
results

Unnamed: 0,bigram grouped sentences,perplexity,percentage
0,"[(#, WorldCup), (WorldCup, Blog), (Blog, for),...",6974.947055,0.337378
1,"[(The, ðŸ‡ªðŸ‡ºEU), (ðŸ‡ªðŸ‡ºEU, discuss), (di...",9239.682076,0.446923
2,"[(@, user), (user, Thank), (Thank, you), (you,...",2640.115827,0.127702
3,"[(@, user), (user, @), (@, user), (user, Pay),...",6311.729127,0.305298
4,"[(Prom, is), (is, coming), (coming, up), (up, ...",3894.682634,0.188386
...,...,...,...
1794,"[(Oooohhhh, bitch), (bitch, did), (did, n't), ...",5112.868825,0.247309
1795,"[(@, user), (user, Good), (Good, Luck), (Luck,...",3139.448968,0.151855
1796,"[(Bitch, you), (you, ca), (ca, n't), (n't, kee...",3397.078821,0.164316
1797,"[(@, user), (user, @), (@, user), (user, @), (...",1242.730552,0.060111
