In [76]:
#Import libraries
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string 
from nltk.stem import WordNetLemmatizer

In [77]:
import pandas as pd
df1 = pd.read_csv("cleanprojectdataset.csv")

In [78]:
print(df1)

                                                  Tweet    Text Label
0     .omg why are poc wearing fugly blue contacts s...  Non-Bullying
1     .Sorry but most of the runners popular right n...  Non-Bullying
2     .those jeans are hideous, and I?m afraid he?s ...  Non-Bullying
3     .I had to dress up for a presentation in class...  Non-Bullying
4     .Am I the only one who thinks justin bieber is...  Non-Bullying
...                                                 ...           ...
1060  No we are not, But you are a race baiting libt...      Bullying
1061  you wont get anyone for this challenge., after...      Bullying
1062  I will follow you if you are not a libtard,Mus...      Bullying
1063  michaelianblack Ur a child, an ostrich w/ your...      Bullying
1064  FoxNews. not to all the ppl I know that live t...      Bullying

[1065 rows x 2 columns]


In [79]:
#Tokenize words and labels into lists
Tweet = []
Labels = []

for row in df1["Tweet"]:
    #tokenize words
    words = word_tokenize(row)
    #remove punctuations
    clean_words = [word.lower() for word in words if word not in set(string.punctuation)]
    #remove stop words
    english_stops = set(stopwords.words('english'))
    characters_to_remove = ["''",'``',"rt","https","’","“","”","\u200b","--","n't","'s","...","//t.c" ]
    clean_words = [word for word in clean_words if word not in english_stops]
    clean_words = [word for word in clean_words if word not in set(characters_to_remove)]
    #Lematise words
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]
    Tweet.append(lemma_list)

    for row in df1["Text Label"]:
        Labels.append(row)

In [80]:
#combine them to create bag of words
combined = zip(Tweet, Labels)

In [81]:
#Create bag of words and dictionary object
def bag_of_words(words):
    return dict([(word, True) for word in words])

In [82]:
#Key, Value Pair into new list for modeling
Final_Data = []
for r, v in combined:
    bag_of_words(r)
    Final_Data.append((bag_of_words(r),v))

In [83]:
#random shuffle
import random
random.shuffle(Final_Data)
print(len(Final_Data))

1065


In [84]:
#Split the data into training set and testing 60/40 split
train_set, test_set = Final_Data[0:746], Final_Data[746:]

In [85]:
#import confusion matrix metrics and run Naive Bayes with Unigrams
import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics

#find accuracy
refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print("Naive Bayes Performance with Unigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

Naive Bayes Performance with Unigrams 
Accuracy: 0.6394984326018809


In [86]:
#find recall
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)

nbrefset = collections.defaultdict(set)
nbtestset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    nbrefset[label].add(i)
    observed = nb_classifier.classify(feats)
    nbtestset[observed].add(i)
    
print("UnigramNB Recall")
print('Bullying recall:', recall(nbtestset['Bullying'], nbrefset['Bullying']))

UnigramNB Recall
Bullying recall: 0.5542168674698795



In [87]:
#find most informative features
classifier.show_most_informative_features(n=10)

Most Informative Features
                    dumb = True           Bullyi : Non-Bu =      8.7 : 1.0
                   pussy = True           Bullyi : Non-Bu =      6.0 : 1.0
                  retard = True           Bullyi : Non-Bu =      6.0 : 1.0
               worthless = True           Bullyi : Non-Bu =      5.9 : 1.0
                 someone = True           Non-Bu : Bullyi =      5.9 : 1.0
                     big = True           Bullyi : Non-Bu =      5.6 : 1.0
                    last = True           Bullyi : Non-Bu =      5.6 : 1.0
               prejudice = True           Non-Bu : Bullyi =      5.6 : 1.0
                     low = True           Bullyi : Non-Bu =      5.4 : 1.0
                   loser = True           Bullyi : Non-Bu =      5.2 : 1.0


In [33]:
#Run Decision Tree for Unigrams to find recall
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
print("UnigramDT Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

UnigramDT Recall
Bullying recall: 0.7083333333333334



In [34]:
#Run Maxent Classifier for Unigrams
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
print("UnigramsLogit Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")
 

UnigramsLogit Recall
Bullying recall: 0.6989247311827957



In [35]:
#Run Support Vector Machine for Unigrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(gamma='auto'), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("UniigramSVM Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))

UniigramSVM Recall
Bullying recall: 0.6989247311827957


In [36]:
#Do the same thing with bigrams
from nltk import bigrams, trigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [37]:
combined = zip(Tweet,Labels)

In [38]:
#Bag of words for bigrams
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)  
    bigrams = bigram_finder.nbest(score_fn, n)  
    return bag_of_words(bigrams)

In [39]:
Final_Data2 =[]

for z, e in combined:
    bag_of_bigrams_words(z)
    Final_Data2.append((bag_of_bigrams_words(z),e))

In [40]:
import random
random.shuffle(Final_Data2)
print(len(Final_Data2))

1065


In [41]:
#split data again around 60/40
train_set, test_set = Final_Data2[0:747], Final_Data2[747:]

In [42]:
#Naive Bayes for Bigrams
import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics



refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
    
#Accuracy

print("Naive Bayes Performance with Bigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

Naive Bayes Performance with Bigrams 
Accuracy: 0.6163522012578616


In [43]:
#Informative Features for Bigrams
classifier.show_most_informative_features(n=10)

Most Informative Features
       ('piece', 'shit') = True           Bullyi : Non-Bu =     15.3 : 1.0
           ('low', 'iq') = True           Bullyi : Non-Bu =     14.7 : 1.0
  ('worthless', 'piece') = True           Bullyi : Non-Bu =     10.0 : 1.0
        ('fuck', 'goat') = True           Bullyi : Non-Bu =      2.6 : 1.0
      ('nobody', 'care') = True           Bullyi : Non-Bu =      2.4 : 1.0
     ('fucking', 'cunt') = True           Bullyi : Non-Bu =      2.2 : 1.0
           ('i\\', "'m") = True           Non-Bu : Bullyi =      1.9 : 1.0
           ('iq', 'low') = True           Bullyi : Non-Bu =      1.6 : 1.0
('1', 'low-iq-arabists') = True           Bullyi : Non-Bu =      1.6 : 1.0
        ('idiot', 'fag') = True           Bullyi : Non-Bu =      1.6 : 1.0


In [44]:
#Decision Tree for Bigrams
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
print("BigramDT Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

BigramDT Recall
Bullying recall: 0.7241379310344828



In [45]:
#Maxent Classifier for Bigrams
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
print("BigramsLogit Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

BigramsLogit Recall
Bullying recall: 0.6491228070175439



In [46]:
#Support Vecotr Machine for Bigrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(gamma='auto'), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("Bigrams Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))

Bigrams Recall
Bullying recall: 0.6491228070175439


In [47]:
combined = zip(Tweet,Labels)

In [48]:
#Same thing with Trigrams
from nltk import bigrams, trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

#Bag of words for Trigrams
def bag_of_trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)  
    trigrams = trigram_finder.nbest(score_fn, n)  
    return bag_of_words(trigrams)

In [49]:
#Final list for modeling
Final_Data3 =[]

for z, e in combined:
    bag_of_trigrams_words(z)
    Final_Data3.append((bag_of_trigrams_words(z),e))

import random
random.shuffle(Final_Data3)
print(len(Final_Data3))


1065


In [50]:
#60/40
train_set, test_set = Final_Data3[0:747], Final_Data3[747:]

In [51]:
#Naive Bayes for Trigrams
import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 


refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

In [52]:
#Accuracy
print("Naive Bayes Performance with Trigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

Naive Bayes Performance with Trigrams 
Accuracy: 0.6037735849056604


In [53]:
#Metrics
print('bullying precision:', precision(refsets['Bullying'], testsets['Bullying']))
print('bullying recall:', recall(refsets['Bullying'], testsets['Bullying']))

bullying precision: 0.7142857142857143
bullying recall: 0.03875968992248062


In [54]:
#Most informative features for Trigrams
classifier.show_most_informative_features(n=10)

Most Informative Features
('worthless', 'piece', 'shit') = True           Bullyi : Non-Bu =      9.5 : 1.0
('low-iq-arabists', 'trying', 'smear') = True           Bullyi : Non-Bu =      1.5 : 1.0
("don\\'t", 'identify', 'w/arabism') = True           Bullyi : Non-Bu =      1.5 : 1.0
('smear', 'racist', 'minority') = True           Bullyi : Non-Bu =      1.5 : 1.0
('racist', 'minority', "don\\'t") = True           Bullyi : Non-Bu =      1.5 : 1.0
('w/arabism', 'clueless', '+not') = True           Bullyi : Non-Bu =      1.5 : 1.0
('also', 'beating', 'woman') = True           Bullyi : Non-Bu =      1.5 : 1.0
('another', 'low', 'iq') = True           Bullyi : Non-Bu =      1.5 : 1.0
('identify', 'w/arabism', 'clueless') = True           Bullyi : Non-Bu =      1.5 : 1.0
('beating', 'woman', 'idiot') = True           Bullyi : Non-Bu =      1.5 : 1.0


In [55]:
#Decision Tree for Trigrams
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
print("TrigramDT Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

TrigramDT Recall
Bullying recall: 0.75



In [56]:
#Maxent Classifier for Trigrams
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
print("TrigramsLogit Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

TrigramsLogit Recall
Bullying recall: 0.7142857142857143



In [57]:
#Support Vector Machine for Trigrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(gamma='auto'), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("Trigrams Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))

Trigrams Recall
Bullying recall: 0.7142857142857143


In [58]:
combined = zip(Tweet,Labels)

In [59]:
#Combining Unigrams, Bigrams, and Trigrams for (N=3) modeling

# Import Bigram metrics - we will use these to identify the top 200 trigrams
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq,
n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams

from nltk.collocations import TrigramCollocationFinder

# Import Trigram metrics - we will use these to identify the top 200 trigrams
from nltk.metrics import TrigramAssocMeasures

def trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq,
n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    return trigrams

#Combined
def bag_of_Ngrams_words(words):
    bigramBag = bigrams_words(words)
    
    #The following two for loops convert tuple into string
    for b in range(0,len(bigramBag)):
        bigramBag[b]=' '.join(bigramBag[b])
   
    trigramBag = trigrams_words(words)
    for t in range(0,len(trigramBag)):
        trigramBag[t]=' '.join(trigramBag[t])
        
 #New bag of words

    return bag_of_words(trigramBag + bigramBag + words)

In [60]:
Final_Data4 =[]

for z, e in combined:
    bag_of_Ngrams_words(z)
    Final_Data4.append((bag_of_Ngrams_words(z),e))

In [61]:
#Naive Bayes for Ngrams
import random
random.shuffle(Final_Data4)
print(len(Final_Data4))

train_set, test_set = Final_Data4[0:747], Final_Data4[747:]

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 


refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

#Accuracy
print("Naive Bayes Performance with Ngrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))


1065
Naive Bayes Performance with Ngrams 
Accuracy: 0.6729559748427673


In [62]:
#Informative features for Ngrams
classifier.show_most_informative_features(n=10)

Most Informative Features
              piece shit = True           Bullyi : Non-Bu =      9.9 : 1.0
                  low iq = True           Bullyi : Non-Bu =      8.0 : 1.0
                 libtard = True           Bullyi : Non-Bu =      7.5 : 1.0
                   piece = True           Bullyi : Non-Bu =      7.5 : 1.0
         worthless piece = True           Bullyi : Non-Bu =      6.9 : 1.0
                  stupid = True           Bullyi : Non-Bu =      6.5 : 1.0
                   sorry = True           Bullyi : Non-Bu =      6.5 : 1.0
    worthless piece shit = True           Bullyi : Non-Bu =      6.3 : 1.0
                     low = True           Bullyi : Non-Bu =      6.3 : 1.0
               worthless = True           Bullyi : Non-Bu =      6.2 : 1.0


In [63]:
print('bullying precision:', precision(refsets['Bullying'], testsets['Bullying']))
print('bullying recall:', recall(refsets['Bullying'], testsets['Bullying']))

bullying precision: 0.5740740740740741
bullying recall: 0.7265625


In [64]:
#Decision Tree for Ngrams
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
print("NgramDT Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

NgramDT Recall
Bullying recall: 0.7454545454545455



In [65]:
#Maxent Classifier, Logistic Regression for Ngrams
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
print("NgramsLogit Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))
print("")

NgramsLogit Recall
Bullying recall: 0.5966386554621849



In [66]:
#Support Vector Machine for Ngrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(gamma='auto'), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("Ngrams Recall")
print('Bullying recall:', recall(testset['Bullying'], refset['Bullying']))

Ngrams Recall
Bullying recall: 0.5966386554621849


In [67]:
train_set, test_set = Final_Data[0:747], Final_Data[747:]

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure)
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
nb_classifier.show_most_informative_features(10)

from nltk.classify.util import accuracy
print(accuracy(nb_classifier, test_set))

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
    
for i, (Final_Data, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = nb_classifier.classify(Final_Data)
    testsets[observed].add(i)
    
print('bullying precision:', precision(refsets['Bullying'], testsets['Bullying']))
print('bullying recall:', recall(refsets['Bullying'], testsets['Bullying']))
print('bullying F-measure:', f_measure(refsets['Bullying'], testsets['Bullying']))
print('not-bullying precision:', precision(refsets['Non-Bullying'], testsets['Non-Bullying']))
print('not-bullying recall:', recall(refsets['Non-Bullying'], testsets['Non-Bullying']))
print('not-bullying F-measure:', f_measure(refsets['Non-Bullying'], testsets['Non-Bullying']))

Most Informative Features
                   piece = True           Bullyi : Non-Bu =     10.2 : 1.0
               worthless = True           Bullyi : Non-Bu =      8.9 : 1.0
                 libtard = True           Bullyi : Non-Bu =      7.7 : 1.0
                     low = True           Bullyi : Non-Bu =      6.4 : 1.0
                   lying = True           Bullyi : Non-Bu =      5.6 : 1.0
                   whole = True           Bullyi : Non-Bu =      5.6 : 1.0
                      ur = True           Bullyi : Non-Bu =      5.6 : 1.0
                  retard = True           Bullyi : Non-Bu =      5.5 : 1.0
                      iq = True           Bullyi : Non-Bu =      5.0 : 1.0
                    stop = True           Non-Bu : Bullyi =      5.0 : 1.0
0.6572327044025157
bullying precision: 0.581081081081081
bullying recall: 0.6466165413533834
bullying F-measure: 0.6120996441281138
not-bullying precision: 0.7235294117647059
not-bullying recall: 0.6648648648648648
not-bully