In [82]:
import pandas as pd
from sqlalchemy import create_engine
from string import digits
import numpy as np
import os
import nltk
import string
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI


In [83]:
csv1 = "train_E6oV3lV.csv"
# Load the labelled tweets into a DataFrame.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas deprecated in 1.3
# and removed in 2.0. Both skip malformed rows and emit a warning for each one.
df = pd.read_csv(csv1, encoding='unicode_escape', on_bad_lines='warn')
df.head()


Unnamed: 0,id,label,tweet
0,14,Hate Detected,@user #cnn calls #michigan middle school 'buil...
1,15,Hate Detected,no comment! in #australia #opkillingbay #se...
2,18,Hate Detected,retweet if you agree!
3,24,Hate Detected,@user @user lumpy says i am a . prove it lumpy.
4,35,Hate Detected,it's unbelievable that in the 21st century we'...


In [84]:
# Store the tweet text with pandas' dedicated string dtype, then list every column's dtype.
df['tweet'] = df['tweet'].astype('string')
df.dtypes

id        int64
label    object
tweet    string
dtype: object

In [85]:
# Fetch the NLTK 'stopwords' corpus that process_text relies on.
# The call returns True; the recorded log shows NLTK reporting the package as already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/williammdavis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [86]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and then cached."""
    return frozenset(stopwords.words('english'))


def process_text(text):
    """Tokenize a piece of text for the bag-of-words model.

    Every character found in ``string.punctuation`` is removed, the remainder is
    split on whitespace, and tokens whose lower-cased form is an English stop word
    are dropped. Surviving tokens keep their original casing.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    nopunc = ''.join(char for char in text if char not in string.punctuation)
    # The stop-word list is loop-invariant: fetch the cached set once instead of
    # rebuilding the list (and scanning it linearly) for every single token.
    stop_words = _english_stopwords()
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [87]:
# Preview the tokenizer's output on the first five tweets.
df.tweet.head().apply(process_text)

0    [user, cnn, calls, michigan, middle, school, b...
1    [comment, australia, opkillingbay, seashepherd...
2                                     [retweet, agree]
3              [user, user, lumpy, says, prove, lumpy]
4    [unbelievable, 21st, century, wed, need, somet...
Name: tweet, dtype: object

In [88]:
# Build the bag-of-words representation: process_text supplies the tokens for each
# tweet, and the fitted vectorizer turns them into a document-by-token count matrix.
from sklearn.feature_extraction.text import CountVectorizer

count_v = CountVectorizer(analyzer=process_text)
message_bow = count_v.fit_transform(df.tweet)

In [89]:
# Dump the bag-of-words matrix; each printed entry is a (row, column) pair followed by the stored count.
print(message_bow)

  (0, 11206)	1
  (0, 2272)	1
  (0, 1822)	1
  (0, 6903)	1
  (0, 6908)	1
  (0, 9330)	1
  (0, 1726)	1
  (0, 11432)	1
  (0, 2045)	1
  (0, 10468)	1
  (1, 2360)	1
  (1, 974)	1
  (1, 7751)	1
  (1, 9384)	1
  (1, 4982)	2
  (1, 10596)	1
  (2, 9032)	1
  (2, 474)	1
  (3, 11206)	2
  (3, 6536)	2
  (3, 9309)	1
  (3, 8519)	1
  (4, 11092)	1
  (4, 145)	1
  (4, 2011)	1
  :	:
  (4923, 6266)	1
  (4924, 11206)	2
  (4924, 6536)	1
  (4924, 3230)	1
  (4924, 9307)	1
  (4924, 10791)	1
  (4924, 1097)	1
  (4924, 1006)	1
  (4924, 4877)	1
  (4924, 11509)	1
  (4924, 3081)	1
  (4925, 6610)	1
  (4925, 10569)	1
  (4925, 4543)	2
  (4925, 884)	1
  (4925, 7682)	1
  (4925, 752)	1
  (4926, 11206)	1
  (4926, 9661)	1
  (4926, 10527)	1
  (4926, 11251)	1
  (4926, 1815)	1
  (4926, 11882)	1
  (4926, 2430)	1
  (4926, 371)	1


In [98]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state keeps the partition identical across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    message_bow,
    df['label'],
    test_size=0.20,
    random_state=0,
)

In [99]:
# Dimensions of the bag-of-words matrix as (rows, columns).
message_bow.shape

(4927, 12625)

In [100]:
# Fit a multinomial Naive Bayes model on the training portion of the count matrix.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [101]:
# Training set: predicted labels on the first line(s), true labels underneath.
print(classifier.predict(X_train), y_train.values, sep='\n')

['Hate Detected ' 'No Hate Detected' 'No Hate Detected' ...
 'No Hate Detected' 'No Hate Detected' 'No Hate Detected']
['Hate Detected ' 'No Hate Detected' 'No Hate Detected' ...
 'No Hate Detected' 'No Hate Detected' 'No Hate Detected']


In [102]:
# Score the model on the data it was trained on (an optimistic baseline).
# `prediction` is reused by the confusion-matrix cell that follows.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

prediction = classifier.predict(X_train)
print(classification_report(y_train, prediction), end='\n\n')

                  precision    recall  f1-score   support

  Hate Detected        0.96      0.97      0.97      1778
No Hate Detected       0.98      0.97      0.97      2163

        accuracy                           0.97      3941
       macro avg       0.97      0.97      0.97      3941
    weighted avg       0.97      0.97      0.97      3941




In [103]:
# Training-set confusion matrix and overall accuracy.
# accuracy_score returns a single fraction, not a matrix, so the label says 'Accuracy'.
print('Confusion Matrix: \n', confusion_matrix(y_train, prediction))
print('Accuracy:', accuracy_score(y_train, prediction))

Confusion Matrix: 
 [[1733   45]
 [  72 2091]]
Accuracy Matrix: 0.9703121035270236


In [104]:
# Held-out test set: predicted labels first, true labels underneath.
print(classifier.predict(X_test), y_test.values, sep='\n')


['Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'No Hate Detected' 'Hate Detected ' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'Hate Detected '
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'No Hate Detected' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'No Hate Detected' 'Hate Detected

In [105]:
# Evaluate the model on the held-out TEST set (X_test / y_test), i.e. data it never saw in training.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
prediction = classifier.predict(X_test)
print(classification_report(y_test, prediction))
print()
print('Confusion Matrix: \n', confusion_matrix(y_test, prediction))
print()
# accuracy_score returns a single fraction, not a matrix, so the label says 'Accuracy'.
print('Accuracy:', accuracy_score(y_test, prediction))

                  precision    recall  f1-score   support

  Hate Detected        0.85      0.88      0.86       464
No Hate Detected       0.89      0.86      0.87       522

        accuracy                           0.87       986
       macro avg       0.87      0.87      0.87       986
    weighted avg       0.87      0.87      0.87       986


Confusion Matrix: 
 [[407  57]
 [ 73 449]]

Accuracy Matrix: 0.8681541582150102


In [106]:
# Sanity check on a benign sentence.
# The original cell transformed `example`, a name this notebook never defines, so it
# failed on a fresh kernel; the list defined here is now the one that is vectorized.
no_hate_example = ['flowers are pretty']
input_bow = count_v.transform(no_hate_example)
print(input_bow)
classifier.predict(input_bow)

  (0, 4140)	1
  (0, 8384)	1


array(['No Hate Detected'], dtype='<U16')

In [107]:
# Sanity check on a hateful sentence: vectorize it with the fitted vocabulary,
# show the resulting sparse row, then display the model's predicted label.
hate_example = ['get rid of africans']
input_bow = count_v.transform(hate_example)
print(input_bow)
classifier.predict(input_bow)

  (0, 452)	1
  (0, 4461)	1
  (0, 9071)	1


array(['Hate Detected '], dtype='<U16')