In [78]:
import pandas as pd
from sqlalchemy import create_engine
from string import digits
import numpy as np
import os
import nltk
import string
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt


In [79]:
# Path of the labelled training tweets, relative to the notebook's working directory.
csv = "train_E6oV3lV.csv"
# Read the CSV. Malformed rows are skipped with a warning: on_bad_lines='warn' is the
# replacement for error_bad_lines=False (with the default warn_bad_lines=True), which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(csv, encoding='unicode_escape', on_bad_lines='warn')
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,id,label,tweet
0,14,Hate Detected,@user #cnn calls #michigan middle school 'buil...
1,15,Hate Detected,no comment! in #australia #opkillingbay #se...
2,18,Hate Detected,retweet if you agree!
3,24,Hate Detected,@user @user lumpy says i am a . prove it lumpy.
4,35,Hate Detected,it's unbelievable that in the 21st century we'...


In [80]:
# Display the (rows, columns) size of the loaded training frame.
df.shape


(4927, 3)

In [81]:
# Convert the tweet column from the generic object dtype to pandas' string dtype.
df['tweet'] = df['tweet'].astype(pd.StringDtype())
# Display the per-column dtypes to confirm the conversion.
df.dtypes

id        int64
label    object
tweet    string
dtype: object

In [82]:
# Download the NLTK 'stopwords' corpus, which process_text reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/williammdavis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
# Text-cleaning helpers used as the CountVectorizer analyzer.

# Translation table that deletes every character in string.punctuation; built once
# so it is not rebuilt for each tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache of stop-word sets, keyed by language name.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process_text(text):
    """Split a piece of text into its non-stop-word tokens.

    Steps:
      1. Remove every character listed in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Drop tokens whose lowercase form is an NLTK English stop word.

    Kept tokens retain their original casing; only the stop-word comparison
    is case-insensitive.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Empty when the text
        contains nothing but punctuation, whitespace and stop words.
    """
    # The stop words are looked up once per call (and loaded from the corpus once per
    # session) instead of being re-read for every single word.
    stop_words = _english_stopwords()
    no_punctuation = text.translate(_PUNCTUATION_TABLE)
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [84]:
# Preview the tokenizer: apply process_text to the rows returned by head() and display the token lists.
df['tweet'].head().apply(process_text)

0    [user, cnn, calls, michigan, middle, school, b...
1    [comment, australia, opkillingbay, seashepherd...
2                                     [retweet, agree]
3              [user, user, lumpy, says, prove, lumpy]
4    [unbelievable, 21st, century, wed, need, somet...
Name: tweet, dtype: object

In [85]:
# Convert the collection of tweets into a sparse matrix of token counts (bag of words).
from sklearn.feature_extraction.text import CountVectorizer
# Passing a callable as `analyzer` makes the vectorizer use process_text to turn each raw tweet into tokens.
count_v  = CountVectorizer(analyzer=process_text)
# NOTE(review): the vocabulary is fitted on every row, before the train/test split,
# so held-out tweets also contribute to the vocabulary — consider fitting on the training rows only.
message_bow = count_v.fit_transform(df['tweet'])

In [86]:
# Print the stored entries of the bag-of-words matrix (coordinate and count per entry).
print(message_bow)

  (0, 11206)	1
  (0, 2272)	1
  (0, 1822)	1
  (0, 6903)	1
  (0, 6908)	1
  (0, 9330)	1
  (0, 1726)	1
  (0, 11432)	1
  (0, 2045)	1
  (0, 10468)	1
  (1, 2360)	1
  (1, 974)	1
  (1, 7751)	1
  (1, 9384)	1
  (1, 4982)	2
  (1, 10596)	1
  (2, 9032)	1
  (2, 474)	1
  (3, 11206)	2
  (3, 6536)	2
  (3, 9309)	1
  (3, 8519)	1
  (4, 11092)	1
  (4, 145)	1
  (4, 2011)	1
  :	:
  (4923, 6266)	1
  (4924, 11206)	2
  (4924, 6536)	1
  (4924, 3230)	1
  (4924, 9307)	1
  (4924, 10791)	1
  (4924, 1097)	1
  (4924, 1006)	1
  (4924, 4877)	1
  (4924, 11509)	1
  (4924, 3081)	1
  (4925, 6610)	1
  (4925, 10569)	1
  (4925, 4543)	2
  (4925, 884)	1
  (4925, 7682)	1
  (4925, 752)	1
  (4926, 11206)	1
  (4926, 9661)	1
  (4926, 10527)	1
  (4926, 11251)	1
  (4926, 1815)	1
  (4926, 11882)	1
  (4926, 2430)	1
  (4926, 371)	1


In [87]:
# Split the data into 80% training and 20% testing.
from sklearn.model_selection import train_test_split
# Features are the bag-of-words rows and targets are the 'label' column;
# random_state=0 seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(message_bow, df['label'],test_size=0.20, random_state=0)

In [88]:
# Display the bag-of-words matrix size as (number of tweets, vocabulary size).
message_bow.shape

(4927, 12625)

In [89]:
# Create a multinomial naive Bayes classifier and train it on the training split.
from sklearn.naive_bayes import MultinomialNB
# fit() returns the fitted estimator, so `classifier` is the trained model.
classifier =  MultinomialNB().fit(X_train, y_train)

In [90]:
# Print the labels the model predicts for the training rows...
print(classifier.predict(X_train))
# ...followed by the true training labels, for a quick visual comparison.
print(y_train.values)

['Hate Detected ' 'No Hate Detected' 'No Hate Detected' ...
 'No Hate Detected' 'No Hate Detected' 'No Hate Detected']
['Hate Detected ' 'No Hate Detected' 'No Hate Detected' ...
 'No Hate Detected' 'No Hate Detected' 'No Hate Detected']


In [91]:
# Evaluate the model on the training data set (the same rows it was fitted on).
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# `prediction` is reused by the confusion-matrix / accuracy cell that follows.
prediction = classifier.predict(X_train)
# Per-class precision, recall, F1 and support.
print(classification_report(y_train, prediction))
# Blank line after the report.
print()

                  precision    recall  f1-score   support

  Hate Detected        0.96      0.97      0.97      1778
No Hate Detected       0.98      0.97      0.97      2163

        accuracy                           0.97      3941
       macro avg       0.97      0.97      0.97      3941
    weighted avg       0.97      0.97      0.97      3941




In [92]:
# Confusion matrix for the training predictions (rows: true labels, columns: predicted labels).
print('Confusion Matrix: \n', confusion_matrix(y_train,prediction))
# Overall accuracy as a single fraction (the printed label says "Matrix", but the value is a scalar score).
print('Accuracy Matrix:',  accuracy_score(y_train,prediction))

Confusion Matrix: 
 [[1733   45]
 [  72 2091]]
Accuracy Matrix: 0.9703121035270236


In [93]:
# Print the labels the model predicts for the held-out test rows...
print(classifier.predict(X_test))
# ...followed by the true test labels, for a quick visual comparison.
print(y_test.values)


['Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'No Hate Detected' 'Hate Detected ' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'Hate Detected '
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'No Hate Detected' 'Hate Detected ' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'Hate Detected '
 'Hate Detected ' 'No Hate Detected' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'No Hate Detected' 'Hate Detected ' 'No Hate Detected'
 'Hate Detected ' 'Hate Detected ' 'No Hate Detected' 'No Hate Detected'
 'No Hate Detected' 'No Hate Detected' 'Hate Detected

In [94]:
# Evaluate the model on the held-out test data set (X_test / y_test).
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Overwrites `prediction` with the test-set predictions.
prediction = classifier.predict(X_test)
# Per-class precision, recall, F1 and support on the test rows.
print(classification_report(y_test, prediction))
print()
# Confusion matrix (rows: true labels, columns: predicted labels).
print('Confusion Matrix: \n', confusion_matrix(y_test,prediction))
print()
# Overall test accuracy as a single fraction (the printed label says "Matrix", but the value is a scalar score).
print('Accuracy Matrix:',  accuracy_score(y_test,prediction))

                  precision    recall  f1-score   support

  Hate Detected        0.85      0.88      0.86       464
No Hate Detected       0.89      0.86      0.87       522

        accuracy                           0.87       986
       macro avg       0.87      0.87      0.87       986
    weighted avg       0.87      0.87      0.87       986


Confusion Matrix: 
 [[407  57]
 [ 73 449]]

Accuracy Matrix: 0.8681541582150102


In [95]:
# Smoke-test the classifier on a benign sentence.
# NOTE: this identifier must stay pure ASCII. A Cyrillic "о" (U+043E) look-alike in the
# name creates a different variable that cannot be referenced by typing `no_hate_example`.
no_hate_example = ['flowers are pretty']
# Vectorize with the vocabulary learned during training (transform, not fit_transform).
input_bow = count_v.transform(no_hate_example)
# Show which vocabulary columns the sentence activated.
print(input_bow)
# Display the predicted label for the sentence.
classifier.predict(input_bow)

  (0, 4140)	1
  (0, 8384)	1


array(['No Hate Detected'], dtype='<U16')

In [96]:
# Smoke-test the classifier on a sentence expected to be flagged.
hate_example = ['get rid of africans']
# Vectorize with the vocabulary learned during training (transform, not fit_transform).
input_bow = count_v.transform(hate_example)
# Show which vocabulary columns the sentence activated.
print(input_bow)
# Display the predicted label for the sentence.
classifier.predict(input_bow)

  (0, 452)	1
  (0, 4461)	1
  (0, 9071)	1


array(['Hate Detected '], dtype='<U16')

In [109]:
# Second smoke test on a short phrase.
hate_example2 = ['build the wall']
# Vectorize with the vocabulary learned during training (transform, not fit_transform).
input_bow = count_v.transform(hate_example2)
# Show which vocabulary columns the phrase activated.
print(input_bow)
# Display the predicted label for the phrase.
classifier.predict(input_bow)

  (0, 1394)	1


array(['Hate Detected '], dtype='<U16')

In [97]:
# Load the collected timeline tweets that the trained classifier will label.
# Malformed rows are skipped with a warning: on_bad_lines='warn' is the replacement for
# error_bad_lines=False (with the default warn_bad_lines=True), which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
twitter_df = pd.read_csv("timeline_tweets.csv", encoding='unicode_escape', on_bad_lines='warn')
# Preview the first rows of the timeline frame.
twitter_df.head()



Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at
0,1389566279527411714,"The stone dates to 1819 âThe Belgian farmer,...",0,0,Tue May 04 13:03:12 +0000 2021
1,1389566111931318278,RT @chelywright: I came out of the closet that...,0,11,Tue May 04 13:02:32 +0000 2021
2,1389566108450107394,UK travel industry shares climb as hopes rise ...,4,0,Tue May 04 13:02:31 +0000 2021
3,1389566008449454083,A portrait of FDR hangs opposite Joe Bidenâs...,2,2,Tue May 04 13:02:07 +0000 2021
4,1389565994415370242,RT @samuelhabte: @jeru13954205 @FBedaso @Neami...,0,1,Tue May 04 13:02:04 +0000 2021


In [112]:
# Clean data: keep only the columns needed for classification by dropping the
# engagement counters and the timestamp. drop() returns a new frame, so twitter_df
# itself is left unchanged.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
clean_df = twitter_df.drop(columns=['favorite_count', 'retweet_count', 'created_at'])
# Preview the trimmed frame.
clean_df.head()

Unnamed: 0,tweet_id,text,Hate Speech Classifier
0,1389566279527411714,"The stone dates to 1819 âThe Belgian farmer,...",Hate Detected
1,1389566111931318278,RT @chelywright: I came out of the closet that...,No Hate Detected
2,1389566108450107394,UK travel industry shares climb as hopes rise ...,No Hate Detected
3,1389566008449454083,A portrait of FDR hangs opposite Joe Bidenâs...,Hate Detected
4,1389565994415370242,RT @samuelhabte: @jeru13954205 @FBedaso @Neami...,No Hate Detected


In [113]:
# Classify every timeline tweet in a single batch.
# Iterating over a DataFrame yields its column names, so wrapping this in
# `for tweet in twitter_df:` only repeated the identical whole-column work once per
# column; one vectorize-and-predict pass produces the same result.
# Vectorize with the vocabulary learned during training (transform, not fit_transform).
input_bow = count_v.transform(twitter_df['text'])
# Store one predicted label per tweet alongside the cleaned columns.
clean_df['Hate Speech Classifier'] = classifier.predict(input_bow)


In [115]:
# Preview the timeline tweets together with their predicted 'Hate Speech Classifier' label.
clean_df.head()

Unnamed: 0,tweet_id,text,Hate Speech Classifier
0,1389566279527411714,"The stone dates to 1819 âThe Belgian farmer,...",Hate Detected
1,1389566111931318278,RT @chelywright: I came out of the closet that...,No Hate Detected
2,1389566108450107394,UK travel industry shares climb as hopes rise ...,No Hate Detected
3,1389566008449454083,A portrait of FDR hangs opposite Joe Bidenâs...,Hate Detected
4,1389565994415370242,RT @samuelhabte: @jeru13954205 @FBedaso @Neami...,No Hate Detected


In [116]:
# Write the labelled tweets to CSV; index=False leaves the DataFrame index out of the file.
clean_df.to_csv('clean_tweet_analysis.csv', index = False)