## Importing The Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing The Dataset

In [2]:
# Load the labelled dataset from a CSV file. pandas' default delimiter and
# quoting are used (no such arguments are passed here).
# Later cells read the 'Text' column and use the last column as the label.
dataset = pd.read_csv('HateSpeechDataMod.csv')

## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Number of leading rows of `dataset` that are cleaned into `corpus`.
N_SAMPLES = 5000

# Build the stemmer and the stopword list once, not on every iteration.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stopword list so it stays in the cleaned text.
all_stopwords.remove('not')
# A set gives constant-time membership tests and is built a single time.
stopword_set = set(all_stopwords)


def clean_text(raw_text):
    """Normalise one document for vectorisation.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, stopwords (with 'not' kept)
    are dropped, and the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    raw_text : str
        Raw text of a single document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
    tokens = letters_only.lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)


# .iloc slices by position and stops at the end of the column, so a dataset
# with fewer than N_SAMPLES rows is processed in full instead of raising.
corpus = [clean_text(raw_text) for raw_text in dataset['Text'].iloc[:N_SAMPLES]]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words Model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts; the vocabulary is capped at 13000 terms.
cv = CountVectorizer(max_features=13000)
x = cv.fit_transform(corpus).toarray()
# Take labels only for the rows that were cleaned into `corpus`, so that x
# and y always have the same number of rows (the last column is the label).
y = dataset.iloc[:len(corpus), -1].values

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned corpus.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its IDF weights also see the test documents.
tfidf_vect = TfidfVectorizer()
x_tfidf = tfidf_vect.fit_transform(corpus)
# Dense DataFrame: one row per document, one column per vocabulary term.
x2 = pd.DataFrame(x_tfidf.toarray())
# Derive the label slice from the corpus size instead of repeating a literal,
# so features and labels cannot drift apart if the corpus size changes.
y2 = dataset.iloc[:len(corpus), -1].values

In [6]:
# Sanity check: (number of documents, number of TF-IDF vocabulary terms).
x2.shape

(5000, 2638)

## Splitting the dataset into Training set and Test set

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x2,
    y2,
    test_size=0.2,
    random_state=0,
)

## Training the Classification Model on the Training Set (Logistic Regression is enabled below; SVM was Best)

In [22]:
# from sklearn.naive_bayes import GaussianNB
# classifier= GaussianNB()
# classifier.fit(x_train, y_train)

In [23]:
# from sklearn.tree import DecisionTreeClassifier
# classifier=DecisionTreeClassifier(criterion='entropy')
# classifier.fit(x_train,y_train)

In [24]:
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# classifier.fit(x_train, y_train)

In [25]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed and a raised iteration limit.
classifier = LogisticRegression(
    max_iter=40623,
    random_state=0,
)
# fit() is the last expression of the cell, so the fitted estimator's repr
# is shown as the cell output.
classifier.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=40623,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# from sklearn.ensemble import RandomForestClassifier
# classifier=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
# classifier.fit(x_train,y_train)

In [27]:
# from sklearn.svm import SVC
# classifier=SVC(kernel='linear', random_state=0)
# classifier.fit(x_train,y_train)

In [28]:
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'rbf', random_state = 0)
# classifier.fit(x_train, y_train)

## Predicting the Test set Results

In [29]:
# Predict a label for every row of the held-out test features.
y_pred = classifier.predict(X=x_test)

In [19]:
# Display the predicted labels for the test set.
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,

In [None]:
# Show predictions next to the true labels: column 0 is y_pred, column 1 is
# y_test, one row per test sample.
pred_column = y_pred.reshape(len(y_pred), 1)
true_column = y_test.reshape(len(y_test), 1)
print(np.concatenate((pred_column, true_column), axis=1))

## Making the Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test samples predicted correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[279  45]
 [  9 667]]


0.946

In [31]:
from sklearn.metrics import recall_score

# Recall on the test set, with sklearn's default positive label and averaging.
recall_score(y_true=y_test, y_pred=y_pred)

0.9866863905325444

In [32]:
from sklearn.metrics import precision_score

# Precision on the test set, with sklearn's default positive label and averaging.
precision_score(y_true=y_test, y_pred=y_pred)

0.9367977528089888

In [33]:
# Classify a single new sentence with the trained model.
new_review = 'I love gays'
# Apply the same cleaning steps used to build the training corpus: keep
# letters only, lower-case, tokenise, drop stopwords (keeping 'not'), stem.
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once rather than once per token.
new_stopword_set = set(all_stopwords)
new_review = [ps.stem(word) for word in new_review if word not in new_stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform() (not fit_transform) reuses the vocabulary and IDF weights that
# were learned from the training corpus. The result gets its own name so the
# training matrix `x_tfidf` is not overwritten.
new_x_tfidf = tfidf_vect.transform(new_corpus)
new_X_test = pd.DataFrame(new_x_tfidf.toarray())
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]
