In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import nltk
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
import string
import contractions
from unidecode import unidecode
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from autocorrect import Speller
from nltk.stem import WordNetLemmatizer
from string import punctuation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
data = pd.read_csv("HateSpeechDetection.csv")

In [None]:
data.head()

In [None]:
# Drop the unused Platform column. errors="ignore" keeps this cell re-runnable:
# once the column is gone, a second run would otherwise raise KeyError.
data = data.drop(columns="Platform", errors="ignore")

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.shape

In [None]:
data['Hateful'].value_counts()

Oversampling

In [None]:
# Split the frame by label, then resample the label-1 rows with replacement
# until there are as many of them as label-0 rows with a non-null Comment.
no_hate = data[data['Hateful'] == 0]
hate = data[data['Hateful'] == 1]
hateful_0_count = no_hate['Comment'].count()
# Fixed seed so the same rows are drawn on every Restart & Run All.
hate_oversample = hate.sample(hateful_0_count, replace=True, random_state=1)
data_oversampled = pd.concat([no_hate, hate_oversample], axis=0)
# NOTE(review): rows are duplicated here, before train_test_split, so copies of
# one comment can end up in both the train and the test split -- confirm the
# reported test accuracies are meant to be read with that in mind.

print('Random over-sampling:')
print(data_oversampled['Hateful'].value_counts())

In [None]:
data_oversampled.info()

Visualization

In [None]:
# One word cloud per class, drawn side by side from the original (not
# oversampled) comments.
fig, axs = plt.subplots(1, 2, figsize=(16, 8))

# Concatenate every comment of a class into a single string for WordCloud.
text_pos = " ".join(data.loc[data.Hateful == 0, 'Comment'])
text_neg = " ".join(data.loc[data.Hateful == 1, 'Comment'])
data_pos = WordCloud(collocations=False, background_color='white').generate(text_pos)
data_neg = WordCloud(collocations=False, background_color='black').generate(text_neg)

for panel, cloud, heading in zip(
    axs,
    (data_pos, data_neg),
    ('Non-Hate Comments', 'Hate Comments'),
):
    panel.imshow(cloud, interpolation='bilinear')
    panel.axis('off')
    panel.set_title(heading)

plt.show()

Vertical Splitting

In [None]:
# Feature texts and labels pulled from the oversampled frame as NumPy arrays.
# `x` is captured here, before the "Data Preprocessing" cell runs; that cell
# assigns a new Comment column on the frame and does not reassign `x`.
x =data_oversampled["Comment"].values
y =data_oversampled["Hateful"].values

In [None]:
x.shape

In [None]:
y.shape

In [None]:
data_oversampled

Data Preprocessing

In [None]:
stopword_list = stopwords.words("english")
# Set copy of the stop words: membership is tested once per token, and a set
# lookup avoids rescanning the whole list each time.
_stopword_set = frozenset(stopword_list)

def clean_data(data):
    """Tokenize one comment and keep only its informative words.

    Args:
        data: The comment text (a single string).

    Returns:
        list[str]: Lower-cased tokens that are purely alphabetic, longer than
        two characters, and not in the NLTK English stop-word list.
    """
    # isalpha() already rejects punctuation-only tokens, so no separate
    # punctuation test is needed.
    return [
        token.lower()
        for token in word_tokenize(data)
        if len(token) > 2 and token.isalpha() and token.lower() not in _stopword_set
    ]

def lemmatization(data1):
    """Lemmatize every token and join the results into one string.

    Args:
        data1: Iterable of word tokens.

    Returns:
        str: The lemmatized tokens separated by single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in data1)


In [None]:
# Clean/tokenize each comment, then lemmatize it and re-join it into a string.
data_oversampled["Comment"] = (
    data_oversampled["Comment"]
    .apply(clean_data)
    .apply(lemmatization)
)

In [None]:
data_oversampled

Vectorization

In [None]:
tfidf=TfidfVectorizer()

In [None]:
# Vectorize the cleaned, lemmatized Comment column. The earlier `x` array was
# taken before the preprocessing cell ran, so fitting on it would ignore that
# work. Reading from the frame also keeps this cell re-runnable; row order still
# matches `y`, which comes from the same frame.
x=tfidf.fit_transform(data_oversampled["Comment"])

In [None]:
x=x.toarray()

In [None]:
x.shape

Horizontal Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [None]:
print("xtrain: ", xtrain.shape)
print("ytrain: ", ytrain.shape)
print("xtest: ", xtest.shape)
print("ytest: ", ytest.shape)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model1 = DecisionTreeClassifier()

In [None]:
model1.fit(xtrain,ytrain)

In [None]:
predict1=model1.predict(xtest)

In [None]:
accuracy1 = accuracy_score(ytest,predict1)

In [None]:
print("Confusion Matrix: \n", confusion_matrix(ytest, predict1))

In [None]:
print("Accuracy of Decision Tree :", accuracy1)

In [None]:
test_data = "Desecrate men by making them gay/trannies, and women by making them abort children and become psycho feminist cunts etc"
# Use a dedicated name: assigning to `data` would overwrite the DataFrame
# loaded at the top of the notebook.
test_vector = tfidf.transform([test_data]).toarray()
print(model1.predict(test_vector))

In [None]:
test_data = "Saudi Arabia has tv? I thought they just watched public rapes for entertainment"
# Use a dedicated name: assigning to `data` would overwrite the DataFrame
# loaded at the top of the notebook.
test_vector = tfidf.transform([test_data]).toarray()
print(model1.predict(test_vector))

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model2=LogisticRegression()

In [None]:
model2.fit(xtrain,ytrain)

In [None]:
predict2=model2.predict(xtest)

In [None]:
accuracy2 = accuracy_score(ytest,predict2)

In [None]:
print("Accuracy of Logistic Regression :",accuracy2)

In [None]:
print("Confusion Matrix: \n", confusion_matrix(ytest, predict2))

Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
model3= BernoulliNB(binarize= 0.0)

In [None]:
model3.fit(xtrain, ytrain)

In [None]:
predict3= model3.predict(xtest)

In [None]:
accuracy3 = accuracy_score(ytest, predict3)

In [None]:
print("Accuracy of Bernouli Naive Bayes :",accuracy_score(ytest,predict3))

In [None]:
print("Confusion Matrix: \n", confusion_matrix(ytest, predict3))

In [None]:
test_data = "you are sweet"
df = tfidf.transform([test_data]).toarray()
print(model3.predict(df))

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model4 = RandomForestClassifier(n_estimators = 50)

In [None]:
model4.fit(xtrain, ytrain)

In [None]:
predict4 = model4.predict(xtest)

In [None]:
accuracy4 = accuracy_score(ytest, predict4)
print("Accuracy Score:", accuracy4)

In [None]:
test_data = "The Irony of calling the Asian guy/girl a monkey is just too funny to be true."
df = tfidf.transform([test_data]).toarray()
print(model4.predict(df))

In [None]:
test_data = "you are good"
df = tfidf.transform([test_data]).toarray()
print(model4.predict(df))

Gradient Boosting

In [None]:
# Fit the min-max scaling on the training split only, then apply that same
# fitted scaling to the test split.
scaler=MinMaxScaler() #normalizing
xtrain_scaled = scaler.fit_transform(xtrain)
xtest_scaled = scaler.transform(xtest)

In [None]:
lr_list=[0.05,0.075,0.1,0.25,0.5,0.75,1]

In [None]:
# Sweep the candidate learning rates and report train/test accuracy for each,
# to see which rate scores best.
for learning_rate in lr_list:
  # Same configuration every iteration; only the learning rate varies.
  model5 = GradientBoostingClassifier(
      n_estimators=100,
      learning_rate=learning_rate,
      max_features=2,
      max_depth=2,
      random_state=0,
  )
  model5.fit(xtrain_scaled,ytrain)
  predict5 = model5.predict(xtest_scaled)
  train_accuracy = model5.score(xtrain_scaled,ytrain)
  test_accuracy = model5.score(xtest_scaled,ytest)

  print("Learning Rate: ", learning_rate)
  print("Accuracy_score (training): {0:.3f}".format(train_accuracy))
  print("Accuracy_score (testing): {0:.3f}".format(test_accuracy))

In [None]:
model5 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.25,max_features=2,max_depth=2,random_state=0)
model5.fit(xtrain_scaled,ytrain)
predict5 = model5.predict(xtest_scaled)

In [None]:
print("Accuracy Score of Gradient Boosting: ", model5.score(xtest_scaled, ytest))

In [None]:
test_data = "The Irony of calling the Asian guy/girl a monkey is just too funny to be true. "
df = tfidf.transform([test_data]).toarray()
# model5 was fitted on MinMax-scaled features, so the sample must go through
# the same fitted scaler before prediction.
print(model5.predict(scaler.transform(df)))

Ada Boosting

In [None]:
model6=AdaBoostClassifier(n_estimators=50)

In [None]:
# Call the method so the cell shows the parameter dict rather than the
# bound-method repr.
model6.get_params()

In [None]:
model6.fit(xtrain_scaled,ytrain)

In [None]:
# model6 was fitted on xtrain_scaled, so predict on the matching scaled test set.
y_pred=model6.predict(xtest_scaled)

In [None]:
print(accuracy_score(ytest,y_pred))

XGBoost

In [None]:
import xgboost as xgb

In [None]:
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_classifier.fit(xtrain, ytrain)
y_pred = xgb_classifier.predict(xtest)

In [None]:
print("Accuracy Score XGBoost: ", accuracy_score(ytest, y_pred))

Stacking

In [None]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor

In [None]:

def get_stacking():
  """Build the stacking ensemble.

  Returns:
    StackingRegressor: KNN, decision-tree and SVR base regressors whose
    predictions are combined by a LinearRegression final estimator.
  """
  # NOTE(review): these are regressors, while the notebook's target appears to
  # be the 0/1 `Hateful` label -- confirm a StackingClassifier was not intended.
  base_learners=[
    ('knn',KNeighborsRegressor()),
    ('cart',DecisionTreeRegressor()),
    ('svm',SVR()),
  ]
  return StackingRegressor(estimators=base_learners,final_estimator=LinearRegression())

In [None]:
#retrieving models
def get_model():
  """Return the models to evaluate, keyed by display name.

  Only the stacking ensemble is included.
  """
  return {'stacking':get_stacking()}

In [None]:
def evaluate_model(model,x,y):
  """Cross-validate `model` on (x, y) and return the per-fold scores.

  Uses 10-fold cross-validation repeated 3 times (random_state=1) and scores
  each fold by negative mean absolute error.
  """
  folds=RepeatedKFold(n_splits=10,n_repeats=3,random_state=1)
  return cross_val_score(model,x,y,scoring='neg_mean_absolute_error',cv=folds)

In [None]:
# Cross-validate every model returned by get_model() and print its mean score.
models=get_model()
results,names=list(),list()
for name,model in models.items():
  scores=evaluate_model(model,x,y)
  results.append(scores)
  # Store the label, not the estimator object, so `names` lines up with `results`.
  names.append(name)
  print(name,mean(scores))



In [None]:
# Summary of the scores computed above. Each line reads the variable that
# belongs to the model it names.
print("Accuracy of Decision Tree :",accuracy1)
print("Accuracy of Logisitc Regression :",accuracy2)
print("Accuracy of Naives Bayes:",accuracy3)
print("Accuracy of Random Forest :",accuracy4)
# `y_pred` is overwritten by the XGBoost cell, so re-predict with the AdaBoost
# model on the scaled test set it was fitted for.
print("Accuracy of Ada Bosting :",accuracy_score(ytest,model6.predict(xtest_scaled)))
# The stacking model is a regressor scored by cross-validated negative MAE, so
# report that score instead of an accuracy.
print("Mean neg. MAE of Stacking (cross-validated) :",mean(results[-1]))

Evaluating Accuracy Score

In [None]:
# Bar chart of the test accuracies computed above. Each label is paired with
# the accuracy variable of the model it names; no SVM is trained in this
# notebook, so the fourth bar shows the decision tree instead.
objects = ('Logistic', 'RandomForest', 'Naive_bayes', 'DecisionTree')
y_pos = np.arange(len(objects))
performance = [accuracy2, accuracy4, accuracy3, accuracy1]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Algorithm Comparision')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curves for the classifiers fitted on the unscaled features. The curve
# points and AUC are computed here from each model's positive-class
# probability on the test split. No K-Nearest Neighbors classifier is trained
# in this notebook, so it has no curve.
roc_models = [
    ("Naive Bayes", model3),
    ("Decision Tree", model1),
    ("Logistic Regression", model2),
    ("Random Forest", model4),
]
for roc_label, fitted_model in roc_models:
    # Column 1 of predict_proba is the probability of the second class in
    # fitted_model.classes_ (label 1 when the labels are 0/1).
    positive_scores = fitted_model.predict_proba(xtest)[:, 1]
    fpr, tpr, _ = roc_curve(ytest, positive_scores)
    auc_value = roc_auc_score(ytest, positive_scores)
    plt.plot(fpr, tpr, label=roc_label + ", AUC=" + str(auc_value))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')

#add legend
plt.legend()