# Case Study Module 2: Hate Speech Detection

### By: Sri Krishna Priya Kondapalli

In [2]:
# Importing the necessary libraries
import pandas as pd 
import numpy as np 
import nltk
import re
import string
import unidecode
import pickle
import os
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sqlite3

  from pandas.core import (


## 1. Initial Checks on the Dataset

In [3]:
# Reading the dataset from local directory using pandas
df_train = pd.read_csv(r"P:\University of munster\Module-2_Data Management\Case study-2\TwitterData\train.csv", encoding='ISO-8859-1')
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# checking the dataset
df_train.tail()

Unnamed: 0,id,label,tweet
31957,31958,0,ate @user isz that youuu?ð??ð??ð??ð??ð??ð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."
31961,31962,0,thank you @user for you follow


In [5]:
#Checking the shape of the data
df_train.shape

(31962, 3)

In [6]:
#Checking for datatype information and null values
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [7]:
# checking for duplicates in the dataset if any and dropping them
df_train.drop_duplicates(inplace = True)

# There are no duplicate values

## 2. Data Cleaning

In [8]:
# Intialising Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words("english")

# Defining a preprocessor of data cleaning
def preprocessor(tweet):
    
    # Removal of user handles
    tweet = re.sub('@[\w\-]+','', tweet)
    
    # Coverting the string into lower case
    tweet = str(tweet).lower()
    
    tweet = re.sub('\[.*?\]','',tweet)
    
    # Removal of HTML linkups
    tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|''[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',tweet)
    tweet = re.sub('<.*?>+', '', tweet)
    
    # Removal of punctuations
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
    tweet = re.sub('\n','',tweet)
    tweet = re.sub('\w*\d\w*', '', tweet)
    
    # Removal of stopwords
    tweet = [word for word in tweet.split(' ') if word not in stopwords]
    
    #removal of greek characters
    tweet = [' '.join([unidecode.unidecode(word) for word in str(t).split()]) if t is not None else t for t in tweet]
    
    #lemmetizing of tweets
    tweet = [" ".join(lemmatizer.lemmatize(word) for word in t.split()) for t in tweet]
    
    tweet = " ".join(tweet)
    return tweet

**Documenting decisions:**

**Duplicates:** Removed to prevent skewed results.                                                     
**URLs and Mentions:** Removed to keep the focus on content rather than specific user references.     
**Normalization (Lower Text and Lemmatization):** Performed to standardize text.                       
**Stopwords and greek characters removal:** Applied to reduce noise, retaining relevant words for analysis.

In [9]:
# Applying preprocessor on the tweet column
df_train['tweet'] = df_train['tweet'].apply(preprocessor)
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drag kid dysfun...
1,2,0,thanks lyft credit cant use cause dont offer...
2,3,0,bihday majesty
3,4,0,model love u take u time urd+- ddddd|d|d|
4,5,0,factsguide society motivation


## 3. Model Training 

In [11]:
# assigning labels to Y as an array
y = np.array(df_train['label'])

#Converting the text to Bag of words using countvectorizer
bog = CountVectorizer(ngram_range = (2,2))
train_data = bog.fit_transform(df_train['tweet'])


In [13]:
# intialising the train test split
X_train, X_test, y_train, y_test = train_test_split(train_data,y, test_size = 0.33, random_state = 42)

In [15]:
# Creating logistic Regression instance and fitting the data
lr = LogisticRegression()
lr.fit(X_train, y_train)


In [16]:
# Predicting the data using the classifier instance and checking the accuracy
prediction2 = lr.predict(X_test)
accuracy_score(y_test,prediction2)

0.9456769055745164

In [11]:
# Creating the random forest classifier instance and fitting the data
random_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
random_classifier.fit(X_train,y_train)

In [12]:
# Predicting the data using the classifier instance and checking the accuracy
prediction3 = random_classifier.predict(X_test)
accuracy_score(y_test,prediction3)

0.9510807736063709

In [10]:
# Creating the SGDClassifier instance and fitting the data
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)

In [14]:
# Training and predicting the data using classifier and checking the accuracy
clf.partial_fit(train_data, train_labels, classes=[0, 1])
prediction4 = clf.predict(X_test)
accuracy_score(y_test,prediction4)

0.9555365946150929

## 4. Predicting on test.csv

In [15]:
# Importing test.csv from local directory
df_test = pd.read_csv(r"P:\University of munster\Module-2_Data Management\Case study-2\TwitterData\test.csv", encoding = 'ISO-8859-1')
df_test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [16]:
#Applying preprocessor for test.csv
test = df_test['tweet'].apply(preprocessor)
test = bog.transform(test)

In [20]:
#Testing the model prediction with text.csv
predict = random_classifier.predict(test)

#Making predictions for tweets in test.csv
df_test['predictions'] = predict
print(df_test[['tweet', 'predictions']])

                                                   tweet  predictions
0      #studiolife #aislife #requires #passion #dedic...            0
1       @user #white #supremacists want everyone to s...            0
2      safe ways to heal your #acne!!    #altwaystohe...            0
3      is the hp and the cursed child book up for res...            0
4        3rd #bihday to my amazing, hilarious #nephew...            0
...                                                  ...          ...
17192  thought factory: left-right polarisation! #tru...            1
17193  feeling like a mermaid Ã°ÂÂÂ #hairflip #nev...            0
17194  #hillary #campaigned today in #ohio((omg)) &am...            0
17195  happy, at work conference: right mindset leads...            0
17196  my   song "so glad" free download!  #shoegaze ...            0

[17197 rows x 2 columns]


In [19]:
#Testing the model and making prediction with text.csv
predict2 = clf.predict(test)
df_test['predictions2'] = predict2
print(df_test[['tweet', 'predictions2']])

                                                   tweet  predictions2
0      #studiolife #aislife #requires #passion #dedic...             0
1       @user #white #supremacists want everyone to s...             0
2      safe ways to heal your #acne!!    #altwaystohe...             0
3      is the hp and the cursed child book up for res...             0
4        3rd #bihday to my amazing, hilarious #nephew...             0
...                                                  ...           ...
17192  thought factory: left-right polarisation! #tru...             1
17193  feeling like a mermaid Ã°ÂÂÂ #hairflip #nev...             0
17194  #hillary #campaigned today in #ohio((omg)) &am...             0
17195  happy, at work conference: right mindset leads...             0
17196  my   song "so glad" free download!  #shoegaze ...             0

[17197 rows x 2 columns]


**As SGDClassifier has highest accuracy and is giving better prediction. I will be using it for the further process**

## 5. Serializing and Deserializing of objects

In [20]:
# Using pickle to seriealise - stopwords, classifier(SGDClassifier) and countVectorizer.
file_dest = os.path.join('HateSpeechDetection', 'pkl_objects')
if not os.path.exists(file_dest):
    os.makedirs(file_dest)

pickle.dump(stopwords, open(os.path.join(file_dest, 'stopwords.pkl'), 'wb'), protocol=4)   
pickle.dump(clf, open(os.path.join(file_dest, 'classifier.pkl'), 'wb'), protocol=4)
pickle.dump(bog, open(os.path.join(file_dest, 'vect_bog.pkl'), 'wb'), protocol=4)

In [21]:
#checking the currect directory
os.getcwd()

'C:\\Users\\Krishnapriya\\University of Munster\\Module 2'

The below code is used to write preprocessor and CountVectorizer into a python file to reuse it whenever necessary. 

In [22]:
%%writefile HateSpeechDetection/vectorizer.py
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import re
import os
import pickle

# Deserializing the pickle files 
cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))
vect = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'vect_bog.pkl'), 'rb'))


lemmatizer = WordNetLemmatizer()

def preprocessor(tweet):
    
    # Removal of user handles
    tweet = re.sub('@[\w\-]+','', tweet)
    
    # Coverting the string into lower case
    tweet = str(tweet).lower()
    
    tweet = re.sub('\[.*?\]','',tweet)
    
    # Removal of HTML linkups
    tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|''[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',tweet)
    tweet = re.sub('<.*?>+', '', tweet)
    
    # Removal of punctuations
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
    tweet = re.sub('\n','',tweet)
    tweet = re.sub('\w*\d\w*', '', tweet)
    
    # Removal of stopwords
    tweet = [word for word in tweet.split(' ') if word not in stop]
    
    #removal of greek characters
    tweet = [' '.join([unidecode.unidecode(word) for word in str(t).split()]) if t is not None else t for t in tweet]
    
    #lemmetizing of tweets
    tweet = [" ".join(lemmatizer.lemmatize(word) for word in t.split()) for t in tweet]
    
    tweet = " ".join(tweet)
    return tweet

#vect = CountVectorizer()

def process_tweet(tweet):
    # Process the tweet
    processed_tweet = preprocessor(tweet)

    
    vect.transform([processed_tweet])  # Pass a list of processed_tweet

    return processed_tweet

Overwriting HateSpeechDetection/vectorizer.py


In [23]:
#Changing the current directory to Hate Speech Detection
os.chdir('HateSpeechDetection')

In [24]:
#Importing the countVectorizer embedded into vectoriser and deserializing the classifier
from vectorizer import vect

clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

In [25]:
# Testing if the Deserialization of objects is working fine.
import numpy as np
label = {0:'Not Hate Speech', 1:'Hate Speech'}

example = ["America is good in tech"]
X = vect.transform(example)

prediction = clf.predict(X)
probability = clf.predict_proba(X)

print('Prediction: %s\nProbability: %.2f%%' % (label[prediction[0]], np.max(probability) * 100))

Prediction: Not Hate Speech
Probability: 93.65%


## Setting up SQLite connection

In [26]:
# Getting the currect directory
os.getcwd()

'C:\\Users\\Krishnapriya\\University of Munster\\Module 2\\HateSpeechDetection'

In [28]:
#Setting up SQLite3 connection

conn = sqlite3.connect('speech.sqlite')
c = conn.cursor()

c.execute('DROP TABLE IF EXISTS speech_db')
c.execute('CREATE TABLE speech_db (speech TEXT,type INTEGER, date TEXT)')

example1 = 'America is a rasict country'
c.execute("INSERT INTO speech_db (speech, type, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))

example2 = 'america is good with tech'
c.execute("INSERT INTO speech_db (speech, type, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))

conn.commit()
conn.close()

## End 