### Hate Speech Detection using Python

In [1]:
from nltk.util import pr
# Pretty print a sequence of data items
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer Converts a collection of text documents to a matrix of token counts
from sklearn.model_selection import train_test_split

In [3]:
import re
import nltk

In [4]:
stemmer=nltk.SnowballStemmer('english')

In [5]:
from nltk.corpus import stopwords

In [6]:
import string
stopword=set(stopwords.words('english'))
data = pd.read_csv("twitter.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
del data['Unnamed: 0']

In [8]:
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [9]:
data['class'].unique()

array([2, 1, 0], dtype=int64)

We have only 3 unique values in Class column

In [11]:
# data['labels'].unique()

In [12]:
data['labels']=data['class'].map({0:"hate speech",1: "Offensive Language", 2: "No Hate and Offensive"})

In [13]:
data.head(2)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language


Now we will work over "tweet: and "labels" columns only for our detection model

In [14]:
data=data[['tweet','labels']]

In [15]:
data.head(2)

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language


In [16]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet"] = data["tweet"].apply(clean)


In [17]:
data.head(2)

Unnamed: 0,tweet,labels
0,rt mayasolov woman shouldnt complain clean ho...,No Hate and Offensive
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,Offensive Language


In [18]:
X=np.array(data['tweet'])
y=np.array(data['labels'])

In [19]:
cv=CountVectorizer()

In [20]:
X=cv.fit_transform(X)

In [21]:
X

<24783x25540 sparse matrix of type '<class 'numpy.int64'>'
	with 198648 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=41)

In [23]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()

In [24]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

Now let’s test this machine learning model to see if it detects hate speech or not:

In [25]:
sample = "Let's unite and kill all the people who are protesting against the government"
sample=cv.transform([sample]).toarray()

In [26]:
model.predict(sample)

array(['hate speech'], dtype=object)

In [34]:
# 