### Twitter Hate Speech Prediction using Decision Tree

In [26]:
# Pseudocode for the Hate Speech Prediction

    >Select only text and target columns as new df
    >Select X,y variables
    >Clean input X using regex
    >Label Encode target y variable
    >Transform X using Count Vectorizer
    >Train test Split
    >Model fitting using Decision Tree
    >Beofre Prediction, Provide user input as list i. e. cv.transform([input])

In [14]:
from nltk.util import pr
# Pretty print a sequence of data items
import pandas as pd
import numpy as np
import re
import nltk

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [16]:
stemmer=nltk.SnowballStemmer('english')

In [17]:
from nltk.corpus import stopwords

In [28]:
import string
stopword=set(stopwords.words('english'))
data = pd.read_csv("twitter.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [29]:
del data['Unnamed: 0']

In [30]:
# data.head()

In [31]:
data['class'].unique()

array([2, 1, 0], dtype=int64)

We have only 3 unique values in Class column

In [32]:
data['labels']=data['class'].map({0:"hate speech",1: "Offensive Language", 2: "No Hate and Offensive"})

In [33]:
data.head(2)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language


Now we will work over "tweet: and "labels" columns only for our detection model

In [34]:
data=data[['tweet','labels']]

In [35]:
data.head(2)

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language


In [36]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
#     text = [stemmer.stem(word) for word in text.split(' ')]
#     text=" ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)

In [37]:
data.head()

Unnamed: 0,tweet,labels
0,rt mayasolovely woman shouldnt complain clean...,No Hate and Offensive
1,rt boy dats coldtyga dwn bad cuffin dat hoe ...,Offensive Language
2,rt urkindofbrand dawg rt ever fuck bitch sta...,Offensive Language
3,rt cganderson vivabased look like tranny,Offensive Language
4,rt shenikaroberts shit hear might true might ...,Offensive Language


In [38]:
X=np.array(data['tweet'])
y=np.array(data['labels'])

In [39]:
cv=CountVectorizer()

In [40]:
X=cv.fit_transform(X)

In [41]:
X

<24783x29831 sparse matrix of type '<class 'numpy.int64'>'
	with 199586 stored elements in Compressed Sparse Row format>

In [42]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=41)

In [43]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()

In [44]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

>Now let’s test this machine learning model to see if it detects hate speech or not:

In [45]:
sample = "Let's unite and kill all the people who are protesting against the government"
sample_data=cv.transform([sample]).toarray()

In [46]:
model.predict(sample_data)

array(['hate speech'], dtype=object)

In [27]:
# My implementation

In [1]:
# loading basic library
import re
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# load dataset

In [3]:
data = pd.read_csv("twitter.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [8]:
data['class'].unique()

array([2, 1, 0], dtype=int64)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [10]:
# Choose  required columns as new df
df=data[['tweet','class']]
df.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [12]:
# Map target varaibles
df['class']=df['class'].map({0:'offensive',1:'no offensive',2:'neither'})
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class']=df['class'].map({0:'offensive',1:'no offensive',2:'neither'})


Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,neither
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,no offensive
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,no offensive
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,no offensive
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,no offensive


In [13]:
# text cleaning

df['tweet']=data['tweet'].str.lower()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet']=data['tweet'].str.lower()


In [None]:
x=df.drop('class',axis=1)
y=df.class

In [32]:
# train test split

In [33]:
# model fitting

In [None]:
# model prediction