Import Packages

In [1]:
import pandas as pd
import numpy as np
import nltk

import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from nltk.util import pr
from nltk.corpus import stopwords



Import the Dataset

In [2]:
# Read the labelled tweets from the CSV file in the working directory and
# preview the first five rows.
data = pd.read_csv("twitter.csv")
print(data.head(5))

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [3]:
# Snowball stemmer for English; clean() uses it to stem each word.
stemmer = nltk.stem.SnowballStemmer("english")

Map the Class Codes to Readable Labels

In [4]:
# Translate the numeric "class" code into a readable label. Codes that are
# not in the mapping become NaN.
# Fix: the label for class 0 was misspelled as "Hate Speeh"; the classifier
# is trained on these strings and prints them as predictions.
data["labels"] = data["class"].map(
    {
        0: "Hate Speech",
        1: "Offensive Language",
        2: "Normal",
    }
)
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet              labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...              Normal  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  Offensive Language  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  Offensive Language  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  Offensive Language  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  Offensive Language  


In [5]:
# Keep only the tweet text and its readable label; the remaining columns
# are dropped from `data`.
selected_columns = ["tweet", "labels"]
data = data[selected_columns]
print(data.head(5))

                                               tweet              labels
0  !!! RT @mayasolovely: As a woman you shouldn't...              Normal
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  Offensive Language


Clean the Dataset

In [6]:
# English stop words looked up by clean(). The name was previously never
# defined, so any call to clean() raised NameError. The corpus is downloaded
# on demand when it is not installed locally.
try:
    stopword = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopword = set(stopwords.words("english"))


def clean(text):
    """Normalise one tweet into lower-cased, stemmed words.

    Steps, in order: lower-case; remove text inside square brackets; remove
    URLs; remove HTML-like tags; remove ASCII punctuation; remove newlines;
    remove words containing a digit; drop English stop words; stem each
    remaining word.

    Parameters
    ----------
    text : object
        The tweet. It is converted with ``str()`` first, so non-string
        values such as NaN are accepted.

    Returns
    -------
    str
        The surviving stemmed words joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)                 # [bracketed] text, brackets included
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'<.*?>+', '', text)                  # HTML-like tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)                # words containing a digit
    # Stop-word filtering runs after punctuation removal, so the comparison
    # is made against the punctuation-free form of each word.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(words)


# Apply the cleaning to every tweet. This statement used to be indented
# under clean() after its `return`, so it never executed and the tweets were
# left raw. assign() builds a new frame instead of writing into a slice.
# NOTE(review): text given to the fitted vectorizer at prediction time
# should be passed through clean() as well so it matches this vocabulary.
data = data.assign(tweet=data["tweet"].apply(clean))

Train a Decision Tree Classifier on the Dataset

In [7]:
# Tweet text and its label as parallel arrays.
x = np.array(data["tweet"])
y = np.array(data["labels"])

# Bag-of-words counts. The vocabulary is fitted on every tweet before the
# split, so words from the held-out rows are part of the vocabulary too.
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Seed the tree as well as the split; without random_state the fitted tree
# can differ between runs, which makes the printed predictions
# irreproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)



Validate the Model on Sample Tweets

In [13]:
sample=" Keeks is a bitch she curves everyone  lol I walked into a conversation like this. Smh"
# Use a dedicated name for the vectorised sample: assigning it to `data`
# replaced the tweet DataFrame, so the earlier cells could not be re-run.
# The sparse matrix is passed directly; predict() accepts it without a
# dense copy.
sample_features = cv.transform([sample])

print(clf.predict(sample_features))

['Offensive Language']


In [14]:
sample=" I love everyone and believe in kindness to all."
# Use a dedicated name for the vectorised sample: assigning it to `data`
# replaced the tweet DataFrame, so the earlier cells could not be re-run.
# The sparse matrix is passed directly; predict() accepts it without a
# dense copy.
sample_features = cv.transform([sample])

print(clf.predict(sample_features))

['Normal']
