In [1]:
#Importing Required Libraries and Modules

from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
import nltk 
# Fetch the NLTK stop-word corpus; stopwords.words('english') further down reads it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Read the annotated tweet CSV into a DataFrame and preview the first rows.
# NOTE(review): absolute path (looks like a mounted Colab Drive folder) --
# the notebook only runs where this exact location exists.

data = pd.read_csv(
    "/content/drive/MyDrive/6th_SEM_MINI_PROJECT/twitter.csv"
)
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [4]:
# Translate the numeric "class" column into a human-readable "labels" column.

data["labels"] = data["class"].map(
    {
        0: "Hate Speech",
        1: "Offensive Language",
        2: "No Hate and Offensive",
    }
)
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet                 labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language  


In [5]:
# Keep only the tweet text and its readable label.
# .copy() makes an independent frame, so the later in-place write to
# data["tweet"] does not go through pandas' SettingWithCopy path.

data = data[["tweet", "labels"]].copy()
print(data.head())

                                               tweet                 labels
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language


In [6]:
#Importing RE Module and NLTK

import re
import nltk

In [7]:
#Stemming
# English Snowball stemmer; clean() applies it to every remaining token.

stemmer = nltk.SnowballStemmer("english")

In [8]:
#Collecting Stop-words for English Language
# Held as a set because clean() does a membership test for every token.
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))
print (stopword)

{'when', 'should', 'at', 'if', 'own', "won't", 'but', 'into', "it's", "you've", 'be', 'some', 'have', 'there', 'shouldn', 'haven', 'whom', "needn't", "shouldn't", 'does', 'they', 'their', 'couldn', 'for', 'hadn', 'ain', 'then', 'do', 'will', 'hers', 'he', 'other', 'o', 'ma', 'over', "that'll", 'mightn', 'yourself', 'our', 'm', 'it', "you'll", 'are', 'too', 'nor', 'of', 'hasn', 'doing', 'don', 'doesn', 'an', 'was', 'the', 'you', 'by', 'before', 'itself', 'from', 'up', 'down', 'same', 'only', "hasn't", 'themselves', 'is', 'than', 'any', 'against', 'further', "haven't", 'both', 'having', 'theirs', 'just', "aren't", 'them', 'being', 'a', 'how', 'd', 'i', 'with', "don't", 'aren', 'did', 'myself', 'can', 'and', 'more', 'mustn', 'him', 'ours', "couldn't", 'isn', 'were', 'about', 'on', 'until', 'didn', 'once', 'won', 'very', "hadn't", "didn't", "you'd", 'through', 'out', 'after', 'those', 'your', 'because', 'll', 'which', 'weren', 'few', 'had', 'wasn', 'so', 'each', "she's", 'such', 'his', 'th

In [9]:
#Cleaning Data and Focusing on main words

def clean(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; remove [bracketed] spans, URLs and <tags>;
    strip ASCII punctuation; delete newlines; delete any token containing a
    digit; drop English stop-words; stem what is left.

    Args:
        text: Raw tweet. Coerced with ``str()`` so non-string cells do not raise.

    Returns:
        str: The surviving stems joined by single spaces. Empty tokens are
        kept, so the result can contain leading or doubled spaces.

    Relies on the module-level ``stopword`` set and ``stemmer`` object.
    """
    text = str(text).lower()
    # Raw strings: '\[', '\S', '\w', '\d' are invalid escapes in a normal
    # literal and raise DeprecationWarning / SyntaxWarning on current Pythons.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # NOTE(review): punctuation is removed before stop-word filtering, so
    # apostrophe forms such as "shouldn't" become "shouldnt" and are kept.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted, not replaced by a space, which
    # glues together the words on either side of a line break.
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Tokens never contain a space, so filtering and stemming in one pass
    # yields the same string as the former join/split round-trip.
    kept = [word for word in text.split(' ') if word not in stopword]
    return " ".join(stemmer.stem(word) for word in kept)
# Replace every raw tweet with its cleaned form; the column is overwritten,
# so the raw text is no longer available in `data` after this line.
data["tweet"] = data["tweet"].apply(clean)
print(data.head())

                                               tweet                 labels
0   rt mayasolov woman shouldnt complain clean ho...  No Hate and Offensive
1   rt  boy dat coldtyga dwn bad cuffin dat hoe  ...     Offensive Language
2   rt urkindofbrand dawg rt  ever fuck bitch sta...     Offensive Language
3             rt cganderson vivabas look like tranni     Offensive Language
4   rt shenikarobert shit hear might true might f...     Offensive Language


In [10]:
# Pull the cleaned tweets and their labels out of the frame as separate NumPy arrays.

x = np.array(data["tweet"])
y = np.array(data["labels"])
# One call, same text as print(x); print("\n"); print(y):
# the two arrays separated by two blank lines.
print(x, "\n", y, sep="\n")

[' rt mayasolov woman shouldnt complain clean hous amp man alway take trash'
 ' rt  boy dat coldtyga dwn bad cuffin dat hoe  place'
 ' rt urkindofbrand dawg rt  ever fuck bitch start cri confus shit' ...
 'young buck wanna eat dat nigguh like aint fuckin dis'
 'youu got wild bitch tellin lie'
 'ruffl  ntac eileen dahlia  beauti color combin pink orang yellow amp white coll ']


['No Hate and Offensive' 'Offensive Language' 'Offensive Language' ...
 'Offensive Language' 'Offensive Language' 'No Hate and Offensive']


In [11]:
#Extracting features from text
# CountVectorizer turns each cleaned tweet into a sparse row of token counts.
# NOTE(review): the vocabulary is fitted on all tweets before the train/test
# split below, so test-set words are seen at fit time -- consider fitting on
# the training rows only.

cv = CountVectorizer()
X = cv.fit_transform(x)
print(X)

  (0, 18926)	1
  (0, 14029)	1
  (0, 24862)	1
  (0, 19960)	1
  (0, 4421)	1
  (0, 4115)	1
  (0, 9941)	1
  (0, 634)	1
  (0, 13754)	1
  (0, 545)	1
  (0, 21592)	1
  (0, 22923)	1
  (1, 18926)	1
  (1, 2639)	1
  (1, 5252)	2
  (1, 4308)	1
  (1, 6409)	1
  (1, 1400)	1
  (1, 4931)	1
  (1, 9670)	1
  (1, 17140)	1
  (2, 18926)	2
  (2, 23682)	1
  (2, 5296)	1
  (2, 6930)	1
  :	:
  (24780, 24191)	1
  (24780, 6470)	1
  (24780, 7960)	1
  (24780, 5813)	1
  (24780, 15617)	1
  (24780, 2997)	1
  (24781, 2128)	1
  (24781, 8675)	1
  (24781, 12909)	1
  (24781, 24713)	1
  (24781, 21857)	1
  (24781, 25488)	1
  (24782, 634)	1
  (24782, 24595)	1
  (24782, 25313)	1
  (24782, 1733)	1
  (24782, 4344)	1
  (24782, 16346)	1
  (24782, 17092)	1
  (24782, 4357)	1
  (24782, 18968)	1
  (24782, 15902)	1
  (24782, 6578)	1
  (24782, 5079)	1
  (24782, 4319)	1


In [12]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Show every piece followed by two blank lines -- the same text the four
# print(...)/print("\n") pairs produced.
print(X_train, X_test, y_train, y_test, sep="\n\n\n", end="\n\n\n")

  (0, 2128)	1
  (0, 12984)	1
  (0, 20429)	1
  (0, 149)	1
  (1, 2128)	1
  (1, 19866)	1
  (1, 6663)	1
  (1, 22353)	1
  (1, 1059)	1
  (1, 21471)	1
  (1, 24246)	1
  (1, 8454)	1
  (1, 20614)	1
  (2, 18926)	1
  (2, 25266)	1
  (2, 17788)	1
  (2, 23930)	1
  (2, 20142)	1
  (2, 20095)	1
  (2, 7445)	1
  (2, 17643)	1
  (2, 21129)	1
  (2, 1947)	1
  (2, 4852)	1
  (2, 10287)	1
  :	:
  (16600, 19729)	1
  (16600, 25396)	1
  (16600, 2672)	1
  (16600, 6614)	1
  (16601, 17907)	1
  (16601, 9188)	1
  (16601, 17400)	1
  (16601, 17352)	1
  (16601, 664)	2
  (16601, 2839)	1
  (16602, 18926)	1
  (16602, 8329)	1
  (16602, 7392)	1
  (16602, 17788)	1
  (16602, 4668)	1
  (16602, 830)	1
  (16602, 971)	1
  (16602, 16922)	1
  (16602, 22420)	1
  (16602, 11712)	1
  (16602, 3302)	1
  (16602, 1989)	1
  (16603, 2128)	1
  (16603, 19866)	1
  (16603, 19290)	1


  (0, 2128)	1
  (0, 8675)	1
  (0, 14552)	1
  (0, 3249)	1
  (0, 25362)	1
  (1, 18926)	1
  (1, 1400)	1
  (1, 7937)	1
  (1, 2128)	1
  (1, 15360)	1
  (1, 14735)	1
  (1, 860

In [13]:
# Train a decision tree on the training split and report its mean accuracy
# on the held-out split.
# random_state is pinned: the tree permutes features while searching for
# splits, so an unseeded model can produce a different tree and score per run.

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8781024575131434

In [14]:
#Detecting Hate Speech

def hate_speech_detection(tweet=None):
    """Classify one tweet with the trained model and print the predicted label.

    Args:
        tweet: Text to classify. When omitted (``None``) the text is read
            interactively with ``input()``, as the original version did.

    Prints the array returned by ``clf.predict`` for the tweet, or a single
    space when the tweet is empty or whitespace-only. Returns ``None``.

    Relies on the module-level ``clean``, ``cv`` and ``clf``.
    """
    if tweet is None:
        tweet = input("Enter any Tweet: ")
    if not tweet.strip():
        print(" ")
        return
    # The vectoriser and classifier were fitted on clean()-ed text, so the
    # query must go through the same preprocessing; otherwise punctuation,
    # casing and unstemmed words fail to match the learned vocabulary.
    features = cv.transform([clean(tweet)])
    # The sparse row is passed as-is; the model was fitted on sparse input.
    prediction = clf.predict(features)
    print(prediction)
# Run the detector once; this waits for a tweet typed at the input prompt.
hate_speech_detection()

Enter any Tweet: kill that dog
['Hate Speech']
