In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import nltk
import re
import string

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

stopword = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data = pd.read_csv(r"C:\Users\kumar\Downloads\labeled_data.csv\labeled_data.csv")

# To preview the data
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [4]:
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
data = data[["tweet", "labels"]]

print(data.head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No Hate and Offensive Speech  
1              Offensive Speech  
2              Offensive Speech  
3              Offensive Speech  
4              Offensive Speech  


In [5]:
def clean(text):
    text = str(text).lower()
    text = re.sub('[.?]', '', text)
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

data["tweet"] = data["tweet"].apply(clean)
x = np.array(data["tweet"])
y = np.array(data["labels"])

In [6]:
cv = CountVectorizer()
X = cv.fit_transform(x)

# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Model building
model = DecisionTreeClassifier()

# Training the model
model.fit(X_train, y_train)

# Testing the model
y_pred = model.predict(X_test)
print(y_pred)

['Offensive Speech' 'Offensive Speech' 'Offensive Speech' ...
 'Offensive Speech' 'Offensive Speech' 'Offensive Speech']


In [7]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))



0.8795696295390635


In [8]:
# Predicting the outcome
inp = "the boy was there when the sun goes if it is usede"
#inp=" hes a beaner smh you can tell hes a mexican"
#inp=" you're fucking black, blacklisted hoe"
inp = cv.transform(["the boy was there when the sun goes if it is usede"]).toarray()
print(inp)
print(model.predict(inp))

[[0 0 0 ... 0 0 0]]
['No Hate and Offensive Speech']


In [9]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
percentage_accuracy = accuracy * 100
print(f"Accuracy: {percentage_accuracy:.2f}%")


Accuracy: 87.96%


In [10]:
import joblib
joblib.dump(model,r'C:\Users\kumar\Path_to_store_transcript_file\model.pkl')
joblib.dump(cv,r'C:\Users\kumar\Path_to_store_transcript_file\vectorizer.pkl')

['C:\\Users\\kumar\\Path_to_store_transcript_file\\vectorizer.pkl']

['C:\\Users\\kumar\\Path_to_store_transcript_file\\vectorizer.pkl']