### Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import os
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Import Dataset

In [2]:
# Location of the labelled chat dataset, read relative to the notebook's
# working directory.
DATA_PATH = "test_data.csv"

# Read the CSV into a DataFrame and print its (truncated) text representation
# as a quick sanity check of what was loaded.
df = pd.read_csv(DATA_PATH)
print(df)

                                                  Chat              Label
0                                  who the fucks emong      cyberbullying
1                             BAN THIS FUCKING R-TARD!      cyberbullying
2                    USE YOUR FUCKING FLASHBANG PLEASE      cyberbullying
3                     DOC JUST FUCKIN DO IT OKAY 4Head      cyberbullying
4                  will you shut the fuck up for a bit      cyberbullying
..                                                 ...                ...
995                        You are gorgeous I love you  non-cyberbullying
996            And you can show your dog your <3 <3 <3  non-cyberbullying
997                                   u look fantastic  non-cyberbullying
998  you look like scarlet Johnson i bet you here t...  non-cyberbullying
999      you're looking so great, love your streams <3  non-cyberbullying

[1000 rows x 2 columns]


### Data Preprocessing

In [3]:
def clean_text(text):
    """Normalise one chat message for the bag-of-words vectoriser.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lower-cased, and runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    text : object
        The raw message; it is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned, space-separated message (may be empty).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text))
    # split() + join() squeezes the whitespace left behind by the substitution
    return ' '.join(letters_only.lower().split())


# Iterate over the column values instead of df['Chat'][i]: the positional loop
# only works when the frame has a default 0..n-1 index, and a missing chat
# (NaN) would make re.sub raise a TypeError. Missing chats become ''.
corpus = [clean_text(chat) for chat in df['Chat'].fillna('')]

# NOTE(review): the vocabulary and IDF weights below are fitted on the whole
# corpus, i.e. before the train/test split, so they are learned from rows that
# later end up in the test set. Consider fitting on the training rows only.

#bag of words transformer (English stop words are dropped)
bow_transformer = CountVectorizer(stop_words='english')
bow_transformer = bow_transformer.fit(corpus)

print('Length of the Vocabulary: ',len(bow_transformer.vocabulary_))
# raw term counts -> TF-IDF weighted feature matrix
messages_bow = bow_transformer.transform(corpus)
tfidf_transformer = TfidfTransformer().fit(messages_bow)
X = tfidf_transformer.transform(messages_bow)

# target labels as a plain Python list, in row order
y = df["Label"].tolist()

Length of the Vocabulary:  1068


In [4]:
# Disabled debug aid: uncomment to inspect the TF-IDF feature matrix X.
#print(X)

In [5]:
# Disabled debug aid: tabulate the IDF weight held by tfidf_transformer for each vocabulary term.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out() instead - confirm the installed version.
#df_idf = pd.DataFrame(tfidf_transformer.idf_, index=bow_transformer.get_feature_names(),columns=["idf_weights"])
 
# The frame is printed unsorted; chain .sort_values("idf_weights") first if ascending order is wanted.
#print(df_idf)

### Model selection - setting train and test size

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
# NOTE(review): X was vectorised on the full corpus before this split, so the
# vocabulary / IDF weights have already seen the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# with_mean=False turns off mean-centring, so only the per-feature scale is
# adjusted. The scaler is fitted on the training split only and then applied
# to both splits.
sc = StandardScaler(with_mean=False)
sc.fit(X_train)

x_train_std = sc.transform(X_train)
x_test_std = sc.transform(X_test)

### Naive Bayes

In [7]:
from sklearn.naive_bayes  import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Multinomial Naive Bayes is fitted on the unscaled TF-IDF features.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print('Naive Bayes Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

Naive Bayes Results
Accuracy: 0.97 

Confusion Matrix
 [[158   7]
 [  3 162]]

F1 Score:  0.9696925168068771
Precision Score:  0.9699731706420669
Recall Score:  0.9696969696969697


### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Decision tree fitted on the unscaled TF-IDF features. random_state is fixed
# so the fitted tree, and therefore the scores below, are reproducible.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

print('Decision Tree Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

Decision Tree Results
Accuracy: 0.98 

Confusion Matrix
 [[157   8]
 [  0 165]]

F1 Score:  0.9757433202249255
Precision Score:  0.976878612716763
Recall Score:  0.9757575757575758


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Forest of 10 entropy-criterion trees fitted on the unscaled TF-IDF features;
# n_jobs=2 builds the trees on two workers, random_state fixes the result.
forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

print('Random Forest Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

Random Forest Results
Accuracy: 0.99 

Confusion Matrix
 [[163   2]
 [  0 165]]

F1 Score:  0.9939391713194241
Precision Score:  0.9940119760479043
Recall Score:  0.9939393939393939


### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Logistic regression is fitted on the standardised features. C is the inverse
# regularisation strength, so C=1000.0 applies only weak regularisation.
lr = LogisticRegression(C=1000.0, random_state=0, solver='lbfgs')
lr.fit(x_train_std, y_train)
y_pred = lr.predict(x_test_std)

print('Logistic Regression Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

Logistic Regression Results
Accuracy: 0.99 

Confusion Matrix
 [[161   4]
 [  0 165]]

F1 Score:  0.9878770067227509
Precision Score:  0.9881656804733727
Recall Score:  0.9878787878787878


### Support Vector Machines

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# RBF-kernel support vector classifier fitted on the standardised features.
svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10)
svm.fit(x_train_std, y_train)
y_pred = svm.predict(x_test_std)

print('Support Vector Machine Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

Support Vector Machine Results
Accuracy: 0.98 

Confusion Matrix
 [[165   0]
 [  5 160]]

F1 Score:  0.9848450057405281
Precision Score:  0.9852941176470589
Recall Score:  0.9848484848484849


### K-Nearest Neighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# 5-nearest-neighbours classifier (Minkowski metric) fitted on the
# standardised features.
knn = KNeighborsClassifier(n_neighbors =5, metric = 'minkowski')
knn.fit(x_train_std, y_train)
y_pred = knn.predict(x_test_std)

print('K-Nearest Neighbors Results')
print('Accuracy: %.2f ' %accuracy_score(y_test,y_pred))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so any later call made
# without re-importing would fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", cm)
# Macro averaging: each class contributes equally to the reported score.
print('\nF1 Score: ', f1_score(y_test, y_pred, average="macro"))
print('Precision Score: ', precision_score(y_test, y_pred, average="macro"))
print("Recall Score: ",recall_score(y_test, y_pred, average="macro"))

K-Nearest Neighbors Results
Accuracy: 0.74 

Confusion Matrix
 [[ 82  83]
 [  3 162]]

F1 Score:  0.7231219512195122
Precision Score:  0.8129651860744298
Recall Score:  0.7393939393939394


### Deployment

In [13]:
test_set = ['You idiot', "I like you alot"]

# Apply the same cleaning that was used to build the training corpus:
# letters only, lower-cased, whitespace collapsed.
cleaned_set = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', message).lower().split())
    for message in test_set
]

# The Naive Bayes classifier was trained on TF-IDF weighted features, so new
# messages must go through BOTH fitted transformers. Passing the raw counts
# from bow_transformer alone feeds the model features on a different scale
# from the ones it was trained on.
new_test = tfidf_transformer.transform(bow_transformer.transform(cleaned_set))

classifier.predict(new_test)

array(['cyberbullying', 'non-cyberbullying'], dtype='<U17')