### Import Libraries

In [65]:
import numpy as np 
import pandas as pd 
import os
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Import Dataset

In [66]:
df = pd.read_csv("test_data.csv")
print(df)

                                                   Chat              Label
0                            is fatty streaming tonight      cyberbullying
1                               is still fat and boring      cyberbullying
2        lets be real if a girl watch twitch she is fat      cyberbullying
3                              good morning fatty fucks      cyberbullying
4     my lab manager is a piece of shit tried to tak...      cyberbullying
...                                                 ...                ...
1195            drunk is my favorate thing to watch now  non-cyberbullying
1196             i litterly just got back what happened  non-cyberbullying
1197                       Rather funny than gnomes LUL  non-cyberbullying
1198                                  yeah shes smashed  non-cyberbullying
1199  Haha I just cant even right now this is soo mu...  non-cyberbullying

[1200 rows x 2 columns]


### Data Prepocessing

In [67]:
corpus = []

for i in range(0, len(df)):
    #only alphabet allowed
    review = re.sub('[^a-zA-Z]', ' ', df['Chat'][i])
    #set to lowercase
    review = review.lower()
    #tokenize
    review = review.split()
    review = ' '.join(review)
    corpus.append(review)

#bag of words transformer
bow_transformer = CountVectorizer(stop_words='english')
bow_transformer = bow_transformer.fit(corpus)

print('Length of the Vocabulary: ',len(bow_transformer.vocabulary_))
messages_bow = bow_transformer.transform(corpus)
tfidf_transformer = TfidfTransformer().fit(messages_bow)
X = tfidf_transformer.transform(messages_bow)

y = []
for row in df["Label"]:
    y.append(row)

Length of the Vocabulary:  1802


In [68]:
print(X)

  (0, 1598)	0.6238220562606246
  (0, 1486)	0.5782484390789163
  (0, 534)	0.5258086960345586
  (1, 529)	0.5357315061116403
  (1, 168)	0.8443883901140242
  (2, 1708)	0.40553282880344904
  (2, 1638)	0.43251899188951504
  (2, 1227)	0.40553282880344904
  (2, 890)	0.450962166007829
  (2, 640)	0.44116271779754834
  (2, 529)	0.29329533312896516
  (3, 1025)	0.5535288408980124
  (3, 657)	0.36588208558684243
  (3, 605)	0.5535288408980124
  (3, 534)	0.5033308494765925
  (4, 1762)	0.20370758921586832
  (4, 1614)	0.2399047183787652
  (4, 1562)	0.2647069281573192
  (4, 1454)	0.2647069281573192
  (4, 1451)	0.2647069281573192
  (4, 1360)	0.2647069281573192
  (4, 1350)	0.15174306000294704
  (4, 1313)	0.2647069281573192
  (4, 1232)	0.18231577067865057
  (4, 1136)	0.2319201902357586
  :	:
  (1193, 947)	0.449231900466714
  (1193, 425)	0.5275808298728863
  (1194, 1387)	0.5878910274430733
  (1194, 615)	0.5556692178840351
  (1194, 199)	0.5878910274430733
  (1195, 1708)	0.4316655374271083
  (1195, 1558)	0.4258

In [69]:
#print idf values
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=bow_transformer.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
print(df_idf)

          idf_weights
ability      7.397763
able         7.397763
absolute     6.992298
abusers      7.397763
account      6.481472
...               ...
yt           7.397763
yum          7.397763
yummy        7.397763
zenbob       7.397763
zone         7.397763

[1802 rows x 1 columns]


### Model selection - setting train and test size

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler(with_mean=False)

#x_train_std = sc.fit_transform(X_train)
#x_test_std = sc.transform(X_test)

### Naive Bayes

In [71]:
from sklearn.naive_bayes  import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print('Naive Bayes Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

Naive Bayes Results:
                   precision    recall  f1-score   support

    cyberbullying       0.82      0.88      0.85       152
non-cyberbullying       0.87      0.80      0.84       148

         accuracy                           0.84       300
        macro avg       0.85      0.84      0.84       300
     weighted avg       0.85      0.84      0.84       300


Confusion Matrix
 [[134  18]
 [ 29 119]]


### Decision Tree

In [72]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print('Decision Tree Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

Decision Tree Results:
                   precision    recall  f1-score   support

    cyberbullying       0.85      0.76      0.80       152
non-cyberbullying       0.77      0.86      0.81       148

         accuracy                           0.81       300
        macro avg       0.81      0.81      0.81       300
     weighted avg       0.81      0.81      0.81       300


Confusion Matrix
 [[115  37]
 [ 21 127]]


### Random Forest

In [73]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print('Random Forest Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

Random Forest Results:
                   precision    recall  f1-score   support

    cyberbullying       0.87      0.68      0.76       152
non-cyberbullying       0.73      0.89      0.80       148

         accuracy                           0.79       300
        macro avg       0.80      0.79      0.78       300
     weighted avg       0.80      0.79      0.78       300


Confusion Matrix
 [[104  48]
 [ 16 132]]




### Logistic Regression

In [74]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train_std, y_train)
y_pred = lr.predict(x_test_std)

from sklearn.metrics import classification_report, confusion_matrix
print('Logistic Regression Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

Logistic Regression Results:
                   precision    recall  f1-score   support

    cyberbullying       0.78      0.74      0.76       152
non-cyberbullying       0.75      0.79      0.77       148

         accuracy                           0.76       300
        macro avg       0.76      0.76      0.76       300
     weighted avg       0.76      0.76      0.76       300


Confusion Matrix
 [[112  40]
 [ 31 117]]




### Support Vector Machines

In [75]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train_std, y_train)
y_pred = svm.predict(x_test_std)

from sklearn.metrics import classification_report, confusion_matrix
print('Support Vector Machine Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

Support Vector Machine Results:




                   precision    recall  f1-score   support

    cyberbullying       0.64      0.78      0.71       152
non-cyberbullying       0.71      0.55      0.62       148

         accuracy                           0.67       300
        macro avg       0.68      0.67      0.66       300
     weighted avg       0.68      0.67      0.67       300


Confusion Matrix
 [[119  33]
 [ 66  82]]


### K-Nearest Neighbors

In [76]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train_std, y_train)
y_pred = knn.predict(x_test_std)

from sklearn.metrics import classification_report, confusion_matrix
print('K-Nearest Neighbor Results:')
print(classification_report(y_test, y_pred))
confusion_matrix = confusion_matrix(y_test,y_pred)
print("\nConfusion Matrix\n", confusion_matrix)

K-Nearest Neighbor Results:
                   precision    recall  f1-score   support

    cyberbullying       1.00      0.15      0.26       152
non-cyberbullying       0.53      1.00      0.70       148

         accuracy                           0.57       300
        macro avg       0.77      0.58      0.48       300
     weighted avg       0.77      0.57      0.48       300


Confusion Matrix
 [[ 23 129]
 [  0 148]]


### Deployment

### Naive Bayes testing

In [77]:
test_set = ['i wish you were dead', 'i wish you were my friend']
new_test = bow_transformer.transform(test_set)

classifier.predict(new_test)

array(['cyberbullying', 'non-cyberbullying'], dtype='<U17')

### Decision Tree testing

In [78]:
test_set = ['Fuck you', 'Your amazing']
new_test = bow_transformer.transform(test_set)

tree.predict(new_test)

array(['cyberbullying', 'non-cyberbullying'], dtype='<U17')

### Random Forest testing

In [79]:
test_set = ['Fuck you', 'Your amazing']
new_test = bow_transformer.transform(test_set)

forest.predict(new_test)

array(['cyberbullying', 'non-cyberbullying'], dtype='<U17')

### Logistic Regression testing

In [80]:
test_set = ['Fuck you', 'Your amazing']
new_test = bow_transformer.transform(test_set)

lr.predict(new_test)

array(['cyberbullying', 'non-cyberbullying'], dtype='<U17')

### Support Vector Machine testing

In [81]:
test_set = ['Fuck you', 'Your amazing']
new_test = bow_transformer.transform(test_set)

svm.predict(new_test)

array(['cyberbullying', 'cyberbullying'], dtype='<U17')

### K nearest neighbor testing

In [82]:
test_set = ['Fuck you', 'Your amazing']
new_test = bow_transformer.transform(test_set)

knn.predict(new_test)

array(['non-cyberbullying', 'non-cyberbullying'], dtype='<U17')