In [1]:
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "whitegrid" plot style for the notebook.
sns.set_style("whitegrid")

In [2]:
# Read the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where this exact directory exists -- consider a path
# relative to a configurable data directory.
train_csv_path = r'C:\Users\Checkout\Downloads\cyberbullying-classification-master\data\train_data.csv'
df_train = pd.read_csv(train_csv_path)

# Quick look at the data: dimensions, the first rows, and five randomly
# drawn rows (the sample is unseeded, so it differs on every run).
display(df_train.shape, df_train.head(), df_train.sample(5))


(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying


Unnamed: 0,clean_tweet,cyberbullying_type
787,carrie dir brian palma unpopular friendless mi...,age
21207,dwayne wade cousin get killed blame trump inst...,ethnicity
3922,publicly apologize people bullied high school ...,age
6345,correct woman fighting right treated thinking ...,religion
28562,dumb nigger cincy fuck look face whitepower,ethnicity


In [14]:
import pickle

# Load the pickled object stored in "cv.pickel" (presumably a CountVectorizer,
# given how cv1 is used further down -- confirm against the cell that wrote it).
# The context manager guarantees the file handle is closed, even if
# unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# NOTE(review): the training cell calls cv1.fit_transform(...), which refits
# the loaded object, so any vocabulary stored in the pickle is replaced there.
with open("cv.pickel", "rb") as cv_file:
    cv1 = pickle.load(cv_file)


In [15]:
# Split the training frame into its text column and its label column.
X_train = df_train['clean_tweet']
Y_train = df_train['cyberbullying_type']

print(X_train.shape, Y_train.shape)

# The vectorizer is fitted on the training text here; validation text should
# later be passed through transform() only, never fit_transform().
# Every entry is cast to a unicode string before vectorizing, and the sparse
# result is converted to a dense numpy array.
train_text = X_train.values.astype('U')
X_train_cv = cv1.fit_transform(train_text).toarray()

display(X_train_cv.shape)

print(type(X_train_cv))

# Encode the string labels as integers and keep them in a one-column frame.
le = LabelEncoder()
encoded_train_labels = le.fit_transform(Y_train)
Y_train_le = pd.DataFrame({'encoded_cyberbullying_type': encoded_train_labels})

# Lookup table pairing each original label with its integer code.
map_labels = pd.DataFrame(
    {'labels': le.classes_, 'encoded_labels': le.transform(le.classes_)},
    columns=['labels', 'encoded_labels'],
)



(28614,) (28614,)


(28614, 100)

<class 'numpy.ndarray'>


In [16]:
# Read the data used for validation.
# NOTE(review): this path points at train_data.csv -- the same file the
# training cell loads -- so every "validation" metric computed from df_valid
# is really measured on the training data. Point this at the held-out
# validation file (its name is not visible in this notebook) before trusting
# the reported scores.
# NOTE(review): absolute, machine-specific path; consider a path relative to
# a configurable data directory.
valid_csv_path = r'C:\Users\Checkout\Downloads\cyberbullying-classification-master\data\train_data.csv'
df_valid = pd.read_csv(valid_csv_path)

# Show the dimensions and the number of rows per class.
display(df_valid.shape, df_valid.cyberbullying_type.value_counts())


df_valid.head()


(28614, 2)

gender                 4795
not_cyberbullying      4792
religion               4791
age                    4786
ethnicity              4766
other_cyberbullying    4684
Name: cyberbullying_type, dtype: int64

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying


In [17]:
# Split the validation frame into its text column and its label column.
X_valid = df_valid.clean_tweet
Y_valid = df_valid.cyberbullying_type

print(X_valid.shape, Y_valid.shape)

# Encode validation text with the vocabulary already learned from the
# training data: transform() only. Calling fit_transform() here would refit
# the vectorizer on validation text, so the feature columns could stop
# matching the ones the models were trained on (and it leaks validation
# information into preprocessing).
# Entries are cast to unicode strings, mirroring the training cell.
X_valid_cv = cv1.transform(df_valid['clean_tweet'].values.astype('U'))

display(X_valid_cv.shape)
X_valid_cv = X_valid_cv.toarray()

# Reuse the label mapping fitted on the training labels. transform() raises
# ValueError if the validation set contains a label never seen in training,
# instead of silently building a different label -> integer mapping.
Y_valid_le = pd.DataFrame(le.transform(Y_valid), columns=['encoded_cyberbullying_type'])

(28614,) (28614,)


(28614, 100)

In [18]:
# Fixed seed so the stochastic estimators give the same scores on every run.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label shown in the result tables.
# NOTE(review): the 'XGBoost' entry is scikit-learn's GradientBoostingClassifier,
# not the xgboost library. The key is kept unchanged in case other cells look
# it up by name; consider renaming it to 'Gradient Boosting'.
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=RANDOM_STATE),
    'Kernel SVM': SVC(),
    'XGBoost': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# One entry per model, appended in the iteration order of `models`.
results = {
    'time_to_train': [],
    'accuracy': [],
    'f1': []
}

# Flatten the one-column label frames once, outside the loop.
y_train = Y_train_le.values.ravel()
y_valid = Y_valid_le.values.ravel()

for name, model in models.items():
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; only fit() is timed, not predict().
    start_time = time.perf_counter()

    model.fit(X_train_cv, y_train)

    end_time = time.perf_counter()

    Y_pred = model.predict(X_valid_cv)

    results['time_to_train'].append(end_time - start_time)
    results['accuracy'].append(accuracy_score(y_valid, Y_pred))
    # average=None returns one F1 score per class rather than a single average.
    results['f1'].append(f1_score(y_valid, Y_pred, average=None))

display(pd.DataFrame(results['time_to_train'], index=models.keys(), columns=['Time (seconds)']))

display(pd.DataFrame(results['accuracy'], index=models.keys(), columns=['Accuracy']))

# Transpose so rows are classes and columns are models.
display(pd.DataFrame(np.asarray(results['f1']).T, columns=models.keys(), index=le.classes_))




Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.077147
Linear SVM,2.56819
Kernel SVM,80.263761
XGBoost,123.879935


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.730342
Linear SVM,0.803802
Kernel SVM,0.822639
XGBoost,0.813832


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
age,0.819663,0.969253,0.984817,0.98232
ethnicity,0.922629,0.967206,0.987735,0.986889
gender,0.806074,0.83113,0.844269,0.837118
not_cyberbullying,0.234223,0.451789,0.508186,0.475454
other_cyberbullying,0.574015,0.635414,0.658829,0.646913
religion,0.904752,0.940231,0.944153,0.943283
