In [1]:
import sys
import warnings

# put the project's scripts directory first on the module search path
sys.path.insert(0, '../../scripts/')

# keep FutureWarning messages out of the cell outputs
warnings.simplefilter('ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# models
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus

In [3]:
# read the train and validation splits into dataframes
df_train = pd.read_csv('../../data/train_data.csv')
df_valid = pd.read_csv('../../data/valid_data.csv')

# size of each split: (rows, columns)
display(df_train.shape)
display(df_valid.shape)

# preview the first 5 rows of train, then of validation
display(df_train.head(), df_valid.head())

(28614, 2)

(9539, 2)

Unnamed: 0,tweet_text,cyberbullying_type
0,i hate ppl from high school y’all used to bull...,age
1,Kat and Andre are such assholes OMG #mkr,not_cyberbullying
2,"if she is new,she will not have access to go t...",age
3,Fuck David duke racist who thinks America belo...,ethnicity
4,I May not say it a lot but I hate apologetic A...,other_cyberbullying


Unnamed: 0,tweet_text,cyberbullying_type
0,"Fucking Slut ""@CallMeKatiee__ DUMB BITCH. ""@__...",ethnicity
1,@TheRealJacquet it's not a fucking excuse it's...,ethnicity
2,@iamyaokhari Men HATE getting the last word. T...,gender
3,@sibbysoyabean I have several strands of pearl...,not_cyberbullying
4,@AshForSyria @TheMoeDee @RazanSpeaks Or are yo...,not_cyberbullying


In [4]:
# independent feature: the tweet text of each split, passed through
# preprocess_corpus (imported from the local text_utils script)
x_train = preprocess_corpus(df_train['tweet_text'])
x_valid = preprocess_corpus(df_valid['tweet_text'])

# dependent feature: the cyberbullying_type label column of each split
y_train = df_train['cyberbullying_type']
y_valid = df_valid['cyberbullying_type']

# shapes of inputs and targets, train first and validation second
display(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

(28614,)

(28614,)

(9539,)

(9539,)

In [5]:
# bag of words transformation
# CountVectorizer configured with a minimum document frequency of 20
bow_vectorizer = CountVectorizer(min_df=20)

# fit the vocabulary on the train split only; validation reuses that vocabulary
train_counts = bow_vectorizer.fit_transform(x_train)
valid_counts = bow_vectorizer.transform(x_valid)
bow_terms = bow_vectorizer.get_feature_names_out()

# dense document-term matrices with one named column per vocabulary term
x_train_bow = pd.DataFrame(train_counts.toarray(), columns=bow_terms)
x_valid_bow = pd.DataFrame(valid_counts.toarray(), columns=bow_terms)

# shape of each document matrix: (rows, columns)
display(x_train_bow.shape, x_valid_bow.shape)

# preview the first 5 transformed rows of train, then of validation
display(x_train_bow.head(), x_valid_bow.head())

(28614, 2235)

(9539, 2235)

Unnamed: 0,ability,able,abortion,absolute,absolutely,abt,abuse,abused,abusive,accept,...,yesyouresexist,yet,yoho,young,younger,youre,youtube,yup,zappe,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ability,able,abortion,absolute,absolutely,abt,abuse,abused,abusive,accept,...,yesyouresexist,yet,yoho,young,younger,youre,youtube,yup,zappe,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# encode class labels with a LabelEncoder
encoder = LabelEncoder()

# flat arrays of the raw label values
train_labels = y_train.values.ravel()
valid_labels = y_valid.values.ravel()

# fit the encoding on the train labels, then apply the same encoding to validation
y_train_le = encoder.fit_transform(train_labels)
y_valid_le = encoder.transform(valid_labels)

display(y_train_le.shape, y_valid_le.shape)

(28614,)

(9539,)

In [7]:
# train and evaluate 4 models: Naive Bayes, linear SVM, kernel SVM and XGBoost

# fixed seed, passed as random_state to LinearSVC and XGBClassifier below,
# so that a fresh "Restart & Run All" reproduces the same fitted models
SEED = 42

# instantiate all models
# the insertion order of this dict is the order in which entries are appended
# to every list in `results`, so both must be iterated in the same order
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=SEED),
    'Kernel SVM': SVC(),
    'XGBoost': XGBClassifier(random_state=SEED)
}

# initializing a result map: one list per metric, one entry per model
# (re-created on every run of this cell, so re-running does not duplicate rows)
results = {
    'time_to_train': [],
    'time_to_test': [],
    'accuracy': [],
    'f1': []
}

# train and test each model
for model_name, model in models.items():
    # training start
    print('training', model_name + '...')
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted
    start_time = time.perf_counter()

    model.fit(x_train_bow, y_train_le)

    # training end
    end_time = time.perf_counter()
    time_to_train = end_time - start_time
    print('training completed:', '{:.2f}'.format(time_to_train), 'seconds')

    # testing start
    print('testing...')
    start_time = time.perf_counter()

    # make predictions on validation set
    y_pred = model.predict(x_valid_bow)

    # testing end
    end_time = time.perf_counter()
    time_to_test = end_time - start_time
    print('testing completed:', '{:.2f}'.format(time_to_test), 'seconds\n')

    # add results to result map
    results['time_to_train'].append(time_to_train)
    results['time_to_test'].append(time_to_test)
    results['accuracy'].append(accuracy_score(y_valid_le, y_pred))
    # average=None keeps one f1 value per class instead of a single aggregate
    results['f1'].append(f1_score(y_valid_le, y_pred, average=None))

training Gaussian Naive Bayes...
training completed: 1.24 seconds
testing...
testing completed: 1.81 seconds

training Linear SVM...




training completed: 4.81 seconds
testing...
testing completed: 0.13 seconds

training Kernel SVM...
training completed: 843.33 seconds
testing...
testing completed: 417.61 seconds

training XGBoost...
training completed: 340.07 seconds
testing...
testing completed: 0.43 seconds



In [8]:
# display the results, labelled by model name in the order the models were defined
model_names = list(models)

# time to train & test
timings = pd.DataFrame(
    {'Train': results['time_to_train'], 'Test': results['time_to_test']},
    index=model_names,
)
display(timings)

# accuracy
accuracies = pd.DataFrame({'Accuracy': results['accuracy']}, index=model_names)
display(accuracies)

# f1 score: transposed so that rows are classes and columns are models
f1_by_class = np.asarray(results['f1']).T
display(pd.DataFrame(f1_by_class, index=encoder.classes_, columns=model_names))

Unnamed: 0,Train,Test
Gaussian Naive Bayes,1.237995,1.805003
Linear SVM,4.805,0.133999
Kernel SVM,843.328363,417.606114
XGBoost,340.06525,0.432995


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.591571
Linear SVM,0.821994
Kernel SVM,0.83688
XGBoost,0.841807


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
age,0.647507,0.961766,0.978858,0.982312
ethnicity,0.789411,0.981441,0.985818,0.991485
gender,0.500418,0.850957,0.865959,0.869093
not_cyberbullying,0.252182,0.553308,0.533919,0.543743
other_cyberbullying,0.505516,0.619641,0.673936,0.673346
religion,0.788925,0.943752,0.951331,0.962351
