In [1]:
# put the scripts directory (relative to the working directory) at the front
# of the import search path so modules stored there can be imported below
import sys
sys.path.insert(0, '../../scripts/')

# suppress FutureWarning messages for the rest of the session so they do not
# clutter the cell outputs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# numeric and tabular data packages
import numpy as np
import pandas as pd

# label encoding and TF-IDF text vectorization
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# classifiers compared in this notebook
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB

# timing (time) and evaluation metrics
# NOTE(review): confusion_matrix is not used in the cells visible here -
# confirm it is needed further down before keeping it
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plotting; apply seaborn's "whitegrid" style
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local helper module
# presumably resolved through the '../../scripts/' sys.path entry - verify
from text_utils import preprocess_corpus

In [3]:
# read the train and validation splits from disk into dataframes
df_train, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/train_data.csv', '../../data/valid_data.csv')
)

# (rows, columns) of each split, train first
display(*(split.shape for split in (df_train, df_valid)))

# preview: first 5 rows of the train split, then of the validation split
display(*(split.head() for split in (df_train, df_valid)))

(28614, 2)

(9539, 2)

Unnamed: 0,tweet_text,cyberbullying_type
0,i hate ppl from high school y’all used to bull...,age
1,Kat and Andre are such assholes OMG #mkr,not_cyberbullying
2,"if she is new,she will not have access to go t...",age
3,Fuck David duke racist who thinks America belo...,ethnicity
4,I May not say it a lot but I hate apologetic A...,other_cyberbullying


Unnamed: 0,tweet_text,cyberbullying_type
0,"Fucking Slut ""@CallMeKatiee__ DUMB BITCH. ""@__...",ethnicity
1,@TheRealJacquet it's not a fucking excuse it's...,ethnicity
2,@iamyaokhari Men HATE getting the last word. T...,gender
3,@sibbysoyabean I have several strands of pearl...,not_cyberbullying
4,@AshForSyria @TheMoeDee @RazanSpeaks Or are yo...,not_cyberbullying


In [4]:
# targets: the cyberbullying category column of each split
y_train = df_train['cyberbullying_type']
y_valid = df_valid['cyberbullying_type']

# inputs: the tweet text column passed through the preprocessing helper
x_train = preprocess_corpus(df_train['tweet_text'])
x_valid = preprocess_corpus(df_valid['tweet_text'])

# shapes of inputs and targets - train split first, then validation
for inputs, targets in ((x_train, y_train), (x_valid, y_valid)):
    display(inputs.shape, targets.shape)

(28614,)

(28614,)

(9539,)

(9539,)

In [5]:
# TF-IDF document-term matrices
# the vectorizer is created with a minimum document frequency of 10
tfidf_vectorizer = TfidfVectorizer(min_df=10)


def _dense_frame(doc_term_matrix):
    """Densify a document-term matrix and label its columns with the vocabulary terms."""
    return pd.DataFrame(
        doc_term_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )


# fit the vectorizer on the train split only; the validation split is
# transformed with the already-fitted vectorizer
x_train_tfidf = _dense_frame(tfidf_vectorizer.fit_transform(x_train))
x_valid_tfidf = _dense_frame(tfidf_vectorizer.transform(x_valid))

# (rows, columns) of each document-term matrix
display(*(frame.shape for frame in (x_train_tfidf, x_valid_tfidf)))

# preview: first 5 rows of the transformed train and validation sets
display(*(frame.head() for frame in (x_train_tfidf, x_valid_tfidf)))

(28614, 3554)

(9539, 3554)

Unnamed: 0,aalwuhaib,abandon,ability,able,abortion,absolute,absolutely,abt,abu,abuse,...,youth,youtube,ypg,yrs,yup,zaibatsunews,zappe,zero,zionist,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,aalwuhaib,abandon,ability,able,abortion,absolute,absolutely,abt,abu,abuse,...,youth,youtube,ypg,yrs,yup,zaibatsunews,zappe,zero,zionist,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# turn the class labels into integer codes
encoder = LabelEncoder()

# the label -> code mapping is fitted on the train labels and then applied,
# unchanged, to the validation labels
train_labels = y_train.values.ravel()
y_train_le = encoder.fit_transform(train_labels)

valid_labels = y_valid.values.ravel()
y_valid_le = encoder.transform(valid_labels)

# number of encoded labels in each split
display(*(codes.shape for codes in (y_train_le, y_valid_le)))

(28614,)

(9539,)

In [7]:
# train and evaluate 4 models: Naive Bayes, linear SVM, kernel SVM and XGBoost

# fixed seed handed to the estimators that accept one, so that a
# Restart & Run All reproduces the same scores
SEED = 42

# instantiate all models
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=SEED),
    'Kernel SVM': SVC(),
    'XGBoost': XGBClassifier(random_state=SEED)
}

# result map: every list holds one entry per model, in the insertion order of
# `models`, so the lists can later be indexed by models.keys()
results = {
    'time_to_train': [],
    'time_to_test': [],
    'accuracy': [],
    'f1': []
}

# fit each model on the train set, then score it on the validation set
for model_name, model in models.items():
    # training start
    print(f'training {model_name}...')
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed intervals; time.time() can jump if the system clock is adjusted
    start_time = time.perf_counter()

    model.fit(x_train_tfidf, y_train_le)

    # training end
    end_time = time.perf_counter()
    time_to_train = end_time - start_time
    print(f'training completed: {time_to_train:.2f} seconds')

    # testing start
    print('testing...')
    start_time = time.perf_counter()

    # make predictions on validation set
    y_pred = model.predict(x_valid_tfidf)

    # testing end
    end_time = time.perf_counter()
    time_to_test = end_time - start_time
    print(f'testing completed: {time_to_test:.2f} seconds\n')

    # add results to result map
    results['time_to_train'].append(time_to_train)
    results['time_to_test'].append(time_to_test)
    results['accuracy'].append(accuracy_score(y_valid_le, y_pred))
    # average=None keeps one F1 score per class instead of a single aggregate
    results['f1'].append(f1_score(y_valid_le, y_pred, average=None))

training Gaussian Naive Bayes...
training completed: 1.63 seconds
testing...
testing completed: 1.96 seconds

training Linear SVM...
training completed: 0.91 seconds
testing...
testing completed: 0.11 seconds

training Kernel SVM...
training completed: 1134.42 seconds
testing...
testing completed: 573.98 seconds

training XGBoost...
training completed: 348.07 seconds
testing...
testing completed: 0.41 seconds



In [8]:
# summarise the benchmark: one table per metric

# elapsed seconds for fitting and for predicting, one row per model
timing_table = pd.DataFrame(
    {'Train': results['time_to_train'], 'Test': results['time_to_test']},
    index=models.keys(),
)
display(timing_table)

# validation accuracy, one row per model
accuracy_table = pd.DataFrame(
    results['accuracy'],
    index=models.keys(),
    columns=['Accuracy'],
)
display(accuracy_table)

# per-class F1: results['f1'] holds one score vector per model, so transpose
# it to get classes as rows and models as columns
f1_by_class = np.transpose(results['f1'])
display(pd.DataFrame(f1_by_class, columns=models.keys(), index=encoder.classes_))

Unnamed: 0,Train,Test
Gaussian Naive Bayes,1.627168,1.960383
Linear SVM,0.91359,0.110055
Kernel SVM,1134.416128,573.977958
XGBoost,348.072153,0.406534


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.534857
Linear SVM,0.823252
Kernel SVM,0.83101
XGBoost,0.835203


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
age,0.483242,0.955501,0.966883,0.980787
ethnicity,0.733528,0.973973,0.976875,0.988969
gender,0.475927,0.85857,0.862314,0.869204
not_cyberbullying,0.270506,0.538624,0.556998,0.536842
other_cyberbullying,0.466019,0.635266,0.650826,0.655835
religion,0.733103,0.948328,0.954168,0.958154
