In [1]:
import sys
sys.path.insert(0, '../scripts/')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [26]:
# import required packages
import numpy as np
import pandas as pd

# encoders
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# models
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV 

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import recall_score, roc_auc_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus

In [5]:
# load train, validation and test sets into dataframe
df_train = pd.read_csv('../data/train_data.csv')
df_valid = pd.read_csv('../data/valid_data.csv')
df_test = pd.read_csv('../data/test_data.csv')

# combine train and validation sets
# shuffle dataframe randomly
df_train = pd.concat([df_train, df_valid]).sample(frac=1, random_state=42).reset_index(drop=True)

# shape of train and test sets: (rows, columns)
display(df_train.shape, df_test.shape)

(38153, 2)

(9539, 2)

In [7]:
df_train.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"Smh... “@tayyoung_: FUCK OBAMA, dumb ass nigger”",ethnicity
1,If one retired army officer does something bru...,religion
2,acting like the nazis didn’t take inspiration ...,age
3,Tje clps on peel scholl dont care abt cyber bu...,age
4,#parents Another key word 4 Bullying Preventio...,not_cyberbullying


In [10]:
# first 5 datapoints of train and test sets
display(df_train.head())
display(df_test.head())

Unnamed: 0,tweet_text,cyberbullying_type
0,"Smh... “@tayyoung_: FUCK OBAMA, dumb ass nigger”",ethnicity
1,If one retired army officer does something bru...,religion
2,acting like the nazis didn’t take inspiration ...,age
3,Tje clps on peel scholl dont care abt cyber bu...,age
4,#parents Another key word 4 Bullying Preventio...,not_cyberbullying


Unnamed: 0,tweet_text,cyberbullying_type
0,@Goree_JuhssGuns hahaha he ain't even worth my...,ethnicity
1,RT @hsaymssik: Sucks to have the smile wiped o...,gender
2,"Just a reminder, it's absolutely disgusting to...",ethnicity
3,RT @BuzzFeedUK: When you accidentally open you...,other_cyberbullying
4,Loving the look of the fritters! #mkr,not_cyberbullying


In [8]:
# extract independent features
# preprocess text column
X_train = preprocess_corpus(df_train.tweet_text)
X_test = preprocess_corpus(df_test.tweet_text)

# size of train & test sets
display(X_train.shape, X_test.shape)

(38153,)

(9539,)

In [11]:
# first 5 preprocessed tweets of train & test sets
display(X_train.head())
display(X_test.head())

0                  smh tayyoung fuck obama dumb nigger
1    one retired army officer something brutal whol...
2    act like nazi take inspiration gas chamber use...
3    tje clps peel scholl dont care abt cyber bully...
4    parent another key word bully prevention think...
Name: tweet_text, dtype: object

0    goree juhssguns hahaha even worth tweet dumb f...
1    hsaymssik suck smile wiped face huh kat glass ...
2    reminder absolutely disgust see people would p...
3            buzzfeeduk accidentally open front camera
4                                love look fritter mkr
Name: tweet_text, dtype: object

In [33]:
# encode the class labels
# extract dependent features
y_train = df_train.cyberbullying_type
y_test = df_test.cyberbullying_type

# spawn a label encoder
encoder = LabelEncoder()

# train the encoder on the train set labels
encoder.fit(y_train.values.ravel())

# transform the labels
y_train = pd.DataFrame(encoder.transform(y_train.values.ravel()), columns=['cyberbullying_type'])
y_test = pd.DataFrame(encoder.transform(y_test.values.ravel()), columns=['cyberbullying_type'])

# size of train & test set class labels
display(y_train.shape, y_test.shape)

(38153, 1)

(9539, 1)

In [34]:
# first 5 encoded class labels of train & test sets
display(y_train.head())
display(y_test.head())

Unnamed: 0,cyberbullying_type
0,1
1,5
2,0
3,0
4,3


Unnamed: 0,cyberbullying_type
0,1
1,2
2,1
3,4
4,3


# Bag of Words Transformation

In [35]:
# bag of words transformation
# instantiate a CountVectorizer
bow_vectorizer = CountVectorizer(min_df=15)

# train and construct bag of words
X_train_bow = pd.DataFrame(bow_vectorizer.fit_transform(X_train).toarray(), columns=bow_vectorizer.get_feature_names_out())
X_test_bow = pd.DataFrame(bow_vectorizer.transform(X_test).toarray(), columns=bow_vectorizer.get_feature_names_out())

# shape of document matrix: (rows, columns)
display(X_train_bow.shape, X_test_bow.shape)

(38153, 3192)

(9539, 3192)

In [36]:
# first 5 datapoints of transformed train & validation sets
display(X_train_bow.head())
display(X_test_bow.head())

Unnamed: 0,aalwuhaib,abc,ability,able,abortion,absolute,absolutely,abt,abu,abuse,...,yousufpoosuf,youth,youtube,ypg,yrs,yup,zaibatsunews,zappe,zero,zionist
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,aalwuhaib,abc,ability,able,abortion,absolute,absolutely,abt,abu,abuse,...,yousufpoosuf,youth,youtube,ypg,yrs,yup,zaibatsunews,zappe,zero,zionist
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# utility function
# plot confusion matrix using sns heatmap
def plot_confusion_matrix(cf_matrix, title, xlabel='Predicted', ylabel='Actual', ticklabels=None, figsize=(8,4), fontdict={'fontsize':12}):
    
    # extract counts from confusion matrix
    group_counts = ['{0:0.0f}'.format(value) for value in cf_matrix.flatten()]

    # calculate proportions from confusion matrix
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    
    # create annotations for plot
    annotations = [f'{count}\n{percentage}\n'.format(count, percentage) for count, percentage in zip(group_counts, group_percentages)]
    annotations = np.asarray(annotations).reshape(cf_matrix.shape)

    plt.figure(figsize = figsize, dpi=100)
    ax = sns.heatmap(cf_matrix, annot=annotations, fmt='', cmap='Blues')

    # set title
    ax.set_title(title, fontdict=fontdict)
    
    # set axes labels
    ax.set_xlabel(xlabel, fontdict=fontdict)
    ax.set_ylabel(ylabel, fontdict=fontdict)

    # tick labels - List must be in alphabetical order
    if ticklabels:
        # set axes tick labels
        ax.xaxis.set_ticklabels(ticklabels)
        ax.yaxis.set_ticklabels(ticklabels)

    # display the visualization of the confusion matrix.
    plt.show()

In [41]:
X_train['abc']

KeyError: 'abc'

# Hyperparameter Tuning: XGBoost

In [38]:
# create a map of params to be optimized
params = {
    'learning_rate': (0.03, 0.1, 0.3),
    'max_depth': (4, 6, 8, 10),
    'n_estimators': (50, 100, 500)
}

# instantiate a GridSearchCV object with SVM model and params
grid_search_cv = GridSearchCV(XGBClassifier(), params, scoring = 'roc_auc', verbose=4, cv=5)

# perform grid search
grid_search_cv.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END learning_rate=0.03, max_depth=4, n_estimators=50;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.03, max_depth=4, n_estimators=50;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.03, max_depth=4, n_estimators=50;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.03, max_depth=4, n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.03, max_depth=4, n_estimators=50;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.03, max_depth=4, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.03, max_depth=4, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.03, max_depth=4, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.03, max_depth=4, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.03, max_depth=4, n_estimators=100;, score=nan total time=   0.0s
[CV

[CV 3/5] END learning_rate=0.1, max_depth=8, n_estimators=50;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=8, n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=8, n_estimators=50;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.1, max_depth=8, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=8, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.1, max_depth=8, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=8, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=8, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.1, max_depth=8, n_estimators=500;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=8, n_estimators=500;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.1, max_depth=8, n_estimators=500;, score=nan

ValueError: 
All the 180 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\core.py", line 532, in inner_f
    return f(**kwargs)
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\sklearn.py", line 1382, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\sklearn.py", line 401, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\sklearn.py", line 1396, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\core.py", line 532, in inner_f
    return f(**kwargs)
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\core.py", line 643, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\data.py", line 899, in dispatch_data_backend
    return _from_pandas_series(
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\data.py", line 392, in _from_pandas_series
    _invalid_dataframe_dtype(data)
  File "C:\Users\ashut\Dev\languages\python\python310\lib\site-packages\xgboost\data.py", line 247, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`.


In [30]:
y_train.dtypes

AttributeError: 'numpy.ndarray' object has no attribute 'dtypes'

gender                 6442
religion               6432
age                    6389
ethnicity              6358
not_cyberbullying      6321
other_cyberbullying    6211
Name: cyberbullying_type, dtype: int64