In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
tweets_df = pd.read_csv("data/cyberbullying_tweets.csv")
tweets_df

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity


In [3]:
def convertToBullyClassification(cyberbullying_type):
    return cyberbullying_type != "not_cyberbullying"

In [4]:
tweets_df["cyberbullying_type"] = tweets_df["cyberbullying_type"].apply(convertToBullyClassification)
tweets_df.rename(columns={"cyberbullying_type":"is_cyberbullying"})
tweets_df

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",False
1,Why is #aussietv so white? #MKR #theblock #ImA...,False
2,@XochitlSuckkks a classy whore? Or more red ve...,False
3,"@Jason_Gio meh. :P thanks for the heads up, b...",False
4,@RudhoeEnglish This is an ISIS account pretend...,False
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",True
47688,Turner did not withhold his disappointment. Tu...,True
47689,I swear to God. This dumb nigger bitch. I have...,True
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,True


In [5]:
print(tweets_df['cyberbullying_type'].value_counts().get(True, 0))
print(tweets_df['cyberbullying_type'].value_counts().get(False, 0))

39747
7945


In [6]:
# Data splitting
text_features = "tweet_text"
target_feature = "cyberbullying_type"
data = tweets_df

train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

x_train = train_data.drop(columns=target_feature)
y_train = train_data["cyberbullying_type"]

x_test = test_data.drop(columns=target_feature)
y_test = test_data["cyberbullying_type"]

### Text Processing
Define TF-IDF to pre-process the text data into vector representations

In [7]:
# TF-IDF definition
TFIDF = TfidfVectorizer(lowercase = False)

### Model Definition
Defining transformers for text pre-processing and model pipelines

In [8]:
# TF-IDF pre-processor transformer
TFIDF_preprocessor = ColumnTransformer(
    transformers=[
        ('text', TFIDF, text_features)
    ],
    remainder='passthrough'
)

In [9]:
# Model pipelines

# LinearSVM + TF-IDF pipeline
linear_svm_tfidf = Pipeline(
    steps = [
        ("transformer", TFIDF_preprocessor),
        ("linearsvm", LinearSVC(random_state=123))
    ]
)

In [10]:
# LR + TF-IDF pipeline
lr_tfidf = Pipeline(
    steps = [
        ("transformer", TFIDF_preprocessor),
        ("lr", LogisticRegression(random_state=123,solver='liblinear'))
    ]
)

### Hyperparameter Settings
Hyperparameters for Logistic Regression (LR) and LinearSVM classification models

In [12]:
CV = 5

svm_param_grid = {
    'svm__C': [1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.001, 0.0001],
    'svm__kernel': ['linear', 'rbf', 'poly'],
}

linear_svm_param_grid = {
    'linearsvm__C': [0.001, 0.01, 1, 10, 100],
    'linearsvm__max_iter': [10000, 20000, 50000],
    'linearsvm__tol': [1e-2, 1e-4, 1e-6],
}

lr_param_grid = [
    {
    'lr__C': [0.1, 1, 10, 100],
    'lr__penalty': ['l2'],
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
    'lr__solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']
    },
    {
    'lr__C': [0.1, 1, 10, 100],
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
    'lr__solver': ['liblinear']
    },
    {
    'lr__C': [0.1, 1, 10, 100],
    'lr__penalty': ['l1', 'l2', 'elasticnet'],
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
    'lr__solver': ['saga']
    },
    {
    'lr__penalty': [None],
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
    'lr__solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    }
]

### LinearSVM Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `linear_svm_tfidf`

In [12]:
linear_svm_tfidf_grid = GridSearchCV(linear_svm_tfidf, linear_svm_param_grid, cv=CV, n_jobs=-1, verbose=2)
linear_svm_tfidf_grid.fit(x_train, y_train)
linear_svm_tfidf_results = pd.DataFrame(linear_svm_tfidf_grid.cv_results_)
linear_svm_tfidf_results.to_csv('linear_svm_tfidf_results.csv', index=False)

sorted_linear_svm_tfidf_results = linear_svm_tfidf_results.sort_values(by=["rank_test_score", "mean_fit_time"], ascending=[True, True])
display(sorted_linear_svm_tfidf_results)
linear_svm_tfidf_best_params = sorted_linear_svm_tfidf_results.iloc[0]["params"]
print(f"Best Params: {linear_svm_tfidf_best_params}")

Fitting 5 folds for each of 45 candidates, totalling 225 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_linearsvm__C,param_linearsvm__max_iter,param_linearsvm__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
25,1.125862,0.054124,0.196133,0.005063,1.0,50000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
19,1.171965,0.077434,0.196504,0.023774,1.0,10000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
26,1.184356,0.025232,0.193134,0.0042,1.0,50000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
20,1.212296,0.034605,0.189504,0.023106,1.0,10000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
22,1.24668,0.026106,0.216429,0.014913,1.0,20000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
23,1.336685,0.019837,0.189928,0.011238,1.0,20000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
24,1.050351,0.014553,0.193248,0.008567,1.0,50000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
18,1.060702,0.039689,0.209509,0.008429,1.0,10000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
21,1.073329,0.05851,0.205691,0.019267,1.0,20000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
9,0.971898,0.01876,0.186342,0.009151,0.01,10000,0.01,"{'linearsvm__C': 0.01, 'linearsvm__max_iter': ...",0.842747,0.842485,0.840257,0.844954,0.843906,0.84287,0.001575,10


Best Params: {'linearsvm__C': 1, 'linearsvm__max_iter': 50000, 'linearsvm__tol': 0.0001}


### Logistic Regression (LR) Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `lr_tfidf`

In [None]:
lr_tfidf_grid = GridSearchCV(lr_tfidf, lr_param_grid, cv=CV, n_jobs=-1, verbose=2)
lr_tfidf_grid.fit(x_train, y_train)
lr_tfidf_results = pd.DataFrame(lr_tfidf_grid.cv_results_)
lr_tfidf_results.to_csv('lr_tfidf_results.csv', index=False)

sorted_lr_tfidf_results = lr_tfidf_results.sort_values(by=["rank_test_score", "mean_fit_time"], ascending=[True, True])
display(sorted_lr_tfidf_results)
lr_tfidf_best_params = sorted_lr_tfidf_results.iloc[0]["params"]
print(f"Best Params: {lr_tfidf_best_params}")

Fitting 5 folds for each of 246 candidates, totalling 1230 fits


### Best Models
Best parameters for the LinearSVM and LR models

In [14]:
# Best LinearSVM
linear_svm_tfidf.set_params(**linear_svm_tfidf_best_params)
linear_svm_tfidf.fit(x_train, y_train)

In [15]:
# Best LR
lr_tfidf.set_params(**lr_tfidf_best_params)
lr_tfidf.fit(x_train, y_train)