### Process Overview
1. Import the dataset
3. Split the dataset into training and test sets (no separate validation set is needed because hyperparameters are tuned with cross-validation)
3. Process the text data using TF-IDF
4. Train model using LR or SVM
5. Perform k-fold cross validation for hyperparameter tuning
6. Test each trained model
7. Plot confusion matrices & metrics (accuracy, precision, recall, f1-score)

### Imports
Include imports needed for the process

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Dataset
Import the dataset, describe it, and split it into training and test sets

In [None]:
from google.colab import drive

drive.mount('/content/drive/')

# Data import
data = pd.read_csv('/content/drive/MyDrive/cyberbullying_tweets.csv')
!ls /content/drive/MyDrive/cyberbullying_tweets.csv

# Convert cyberbullying_type column to be True or False
def convertToBullyClassification(cyberbullying_type):
  """Map a raw label to a boolean "is cyberbullying" flag.

  Parameters
  ----------
  cyberbullying_type : str or bool
      A raw category label, or a flag that has already been converted.

  Returns
  -------
  bool
      False for the label "not_cyberbullying", True for any other label.
      Values that are already boolean are returned unchanged.
  """
  # Re-running the conversion cell feeds booleans back in. Without this guard
  # `False != "not_cyberbullying"` evaluates to True, silently turning every
  # negative example into a positive one.
  if isinstance(cyberbullying_type, (bool, np.bool_)):
    return bool(cyberbullying_type)
  return cyberbullying_type != "not_cyberbullying"

# Collapse the multi-class label into a boolean flag (True = some form of bullying).
data["cyberbullying_type"] = data["cyberbullying_type"].apply(convertToBullyClassification)
# The column deliberately keeps the name "cyberbullying_type". A previous
# `data.rename(columns=...)` call here threw away its return value (rename
# returns a new frame rather than modifying `data`), so it never had any
# effect, and the data-splitting cell selects the target by this name.

# Data describing
data.describe(include='all')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/cyberbullying_tweets.csv


Unnamed: 0,tweet_text,cyberbullying_type
count,47692,47692
unique,46017,2
top,RT @sailorhg: the intro for my hardware hackin...,True
freq,2,39747


In [None]:
# Data splitting
text_features = "tweet_text"
target_feature = "cyberbullying_type"

# Hold out 20% of the rows for the final test; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

# Features are every column except the target. The labels are read through the
# same `target_feature` name (instead of a repeated string literal) so the
# dropped column and the label column can never diverge.
x_train = train_data.drop(columns=target_feature)
y_train = train_data[target_feature]

x_test = test_data.drop(columns=target_feature)
y_test = test_data[target_feature]

### Text Processing
Define TF-IDF to pre-process the text data into vector representations

In [None]:
# TF-IDF vectorizer used by every pipeline below.
# lowercase=False turns off the vectorizer's lowercasing step, so the original
# casing of the tweets is kept.
TFIDF = TfidfVectorizer(lowercase=False)

### Model Definition
Defining transformers for text pre-processing and model pipelines

In [None]:
# Column-wise pre-processing: the text column goes through TF-IDF, and any
# other column is forwarded untouched (remainder="passthrough").
TFIDF_preprocessor = ColumnTransformer(
    [("text", TFIDF, text_features)],
    remainder="passthrough",
)

In [None]:
# Model pipelines
#
# Each pipeline is "TF-IDF pre-processing -> classifier". The step names
# ("linearsvm", "lr") are the prefixes used by the hyperparameter grids.
# NOTE(review): both pipelines reference the same TFIDF_preprocessor object, so
# fitting either pipeline directly presumably refits that shared instance --
# confirm this is acceptable.

# LinearSVM + TF-IDF pipeline
linear_svm_tfidf = Pipeline([
    ("transformer", TFIDF_preprocessor),
    ("linearsvm", LinearSVC(random_state=123)),
])

# LR + TF-IDF pipeline
lr_tfidf = Pipeline([
    ("transformer", TFIDF_preprocessor),
    ("lr", LogisticRegression(random_state=123)),
])

### Hyperparameter Settings
Hyperparameters for Logistic Regression (LR) and LinearSVM classification models

In [None]:
# Number of cross-validation folds used by every grid search.
CV = 5

# Grid for a kernel SVM step named "svm".
# NOTE(review): no pipeline with an "svm" step is visible in this part of the
# notebook -- confirm this grid is still needed.
svm_param_grid = dict(
    svm__C=[1, 10, 100, 1000],
    svm__gamma=[1, 0.1, 0.001, 0.0001],
    svm__kernel=['linear', 'rbf', 'poly'],
)

# Grid for the "linearsvm" step of the LinearSVC pipeline.
# NOTE(review): C=0.1 is absent from the listed values -- confirm that is intentional.
linear_svm_param_grid = dict(
    linearsvm__C=[0.001, 0.01, 1, 10, 100],
    linearsvm__max_iter=[10000, 20000, 50000],
    linearsvm__tol=[1e-2, 1e-4, 1e-6],
)

# Logistic Regression grid for the "lr" pipeline step.
#
# A single cartesian grid over solver x penalty produces pairs scikit-learn
# rejects (e.g. lbfgs/newton-cg with 'l1', liblinear with None, and
# 'elasticnet' with any of these three solvers), so half of all fits failed
# and were scored NaN. The grid is therefore a list of sub-grids, one per group
# of solvers, each listing only the penalties those solvers accept. The set of
# valid candidates is unchanged; only the always-failing ones are gone.
# 'elasticnet' is omitted because it needs solver='saga' and an l1_ratio,
# neither of which is part of this search.
_lr_common_grid = {
    'lr__C': [0.1, 1, 10, 100],
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
}

lr_param_grid = [
    # lbfgs and newton-cg: L2 or no regularisation.
    {
        **_lr_common_grid,
        'lr__penalty': [None, 'l2'],
        'lr__solver': ['lbfgs', 'newton-cg'],
    },
    # liblinear: L1 or L2 regularisation.
    {
        **_lr_common_grid,
        'lr__penalty': ['l1', 'l2'],
        'lr__solver': ['liblinear'],
    },
]

### LinearSVM Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `linear_svm_tfidf`

In [None]:
# Exhaustive cross-validated search over the LinearSVC grid, using all cores.
linear_svm_tfidf_grid = GridSearchCV(
    estimator=linear_svm_tfidf,
    param_grid=linear_svm_param_grid,
    cv=CV,
    n_jobs=-1,
    verbose=2,
)
linear_svm_tfidf_grid.fit(x_train, y_train)

# Keep the full per-candidate CV table on disk.
linear_svm_tfidf_results = pd.DataFrame(linear_svm_tfidf_grid.cv_results_)
linear_svm_tfidf_results.to_csv('linear_svm_tfidf_results.csv', index=False)

# Best rank first; candidates tied on rank are ordered by shorter mean fit time.
sorted_linear_svm_tfidf_results = linear_svm_tfidf_results.sort_values(
    by=["rank_test_score", "mean_fit_time"]
)
display(sorted_linear_svm_tfidf_results)

# The top row after sorting supplies the parameters used for the final model.
linear_svm_tfidf_best_params = sorted_linear_svm_tfidf_results["params"].iloc[0]
print(f"Best Params: {linear_svm_tfidf_best_params}")

### Logistic Regression (LR) Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `lr_tfidf`

In [None]:
# Exhaustive cross-validated search over the Logistic Regression grid, using all cores.
lr_tfidf_grid = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=lr_param_grid,
    cv=CV,
    n_jobs=-1,
    verbose=2,
)
lr_tfidf_grid.fit(x_train, y_train)

# Keep the full per-candidate CV table on disk.
lr_tfidf_results = pd.DataFrame(lr_tfidf_grid.cv_results_)
lr_tfidf_results.to_csv('lr_tfidf_results.csv', index=False)

# Best rank first; candidates tied on rank are ordered by shorter mean fit time.
sorted_lr_tfidf_results = lr_tfidf_results.sort_values(
    by=["rank_test_score", "mean_fit_time"]
)
display(sorted_lr_tfidf_results)

# The top row after sorting supplies the parameters used for the final model.
lr_tfidf_best_params = sorted_lr_tfidf_results["params"].iloc[0]
print(f"Best Params: {lr_tfidf_best_params}")

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


720 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,param_lr__fit_intercept,param_lr__max_iter,param_lr__penalty,param_lr__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
100,1.757987,0.045110,0.310129,0.007873,1.0,True,1000,l1,liblinear,"{'lr__C': 1, 'lr__fit_intercept': True, 'lr__m...",0.868562,0.871183,0.873804,0.879161,0.870380,0.872618,0.003681,1
76,2.163181,0.568652,0.455880,0.123539,1.0,True,250,l1,liblinear,"{'lr__C': 1, 'lr__fit_intercept': True, 'lr__m...",0.868562,0.871183,0.873804,0.879161,0.870380,0.872618,0.003681,1
88,2.574565,0.519766,0.411719,0.134446,1.0,True,500,l1,liblinear,"{'lr__C': 1, 'lr__fit_intercept': True, 'lr__m...",0.868562,0.871183,0.873804,0.879161,0.870380,0.872618,0.003681,1
124,1.700069,0.045984,0.308211,0.005005,1.0,False,500,l1,liblinear,"{'lr__C': 1, 'lr__fit_intercept': False, 'lr__...",0.863059,0.865548,0.870266,0.878244,0.866186,0.868661,0.005322,4
112,1.721793,0.018041,0.289685,0.012301,1.0,False,250,l1,liblinear,"{'lr__C': 1, 'lr__fit_intercept': False, 'lr__...",0.863059,0.865548,0.870266,0.878244,0.866186,0.868661,0.005322,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,2.540930,0.235006,0.000000,0.000000,0.1,False,500,l1,newton-cg,"{'lr__C': 0.1, 'lr__fit_intercept': False, 'lr...",,,,,,,,145
239,2.543241,0.183012,0.000000,0.000000,100.0,True,500,elasticnet,newton-cg,"{'lr__C': 100, 'lr__fit_intercept': True, 'lr_...",,,,,,,,145
227,2.560418,0.187894,0.000000,0.000000,100.0,True,250,elasticnet,newton-cg,"{'lr__C': 100, 'lr__fit_intercept': True, 'lr_...",,,,,,,,145
59,2.633424,0.434459,0.000000,0.000000,0.1,False,500,elasticnet,newton-cg,"{'lr__C': 0.1, 'lr__fit_intercept': False, 'lr...",,,,,,,,145


Best Params: {'lr__C': 1, 'lr__fit_intercept': True, 'lr__max_iter': 1000, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}


### Best Models
Refit the LinearSVM and LR pipelines on the full training set using the best parameters found above

In [None]:
# Best LinearSVM
# Apply the winning hyperparameters, then refit on the whole training set.
# set_params returns the pipeline itself, so the two calls can be chained.
linear_svm_tfidf.set_params(**linear_svm_tfidf_best_params).fit(x_train, y_train)

In [None]:
# Best LR
# Apply the winning hyperparameters, then refit on the whole training set.
# set_params returns the pipeline itself, so the two calls can be chained.
lr_tfidf.set_params(**lr_tfidf_best_params).fit(x_train, y_train)