### Process Overview
1. Import the dataset
2. Split dataset into training and testing (no need for a validation dataset because we will be using cross-validation)
3. Process the text data using TF-IDF
4. Train models using Logistic Regression (LR) and a linear SVM (LinearSVC)
5. Perform k-fold cross validation for hyperparameter tuning
6. Test each trained model
7. Plot confusion matrices & metrics (accuracy, precision, recall, f1-score)

### Imports
Include imports needed for the process

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.svm import SVC, LinearSVC

### Dataset
Import the dataset, describe it, and split it into training and test sets

In [None]:
from google.colab import drive

drive.mount('/content/drive/')

# Data import
data = pd.read_csv('/content/drive/MyDrive/cyberbullying_tweets.csv')
!ls /content/drive/MyDrive/cyberbullying_tweets.csv

def convertToBullyClassification(cyberbullying_type):
  """Turn a raw `cyberbullying_type` label into a binary flag.

  Args:
    cyberbullying_type: A single label value from the dataset.

  Returns:
    False when the label is exactly "not_cyberbullying"; True for any other
    label.
  """
  negative_label = "not_cyberbullying"
  return cyberbullying_type != negative_label

# Collapse the multi-class label into a boolean: True for any cyberbullying category,
# False for "not_cyberbullying".
data["cyberbullying_type"] = data["cyberbullying_type"].apply(convertToBullyClassification)

# NOTE: the column intentionally keeps the name "cyberbullying_type". The previous
# `data.rename(...)` call discarded its return value (rename returns a new frame), so
# it never took effect, and the splitting cell selects the target by this name.

# Data describing
data.describe(include='all')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/cyberbullying_tweets.csv


Unnamed: 0,tweet_text,cyberbullying_type
count,47692,47692
unique,46017,2
top,RT @sailorhg: the intro for my hardware hackin...,True
freq,2,39747


In [None]:
# Data splitting
text_features = "tweet_text"            # column handed to the TF-IDF vectorizer
target_feature = "cyberbullying_type"   # label column

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

# Features are every column except the label; the label is selected through
# `target_feature` so the column name is spelled in exactly one place.
x_train = train_data.drop(columns=target_feature)
y_train = train_data[target_feature]

x_test = test_data.drop(columns=target_feature)
y_test = test_data[target_feature]

### Text Processing
Define TF-IDF to pre-process the text data into vector representations

In [None]:
# TF-IDF vectorizer; lowercase=False leaves the casing of the input text untouched
TFIDF = TfidfVectorizer(lowercase=False)

### Model Definition
Defining transformers for text pre-processing and model pipelines

In [None]:
# Column-wise pre-processing: run TF-IDF on the text column and pass any
# remaining column through unchanged.
TFIDF_preprocessor = ColumnTransformer(
    [('text', TFIDF, text_features)],
    remainder='passthrough',
)

In [None]:
# Model pipelines

# LinearSVM + TF-IDF pipeline: text pre-processing followed by a linear SVM.
# The step names are the prefixes used by the hyperparameter grid keys.
linear_svm_tfidf = Pipeline([
    ("transformer", TFIDF_preprocessor),
    ("linearsvm", LinearSVC(random_state=123)),
])

NameError: name 'TFIDF_preprocessor' is not defined

In [None]:
# LR + TF-IDF pipeline: text pre-processing followed by logistic regression.
# The step names are the prefixes used by the hyperparameter grid keys.
lr_tfidf = Pipeline([
    ("transformer", TFIDF_preprocessor),
    ("lr", LogisticRegression(random_state=123)),
])

### Hyperparameter Settings
Hyperparameters for Logistic Regression (LR) and LinearSVM classification models

In [None]:
# Number of cross-validation folds passed to the grid searches
CV = 5

# Kernel-SVM search space; keys are addressed to a pipeline step named "svm"
svm_param_grid = dict(
    svm__C=[1, 10, 100, 1000],
    svm__gamma=[1, 0.1, 0.001, 0.0001],
    svm__kernel=['linear', 'rbf', 'poly'],
)

# LinearSVC search space; keys are addressed to the "linearsvm" pipeline step
linear_svm_param_grid = dict(
    linearsvm__C=[0.001, 0.01, 1, 10, 100],
    linearsvm__max_iter=[10000, 20000, 50000],
    linearsvm__tol=[1e-2, 1e-4, 1e-6],
)

# Logistic-regression search space.
#
# Given as a list of sub-grids (GridSearchCV accepts a list of dicts) because not
# every solver supports every penalty. A single flat grid crosses all solvers with
# all penalties, and each unsupported pair is a fit that fails and is scored NaN.
#   'l2'         -> lbfgs, liblinear, sag, saga, newton-cg
#   'l1'         -> liblinear, saga
#   'elasticnet' -> saga only, and l1_ratio must be set
#   None         -> lbfgs, sag, saga, newton-cg (liblinear needs a penalty)
_lr_C_values = [0.1, 1, 10, 100]
_lr_shared = {
    'lr__max_iter': [250, 500, 1000],
    'lr__fit_intercept': [True, False],
}

lr_param_grid = [
    {
        'lr__C': _lr_C_values,
        'lr__penalty': ['l2'],
        'lr__solver': ['lbfgs', 'liblinear', 'sag', 'saga', 'newton-cg'],
        **_lr_shared,
    },
    {
        'lr__C': _lr_C_values,
        'lr__penalty': ['l1'],
        'lr__solver': ['liblinear', 'saga'],
        **_lr_shared,
    },
    {
        'lr__C': _lr_C_values,
        'lr__penalty': ['elasticnet'],
        # elasticnet cannot be fit without an L1/L2 mixing ratio
        'lr__l1_ratio': [0.5],
        'lr__solver': ['saga'],
        **_lr_shared,
    },
    {
        # C is ignored when no penalty is applied, so it is not searched here
        'lr__penalty': [None],
        'lr__solver': ['lbfgs', 'sag', 'saga', 'newton-cg'],
        **_lr_shared,
    },
]

### LinearSVM Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `linear_svm_tfidf`

In [None]:
# Grid search over the LinearSVC search space with CV-fold cross-validation,
# using all available cores (n_jobs=-1).
linear_svm_tfidf_grid = GridSearchCV(
    linear_svm_tfidf,
    linear_svm_param_grid,
    cv=CV,
    n_jobs=-1,
    verbose=2,
)
linear_svm_tfidf_grid.fit(x_train, y_train)

# Keep the per-candidate cross-validation table and write it to disk as CSV.
linear_svm_tfidf_results = pd.DataFrame(linear_svm_tfidf_grid.cv_results_)
linear_svm_tfidf_results.to_csv('linear_svm_tfidf_results.csv', index=False)

# Best rank first; candidates with the same rank are ordered by mean fit time,
# so the first row is the fastest of the top-ranked candidates.
sorted_linear_svm_tfidf_results = linear_svm_tfidf_results.sort_values(
    by=["rank_test_score", "mean_fit_time"],
    ascending=[True, True],
)
display(sorted_linear_svm_tfidf_results)

linear_svm_tfidf_best_params = sorted_linear_svm_tfidf_results.iloc[0]["params"]
print(f"Best Params: {linear_svm_tfidf_best_params}")

Fitting 5 folds for each of 45 candidates, totalling 225 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_linearsvm__C,param_linearsvm__max_iter,param_linearsvm__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,1.751401,0.008362,0.312447,0.005909,1.0,10000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
22,2.073536,0.358621,0.353325,0.081272,1.0,20000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
20,2.602577,0.731081,0.499484,0.122373,1.0,10000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
26,2.743551,0.821875,0.408402,0.126512,1.0,50000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
23,2.887452,0.582603,0.398874,0.105958,1.0,20000,1e-06,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
25,3.020898,0.842366,0.538113,0.123892,1.0,50000,0.0001,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856225,0.857547,0.003339,1
24,1.811504,0.074369,0.322511,0.006742,1.0,50000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 500...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
21,2.035722,0.394467,0.327289,0.00825,1.0,20000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 200...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
18,2.759435,0.538657,0.415946,0.120993,1.0,10000,0.01,"{'linearsvm__C': 1, 'linearsvm__max_iter': 100...",0.854934,0.85441,0.858603,0.863565,0.856094,0.857521,0.00335,7
9,1.592302,0.036362,0.320861,0.009813,0.01,10000,0.01,"{'linearsvm__C': 0.01, 'linearsvm__max_iter': ...",0.842747,0.842485,0.840257,0.844954,0.843906,0.84287,0.001575,10


Best Params: {'linearsvm__C': 1, 'linearsvm__max_iter': 10000, 'linearsvm__tol': 0.0001}


### Logistic Regression (LR) Hyperparameter Tuning
Hyperparameter tuning using GridSearchCV for `lr_tfidf`

In [None]:
# Grid search over the logistic-regression search space with CV-fold
# cross-validation, run on a single worker (n_jobs=1).
lr_tfidf_grid = GridSearchCV(
    lr_tfidf,
    lr_param_grid,
    cv=CV,
    n_jobs=1,
    verbose=2,
)
lr_tfidf_grid.fit(x_train, y_train)

# Keep the per-candidate cross-validation table and write it to disk as CSV.
lr_tfidf_results = pd.DataFrame(lr_tfidf_grid.cv_results_)
lr_tfidf_results.to_csv('lr_tfidf_results.csv', index=False)

# Best rank first; candidates with the same rank are ordered by mean fit time,
# so the first row is the fastest of the top-ranked candidates.
sorted_lr_tfidf_results = lr_tfidf_results.sort_values(
    by=["rank_test_score", "mean_fit_time"],
    ascending=[True, True],
)
display(sorted_lr_tfidf_results)

lr_tfidf_best_params = sorted_lr_tfidf_results.iloc[0]["params"]
print(f"Best Params: {lr_tfidf_best_params}")

NameError: name 'GridSearchCV' is not defined

### Best Models
Best parameters for the LinearSVM and LR models

In [None]:
# Best LinearSVM: apply the selected hyperparameters to the pipeline, then fit
# it on the training split (set_params returns the pipeline, so the calls chain).
linear_svm_tfidf.set_params(**linear_svm_tfidf_best_params).fit(x_train, y_train)

In [None]:
# Best LR: apply the selected hyperparameters to the pipeline, then fit it on
# the training split (set_params returns the pipeline, so the calls chain).
lr_tfidf.set_params(**lr_tfidf_best_params).fit(x_train, y_train)

### Confusion Matrices and Scores

Confusion matrix plots and metric scores for all models

In [None]:
# Best Models
models = [linear_svm_tfidf, lr_tfidf]

# Model Names
model_names = [
    "LinearSVC - TFIDF",
    "Logistic Regression - TFIDF"
]

# Scores: model name -> {metric name -> value}
scores = {}

# One row of axes per model. squeeze=False keeps `axes` two-dimensional even
# for a single model, so the column slice below is always iterable.
fig, axes = plt.subplots(len(models), 1, figsize=(10, 5 * len(models)), squeeze=False)

for ax, model, name in zip(axes[:, 0], models, model_names):
  ax.set_title(name)

  # Predict once on the held-out test split and reuse the predictions for both
  # the plot and the metrics.
  y_pred = model.predict(x_test)
  ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax, colorbar=False)

  # Metrics come from sklearn's scorers instead of manual matrix indexing:
  # sklearn lays the binary matrix out as [[TN, FP], [FN, TP]] (rows = true
  # label, columns = predicted label), which is easy to index the wrong way.
  # The positive class is True (the tweet is cyberbullying). zero_division=0
  # yields 0 instead of a warning when a model never predicts that class.
  scores[name] = {
      "Accuracy": accuracy_score(y_test, y_pred),
      "Precision": precision_score(y_test, y_pred, pos_label=True, zero_division=0),
      "Recall": recall_score(y_test, y_pred, pos_label=True, zero_division=0),
      "F1-Score": f1_score(y_test, y_pred, pos_label=True, zero_division=0),
  }

plt.tight_layout()
plt.show()
display(pd.DataFrame(scores).T)