In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tqdm import tqdm

In [2]:
# Load the dataset
df = pd.read_csv(
    r"C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\final_data.csv"
)  # Assuming your data is in 'final_data.csv'

In [3]:
# Preprocess the data
X = df["text"]
y = df["gold_label"]

In [4]:
# Convert text to numerical data (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

In [5]:
# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [6]:
# Train-Test Split with Stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [7]:
# Define your parameter grids
param_grid_logreg = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}

param_grid_rf = {"n_estimators": [100, 200], "max_depth": [10, 20, None]}

param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"],
}

# Initialize a dictionary to store models and their F1 scores
model_f1_scores = {}

In [8]:
# Logistic Regression
grid_search_logreg = GridSearchCV(
    LogisticRegression(max_iter=1000), param_grid_logreg, cv=3, n_jobs=-1
)
grid_search_logreg.fit(X_train, y_train)

In [9]:
# Get the best Logistic Regression model
best_logreg_model = grid_search_logreg.best_estimator_
y_pred_best_logreg = best_logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_best_logreg)
logreg_f1 = f1_score(y_test, y_pred_best_logreg, average="weighted")
model_f1_scores["Logistic Regression"] = logreg_f1
print("Logistic Regression Classification Report:")
print(
    classification_report(
        y_test, y_pred_best_logreg, target_names=label_encoder.classes_
    )
)

Logistic Regression Classification Report:
               precision    recall  f1-score   support

       Hate 1       0.95      0.92      0.93      2000
       Hate 2       0.96      0.96      0.96      2000
       Hate 3       1.00      0.99      0.99      2000
No Hate/Toxic       0.73      0.72      0.73      2000
      Toxic 1       0.72      0.72      0.72      2000
      Toxic 2       0.71      0.76      0.73      2000
      Toxic 3       0.99      0.99      0.99      2000

     accuracy                           0.86     14000
    macro avg       0.87      0.86      0.87     14000
 weighted avg       0.87      0.86      0.87     14000



In [10]:
# Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

In [11]:
# Get the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
rf_f1 = f1_score(y_test, y_pred_best_rf, average="weighted")
model_f1_scores["Random Forest"] = rf_f1
print("Random Forest Classification Report:")
print(
    classification_report(y_test, y_pred_best_rf, target_names=label_encoder.classes_)
)

Random Forest Classification Report:
               precision    recall  f1-score   support

       Hate 1       0.94      0.92      0.93      2000
       Hate 2       0.93      0.96      0.94      2000
       Hate 3       1.00      0.99      0.99      2000
No Hate/Toxic       0.73      0.72      0.73      2000
      Toxic 1       0.74      0.71      0.72      2000
      Toxic 2       0.69      0.75      0.72      2000
      Toxic 3       1.00      0.99      0.99      2000

     accuracy                           0.86     14000
    macro avg       0.86      0.86      0.86     14000
 weighted avg       0.86      0.86      0.86     14000



In [12]:
# # SVM
# grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=3, n_jobs=-1)
# grid_search_svm.fit(X_train, y_train)

In [13]:
# # Get the best SVM model
# best_svm_model = grid_search_svm.best_estimator_
# y_pred_best_svm = best_svm_model.predict(X_test)
# svm_accuracy = accuracy_score(y_test, y_pred_best_svm)
# svm_f1 = f1_score(y_test, y_pred_best_svm, average='weighted')
# model_f1_scores['SVM'] = svm_f1
# print("SVM Classification Report:")
# print(classification_report(y_test, y_pred_best_svm, target_names=label_encoder.classes_))

# SVM with default settings (no tuning)
svm_model = SVC()  # Default parameters for SVM
svm_model.fit(X_train, y_train)

# Get predictions for the test set
y_pred_best_svm = svm_model.predict(X_test)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, y_pred_best_svm)
svm_f1 = f1_score(y_test, y_pred_best_svm, average="weighted")
model_f1_scores["SVM"] = svm_f1

# Print the classification report for SVM
print("SVM Classification Report:")
print(
    classification_report(y_test, y_pred_best_svm, target_names=label_encoder.classes_)
)

SVM Classification Report:
               precision    recall  f1-score   support

       Hate 1       0.97      0.92      0.94      2000
       Hate 2       0.98      0.96      0.97      2000
       Hate 3       1.00      0.99      0.99      2000
No Hate/Toxic       0.75      0.73      0.74      2000
      Toxic 1       0.72      0.74      0.73      2000
      Toxic 2       0.71      0.77      0.74      2000
      Toxic 3       1.00      0.99      0.99      2000

     accuracy                           0.87     14000
    macro avg       0.88      0.87      0.87     14000
 weighted avg       0.88      0.87      0.87     14000



In [14]:
# Compare the models based on weighted F1-score and select the best one
best_model_name, best_f1_score = max(model_f1_scores.items(), key=lambda x: x[1])
print(
    f"\nThe best model based on weighted F1-score is: {best_model_name} with F1-score: {best_f1_score:.4f}"
)

# Save the best model and the TF-IDF vectorizer
if best_model_name == "Logistic Regression":
    best_model = best_logreg_model
elif best_model_name == "Random Forest":
    best_model = best_rf_model
else:
    best_model = svm_model

# Save the fine-tuned best model
model_save_path = r"C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\model-1\classical_ml\model\model.pkl"
vectorizer_save_path = r"C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\model-1\classical_ml\model\tfidf_vectorizer.pkl"
label_save_path = r"C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\model-1\classical_ml\model\label_encoder.pkl"

with open(model_save_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

with open(vectorizer_save_path, "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# After fitting the LabelEncoder
with open(label_save_path, "wb") as le_file:
    pickle.dump(label_encoder, le_file)

print(f"Best model '{best_model_name}' saved to {model_save_path}")
print(f"TF-IDF vectorizer saved to {vectorizer_save_path}")


The best model based on weighted F1-score is: SVM with F1-score: 0.8734
Best model 'SVM' saved to C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\model-1\classical_ml\model\model.pkl
TF-IDF vectorizer saved to C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\model-1\classical_ml\model\tfidf_vectorizer.pkl
