Importing necessary libraries

In [2]:
import os
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from distilbert_model import DistilBertModel
from bert_model import BertModel
from roberta_model import RobertaModel
import torch
from torch.utils.data import Dataset, DataLoader
from .autonotebook import tqdm
from plot import plot_roc_curve
from ToxicCommentsDataset import ToxicCommentsDataset


RuntimeError: Failed to import transformers.models.distilbert.tokenization_distilbert_fast because of the following error (look up to see its traceback):
No module named 'Distilbert'

Defining Global variables

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Names of the six target labels; also used as column names for the predictions.
label_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Placeholder; overwritten with the chosen model's directory in the selection cell.
model_dir = ''

Selecting the model. When prompted, choose one of the following options by entering its number:
1. DistilBERT
2. BERT
3. RoBERTa

In [None]:
# Menu choice -> (model wrapper class, directory passed to its load/save calls).
model_choices = {
    '1': (DistilBertModel, "./DistilbertModel"),
    '2': (BertModel, "./BertModel"),
    '3': (RobertaModel, "./RobertaModel"),
}

selection = input("Please select the model from the following list: \n1. DistilBERT\n2. BERT\n3. RoBERTa\n")
if selection not in model_choices:
    # Any answer other than '1', '2' or '3' falls back to the first option.
    print("Defaulting to DistilBERT")

model_class, model_dir = model_choices.get(selection, model_choices['1'])
model = model_class(num_labels=6)

Loading and pre-processing the training dataset, then holding out 10% of it for validation. Pre-processing is done within the dataset class, defined in the train function of the models.

In [None]:
dataset_path = "data/input/train.csv"

df = pd.read_csv(dataset_path)

# Hold out 10% of the rows for validation. The seed is fixed so that a
# "Restart & Run All" produces the same train/validation partition every time.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Drop the shuffled original index so both frames are indexed 0..n-1.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
def clean_text(self, text):
    """Normalise a raw comment string.

    URLs become ``[URL]``, ``(talk)`` markers and ``HH:MM, Month D, YYYY (UTC)``
    timestamps become ``[META]``, line breaks become spaces, and runs of
    spaces are collapsed to a single space.

    Args:
        self: Unused; kept so the signature stays compatible with callers.
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        (r'http\S+', '[URL]'),
        (r'\(talk\)|\d{2}:\d{2}, \w+ \d{1,2}, \d{4} \(UTC\)', '[META]'),
    )
    cleaned = text
    for pattern, placeholder in substitutions:
        cleaned = re.sub(pattern, placeholder, cleaned)

    # Turn every line break into a space before squeezing repeated spaces.
    for line_break in ('\n', '\r'):
        cleaned = cleaned.replace(line_break, ' ')
    return re.sub(' +', ' ', cleaned)

Training the model

In [None]:
# Reuse a previously saved model when its directory exists and is non-empty;
# otherwise train from scratch and persist the result.
# os.listdir raises FileNotFoundError for a missing directory (the normal
# state on a first run), so existence is checked first.
if os.path.isdir(model_dir) and os.listdir(model_dir):
    model.load(model_dir)
    print("!!!!!!!!!Model loaded!!!!!!!!")
else:
    # Create the directory up front so a path problem surfaces before the
    # training run rather than at save time.
    os.makedirs(model_dir, exist_ok=True)
    model.train(train_df, val_df)
    model.save(model_dir)

Loading and pre-processing the test dataset

In [None]:
test_dataset_path = "data/input/test.csv"
test_labels_path = "data/input/test_labels.csv"

# Labels live in a separate file and are joined to the comments on 'id'.
df_test_labels = pd.read_csv(test_labels_path)

df_test = (
    pd.read_csv(test_dataset_path)
    # Missing comments become a single space; all text is lower-cased.
    .assign(comment_text=lambda frame: frame['comment_text'].fillna(" ").str.lower())
    .merge(df_test_labels, on='id')
    # Rows whose 'toxic' label is -1 are dropped from the evaluation set.
    .loc[lambda frame: frame.toxic != -1]
    .reset_index(drop=True)
)

# Keep the row order (shuffle=False) so predictions line up with df_test.
test_dataset = ToxicCommentsDataset(df_test, model.tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Getting Test Predictions and True Label data

In [None]:
def get_test_predictions_and_labels(distilbert_model_instance, data_loader):
    """Run the wrapped model over ``data_loader`` and collect logits and labels.

    The network is switched to eval mode and gradients are disabled. Batches
    are moved to the module-level ``device`` before the forward pass.

    Args:
        distilbert_model_instance: Wrapper object exposing the underlying
            network as ``.model``. The notebook passes whichever model was
            selected, so this is not limited to DistilBERT despite the name.
        data_loader: Iterable of batches; each batch is indexed with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        tuple: ``(predictions, true_labels)`` as NumPy arrays, each the
        per-batch results concatenated along axis 0. ``predictions`` holds
        the raw logits; no sigmoid is applied here.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    distilbert_model_instance.model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        # A single progress bar wraps the loader; it advances once per batch
        # and is closed automatically when the loop finishes.
        for batch in tqdm(data_loader, desc="Evaluating"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            outputs = distilbert_model_instance.model(**inputs)
            predictions.append(outputs.logits.detach().cpu().numpy())
            true_labels.append(batch['labels'].cpu().numpy())

    if not predictions:
        # np.concatenate on an empty list fails with an unhelpful message.
        raise ValueError("data_loader yielded no batches; nothing to evaluate")

    return np.concatenate(predictions, axis=0), np.concatenate(true_labels, axis=0)

# Run inference over the test loader to obtain raw logits and the reference labels.
predictions, true_labels = get_test_predictions_and_labels(model, test_loader)

# Multi-label setup: each logit is squashed independently into a probability.
sigmoid = torch.nn.Sigmoid()
logits_tensor = torch.tensor(predictions)
probabilities = sigmoid(logits_tensor).numpy()

Creating predictions.csv

In [None]:
# The submission stores the raw per-label probabilities; no threshold is applied here.
predicted_labels = probabilities

# One column per label, then the test ids attached and moved to the front.
submission_df = (
    pd.DataFrame(predicted_labels, columns=label_names)
    .assign(id=df_test['id'].values)
    .loc[:, ['id', *label_names]]
)

# Write the result without the pandas index column.
submission_df.to_csv('data/input/predictions.csv', index=False)

Calculating the metrics

In [None]:
# accuracy_score and precision_recall_fscore_support need hard 0/1 decisions;
# handing them continuous probabilities raises a ValueError. Threshold the
# probabilities here so this cell does not depend on how predicted_labels
# was left by the submission cell.
decision_threshold = 0.5
binary_predictions = (probabilities > decision_threshold).astype(int)
binary_true_labels = np.asarray(true_labels).astype(int)

# With multilabel input, accuracy_score reports subset accuracy: a sample
# counts as correct only when every one of its labels matches.
accuracy = accuracy_score(binary_true_labels, binary_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    binary_true_labels, binary_predictions, average='weighted'
)
print(np.shape(true_labels))

# ROC AUC is threshold-free, so it is computed from the raw probabilities.
roc_aucs = plot_roc_curve(true_labels, probabilities, label_names)

# Compute the mean column-wise ROC AUC
mean_roc_auc = np.mean(roc_aucs)

print(f"Mean column-wise ROC AUC: {mean_roc_auc}")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")