In [1]:
# load on GPU
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, WeightedRandomSampler
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [3]:
def load_csv(filepath: str) -> pd.DataFrame:
    """
    Loads the given CSV file into a DataFrame.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded DataFrame.
    """
    try:
        data = pd.read_csv(filepath, engine="python")
        print(f"Data successfully loaded from {filepath}")
        return data
    except Exception as e:
        print(f"Error while reading the CSV file: {e}")
        return None

In [4]:
data = load_csv(
    r"C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\balanced_data.csv"
)
data

Data successfully loaded from C:\Users\richm\OneDrive\Desktop\DSA4264\DSA4264-Detoxify\clean_concatenated_texts_data_with_labels.csv


Unnamed: 0.1,Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation,...,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91
0,0,id of j2gwp5d said this: The best.,1/1/2023 4:51,MisoMesoMilo,/r/singapore/comments/1004s1o/rsingapore_rando...,t3_1004s1o,t1_j2gje0f,j2gwp5d,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",...,,,,,,,,,,
1,1,id of j2i2mtw said this: Wah heart pain.\nid o...,1/1/2023 14:03,MisoMesoMilo,/r/singapore/comments/1004s1o/rsingapore_rando...,t3_1004s1o,t1_j2hyfm2,j2i2mtw,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",...,,,,,,,,,,
2,2,id of j2h2alh said this: HNY DT where to go to...,1/1/2023 5:48,dazark,/r/singapore/comments/1004s1o/rsingapore_rando...,t3_1004s1o,t3_1004s1o,j2h2alh,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",...,,,,,,,,,,
3,3,"id of j2hpea5 said this: Clubbing shenanigans,...",1/1/2023 11:08,N1_Procrastinator,/r/singapore/comments/1004s1o/rsingapore_rando...,t3_1004s1o,t1_j2ho0m4,j2hpea5,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",...,,,,,,,,,,
4,4,id of j2hewei said this: Oooff...was supposed ...,1/1/2023 8:28,EaeleButEeelier,/r/singapore/comments/1004s1o/rsingapore_rando...,t3_1004s1o,t3_1004s1o,j2hewei,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1048555,id of fw5t8zt said this: Tharman will hard car...,6/27/2020 12:32,wank_for_peace,/r/singapore/comments/hgnndv/ivan_responds/fw5...,t3_hgnndv,t1_fw55rd0,fw5t8zt,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",...,,,,,,,,,,
1048571,1048556,id of fw5djn0 said this: Ivan whole day long p...,6/27/2020 8:16,show-up,/r/singapore/comments/hgnndv/ivan_responds/fw5...,t3_hgnndv,t1_fw52891,fw5djn0,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",...,,,,,,,,,,
1048572,1048557,id of fw5buyl said this: He is saying people a...,6/27/2020 7:47,CanISmellYourPanty,/r/singapore/comments/hgnndv/ivan_responds/fw5...,t3_hgnndv,t1_fw56nrv,fw5buyl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",...,,,,,,,,,,
1048573,1048558,id of fw54fmr said this: Eh you want say can s...,6/27/2020 5:52,thrulim123,/r/singapore/comments/hgnndv/ivan_responds/fw5...,t3_hgnndv,t1_fw52891,fw54fmr,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",...,,,,,,,,,,


In [6]:
# Hyperparameters for model training.
MAX_LEN = 200
num_classes = 7
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
EPOCHS = 3
LEARNING_RATE = 1e-05

In [7]:
# Update class-to-id mapping for 7 classes
cls_to_id = {
    "No Hate/Toxic": 0,
    "Hate 1": 1,
    "Hate 2": 2,
    "Hate 3": 3,
    "Toxic 1": 4,
    "Toxic 2": 5,
    "Toxic 3": 6,
}

id_to_cls = {v: k for k, v in cls_to_id.items()}

num_classes = 7  # Update the number of classes

In [8]:
# Convert labels to integer format
data["gold_label"] = data["gold_label"].map(cls_to_id)
data = data.dropna(subset=["gold_label"])
data["gold_label"] = data["gold_label"].astype(int)

# Step 1: Split the data into 70% train and 30% (validation + test)
df_train, df_temp = train_test_split(
    data,
    test_size=0.3,  # 30% for validation and test
    shuffle=True,  # Shuffle only during the first split
    stratify=data["gold_label"],  # Maintain class distribution
    random_state=42,
)

# Step 2: Split the remaining 30% into 15% validation and 15% test
# NOTE: Now stratifying using the labels from df_temp, not data
df_valid, df_test = train_test_split(
    df_temp,
    test_size=0.5,  # Split 50-50 between validation and test from the remaining 30%
    shuffle=True,
    stratify=df_temp["gold_label"],  # Correct stratification
    random_state=42,
)

# Display the sizes of each set
print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_valid)}")
print(f"Test set size: {len(df_test)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gold_label'] = filtered_df['gold_label'].map(cls_to_id)


In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.texts = df["text"].values
        self.targets = df["gold_label"].values
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())  # Clean text

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "token_type_ids": inputs["token_type_ids"].flatten(),
            "targets": torch.tensor(self.targets[index], dtype=torch.long),
            "text": text,
        }

In [None]:
# Assuming you have 10,000 samples per class and 7 classes
num_samples = len(df_train)
num_classes = len(np.unique(df_train["gold_label"]))
class_counts = np.bincount(df_train["gold_label"])
print(class_counts)

In [None]:
# Calculate class weights (inverse of class frequency)
class_weights = 1.0 / class_counts
sample_weights = class_weights[df_train["gold_label"]]
print(sample_weights)

In [11]:
# Create WeightedRandomSampler for balanced class sampling for the training dataset
train_sampler = WeightedRandomSampler(
    weights=sample_weights,  # You need to define how to calculate these sample weights
    num_samples=num_samples,
    replacement=False,
)

# Create datasets for train, validation, and test sets
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)  # Add the test dataset

# Data loaders
train_data_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    sampler=train_sampler,  # Use the weighted sampler here
    num_workers=4,  # Increase num_workers for faster loading
)

val_data_loader = DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,  # No need to shuffle validation data
    num_workers=4,
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,  # Typically use the same batch size as validation
    shuffle=False,  # No need to shuffle test data
    num_workers=4,
)

# Check the length of the datasets
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")  # Check the size of the test dataset

# Check the number of batches in the DataLoader
print(f"Number of batches in training DataLoader: {len(train_data_loader)}")
print(f"Number of batches in validation DataLoader: {len(val_data_loader)}")
print(
    f"Number of batches in test DataLoader: {len(test_data_loader)}"
)  # Check the number of batches for the test set

In [12]:
class BERTClass(nn.Module):
    def __init__(self, num_classes):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )

        # Additional fully connected and dropout layers
        self.fc1 = nn.Linear(768, 512)
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.2)

        # Final output layer for classification
        self.linear = nn.Linear(256, num_classes)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids
        )

        x = self.fc1(output.pooler_output)
        x = nn.ReLU()(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = nn.ReLU()(x)
        x = self.dropout2(x)

        output = self.linear(x)
        return output

In [None]:
# Calculate class weights based on the balanced data
class_counts = data["gold_label"].value_counts().sort_index()
class_weights = 1.0 / class_counts  # Inverse of class frequency
class_weights = class_weights / class_weights.sum()  # Normalize the weights

# Convert to a tensor and move to the correct device (GPU)
class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float).to(device)

# Use class weights in CrossEntropyLoss
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

In [None]:
# Initialize the model
model = BERTClass(num_classes=num_classes)

# Move the model to the appropriate device (use all available GPUs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function (already set with class weights) and optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)

# Training loop
for epoch in range(EPOCHS):
    start_time = time.time()  # Record the start time of the epoch

    model.train()  # Set model to training mode
    total_loss = 0.0
    correct_predictions = 0
    all_labels = []
    all_preds = []

    # Create a progress bar for the training data loader
    train_loader_tqdm = tqdm(
        train_data_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch"
    )

    # Training step with progress bar
    for batch in train_loader_tqdm:
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        labels = batch["targets"].to(device, dtype=torch.long)

        optimizer.zero_grad()  # Clear previous gradients

        # Forward pass
        logits = model(ids, mask, token_type_ids)
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Calculate accuracy
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        # Collect labels and predictions for precision, recall, and F1 calculation
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

        # Update the progress bar with loss
        train_loader_tqdm.set_postfix(
            {"train_loss": total_loss / (train_loader_tqdm.n + 1)}
        )

    # Calculate average loss and accuracy for this epoch
    avg_loss = total_loss / len(train_data_loader)
    accuracy = correct_predictions / len(train_data_loader.dataset)

    # Calculate precision, recall, and F1 for training
    precision = precision_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    f1 = f1_score(all_labels, all_preds, average="weighted")

    # Record the end time and calculate duration
    end_time = time.time()
    epoch_duration = end_time - start_time  # Time taken for this epoch

    print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}")
    print(
        f"Train Precision: {precision:.4f}, Train Recall: {recall:.4f}, Train F1-Score: {f1:.4f}"
    )
    print(f"Epoch Duration: {epoch_duration:.2f} seconds")

    # Validation step with progress bar
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_labels = []
    val_preds = []

    # Create a progress bar for the validation data loader
    val_loader_tqdm = tqdm(val_data_loader, desc="Validating", unit="batch")

    with torch.no_grad():
        for batch in val_loader_tqdm:
            ids = batch["input_ids"].to(device, dtype=torch.long)
            mask = batch["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            labels = batch["targets"].to(device, dtype=torch.long)

            logits = model(ids, mask, token_type_ids)
            loss = criterion(logits, labels)
            val_loss += loss.item()

            _, preds = torch.max(logits, dim=1)
            val_correct += torch.sum(preds == labels).item()

            # Collect validation labels and predictions for metrics
            val_labels.extend(labels.cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

            # Update the progress bar with validation loss
            val_loader_tqdm.set_postfix(
                {"val_loss": val_loss / (val_loader_tqdm.n + 1)}
            )

    avg_val_loss = val_loss / len(val_data_loader)
    val_accuracy = val_correct / len(val_data_loader.dataset)

    # Calculate precision, recall, and F1 for validation
    val_precision = precision_score(val_labels, val_preds, average="weighted")
    val_recall = recall_score(val_labels, val_preds, average="weighted")
    val_f1 = f1_score(val_labels, val_preds, average="weighted")

    print(
        f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
    )
    print(
        f"Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1-Score: {val_f1:.4f}"
    )
    print("-" * 30)

Epoch 1/20:   0%|          | 0/900 [00:00<?, ?batch/s]

In [None]:
training_number = 4

# Save the trained model weights
model_save_path = f"bert_model_{training_number}.pth"
torch.save(model.state_dict(), model_save_path)

# Initialize the model architecture (exactly the same as when the model was trained)
model = BERTClass(num_classes=num_classes)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the saved model weights onto the correct device
model.load_state_dict(torch.load(model_save_path, map_location=device))

print("Model loaded successfully")

In [12]:
# Assuming you have a validation DataLoader


def get_predictions_and_labels(model, data_loader, device):
    model.to(device)
    model.eval()

    predictions = []
    true_labels = []
    texts = []
    probabilities = []

    # Wrap the data loader with tqdm to create a progress bar
    loader_tqdm = tqdm(data_loader, desc="Generating predictions", unit="batch")

    with torch.no_grad():
        for batch in loader_tqdm:  # Use tqdm wrapped data_loader
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["targets"].to(device)
            text_batch = batch["text"]  # Collecting texts

            outputs = model(input_ids, attention_mask, token_type_ids)
            probs = nn.functional.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            texts.extend(text_batch)  # Collect texts
            probabilities.extend(probs.cpu().numpy())  # Store probabilities

    return np.array(predictions), np.array(true_labels), texts, np.array(probabilities)

In [None]:
# Generate predictions and labels for the test set
predictions, true_labels, texts, probabilities = get_predictions_and_labels(
    model, test_data_loader, device
)
# Extract class names from id_to_cls mapping
class_names = [id_to_cls[i] for i in range(num_classes)]
print(classification_report(true_labels, predictions, target_names=class_names))

In [None]:
# Function to calculate class-wise accuracy with tqdm


def calculate_class_wise_accuracy(conf_matrix):
    # True Positives for each class are the diagonal elements
    true_positives = np.diag(conf_matrix)

    # Support (Total actual instances for each class)
    support = conf_matrix.sum(axis=1)

    # Initialize list to store class-wise accuracy
    class_wise_accuracy = []

    # Use tqdm to show progress while calculating accuracy for each class
    for i in tqdm(
        range(len(true_positives)), desc="Calculating class-wise accuracy", unit="class"
    ):
        if support[i] != 0:  # Avoid division by zero
            accuracy = true_positives[i] / support[i]
        else:
            accuracy = 0.0
        class_wise_accuracy.append(accuracy)

    return np.array(class_wise_accuracy)

In [None]:
# Function to get misclassified samples with tqdm


def get_misclassified_samples(predictions, true_labels, texts):
    misclassified = []

    # Wrap the iteration with tqdm for a progress bar
    for pred, true, text in tqdm(
        zip(predictions, true_labels, texts),
        desc="Finding misclassified samples",
        total=len(predictions),
        unit="sample",
    ):
        if pred != true:
            misclassified.append(
                {"text": text, "true_label": true, "predicted_label": pred}
            )

    return misclassified

In [None]:
# Create confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)

# Calculate precision, recall, and F1-score (with zero_division handling)
precision = precision_score(
    true_labels, predictions, average="weighted", zero_division=0
)
recall = recall_score(true_labels, predictions, average="weighted", zero_division=0)
f1 = f1_score(true_labels, predictions, average="weighted", zero_division=0)

# Print the calculated metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1-Score (Weighted): {f1:.4f}")

# Normalize confusion matrix
conf_matrix_normalized = (
    conf_matrix.astype("float") / conf_matrix.sum(axis=1)[:, np.newaxis]
)

# Set plot size and style
plt.figure(figsize=(12, 10))
sns.set(font_scale=1.2)  # Adjust font size

# Plot normalized confusion matrix with percentage formatting
ax = sns.heatmap(
    conf_matrix_normalized,
    annot=True,
    fmt=".2%",
    cmap="coolwarm",
    xticklabels=list(cls_to_id.keys()),
    yticklabels=list(cls_to_id.keys()),
)

# Ensure equal aspect ratio for x and y axes
ax.set_aspect("equal")

# Improve plot aesthetics
ax.set_xlabel("Predicted Labels", fontsize=14)
ax.set_ylabel("True Labels", fontsize=14)
ax.set_title("Normalized Confusion Matrix", fontsize=16)

# Adjust tick positions and spread them outwards
ax.xaxis.set_ticks_position("top")  # Move x-axis ticks to the top
ax.xaxis.set_label_position("top")  # Move x-axis label to the top

# Set xticks and yticks to center-align with matrix cells
ax.set_xticks([i + 0.5 for i in range(len(cls_to_id))])
ax.set_yticks([i + 0.5 for i in range(len(cls_to_id))])

# Apply label rotation to improve readability (optional)
plt.xticks(rotation=45, ha="center")
plt.yticks(rotation=0)

# Save and show the plot
plt.savefig("normalized_confusion_matrix.png", dpi=300)
plt.show()

In [None]:
# Get misclassified samples
misclassified_samples = get_misclassified_samples(predictions, true_labels, texts)

# Convert to DataFrame for better visualization
misclassified_df = pd.DataFrame(misclassified_samples)

misclassified_df["true_label"] = misclassified_df["true_label"].apply(
    lambda x: id_to_cls[x]
)
misclassified_df["predicted_label"] = misclassified_df["predicted_label"].apply(
    lambda x: id_to_cls[x]
)

# Display misclassified samples
print(misclassified_df.shape, len(predictions))
# misclassified_df.to_csv('./data/misclassified_df.csv', index=False)
misclassified_df.head(50)