# In [13]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.metrics import roc_auc_score

# Read the training and test splits from the local dataset directory
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# Pick the GPU when CUDA reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Tokeniser matching the pretrained checkpoint that is fine-tuned below
tokeniser = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# Dataset class
class ToxicCommentsDataset(Dataset):
    """Map-style dataset pairing tokenised comments with their label vectors.

    Each item is a dict with three tensors: ``input_ids`` and
    ``attention_mask`` (the tokeniser output flattened to 1-D) and ``labels``
    (a float32 vector built from the row's label columns).

    Args:
        dataframe: Frame with a ``comment_text`` column. Every column from
            positional index 2 onwards is treated as a label column.
        tokeniser: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length passed to the tokeniser as ``max_length``; sequences
            are padded or truncated to it.
    """

    def __init__(self, dataframe, tokeniser, max_len=512):
        # The attribute keeps the name it was originally stored under;
        # __getitem__ must read this same name.
        self.tokenizer = tokeniser
        self.texts = dataframe['comment_text'].tolist()
        self.labels = dataframe.iloc[:, 2:].values  # Assuming labels start from the 3rd column
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise the comment at ``idx`` and return it with its labels."""
        text = self.texts[idx]
        # Previously this read ``self.tokeniser``, an attribute that was never
        # set, so every item access raised AttributeError.
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a leading batch axis of 1; drop it so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            # Explicit dtype: label columns read from CSV are typically
            # integers, while the labels are consumed as floats downstream.
            'labels': torch.as_tensor(self.labels[idx], dtype=torch.float32),
        }

# Tokenisation length and batch size shared by both loaders
max_len = 256
batch_size = 8

train_dataset = ToxicCommentsDataset(train_df, tokeniser, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = ToxicCommentsDataset(test_df, tokeniser, max_len)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=6).to(device)

optimiser = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# The scheduler is stepped once per batch, so its horizon is counted in batches
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimiser, num_warmup_steps=0, num_training_steps=num_training_steps)

checkpoint_path = 'deberta_checkpoint.pth'
start_epoch = 0
start_step = 0

# Check if checkpoint exists
if os.path.exists(checkpoint_path):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be resumed on a CPU-only host.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimiser.load_state_dict(checkpoint['optimiser_state_dict'])
    start_epoch = checkpoint['epoch']
    start_step = checkpoint['step']
    # Without restoring the scheduler, the learning-rate decay would restart
    # from its initial value on every resume.
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    else:
        # Checkpoint carries no scheduler state: replay one scheduler step per
        # optimiser step already taken (the scheduler advances once per batch).
        # PyTorch may emit a one-off warning about the scheduler being stepped
        # before the optimiser; that is expected here.
        for _ in range(start_epoch * len(train_loader) + start_step):
            scheduler.step()
    print(f"Resuming training from epoch {start_epoch}, step {start_step}")
else:
    print("No checkpoint found, starting training from scratch.")

# Function to evaluate the model
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` with per-label and mean ROC-AUC.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Object with ``eval()`` that is called as ``model(**batch)`` and
            whose output exposes ``.logits``.
        dataloader: Iterable of dict batches of tensors; each batch must
            contain a ``labels`` entry.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Dict mapping ``"Label i"`` to the ROC-AUC of label column ``i``, plus
        an ``"Average"`` entry. A label whose ROC-AUC cannot be computed is
        reported as NaN and excluded from the average.
    """
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions.append(logits.detach().cpu().numpy())
            true_labels.append(batch['labels'].detach().cpu().numpy())
    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    # Compute ROC-AUC for each label. Raw logits are used as scores; ROC-AUC
    # depends only on their ranking.
    roc_auc_scores = {}
    for i in range(true_labels.shape[1]):
        try:
            score = roc_auc_score(true_labels[:, i], predictions[:, i])
        except ValueError as err:
            # Raised e.g. when a label column contains a single class. Record
            # NaN instead of aborting the run after a full epoch of training.
            print(f"ROC-AUC undefined for label {i}: {err}")
            score = float('nan')
        roc_auc_scores[f"Label {i}"] = score
    # nanmean keeps the average meaningful when some labels are undefined
    roc_auc_scores['Average'] = np.nanmean(list(roc_auc_scores.values()))
    return roc_auc_scores

evaluation_results = []  # To store evaluation results for each epoch

for epoch in range(start_epoch, num_epochs):
    model.train()
    # Steps are numbered from 1, so a checkpoint's 'step' is the count of
    # batches already trained in that epoch.
    for step, batch in enumerate(train_loader, start=1):
        # On resume, skip every step that was completed before the checkpoint
        # was written. The comparison is inclusive: the checkpoint is saved
        # after step `start_step` has been applied, so that step must not be
        # trained a second time.
        # NOTE(review): train_loader shuffles, so the skipped batches are not
        # the same samples that were seen before the interruption.
        if step <= start_step:
            continue
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimiser.step()
        scheduler.step()
        optimiser.zero_grad()

        if step % 100 == 0:
            # Save checkpoint. The scheduler state is stored alongside the
            # model and optimiser so the learning-rate schedule can be
            # restored on resume instead of restarting.
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimiser_state_dict': optimiser.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'epoch': epoch,
                'step': step
            }
            torch.save(checkpoint, checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch}, step {step}")

    start_step = 0  # Reset start step for the next epochs
    # Evaluation after each epoch
    roc_auc_scores = evaluate(model, test_loader, device)
    evaluation_results.append(roc_auc_scores)
    print(f"Epoch {epoch} Evaluation Results: {roc_auc_scores}")

# Save evaluation results to file (one row per epoch evaluated in this run)
evaluation_results_df = pd.DataFrame(evaluation_results)
evaluation_results_df.to_csv('evaluation_results.csv', index=False)
print("Evaluation results saved to file.")
