In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.metrics import roc_auc_score

# Read the train and test splits from CSV
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# Compute device: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer belonging to the pretrained DeBERTa base checkpoint
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# Dataset class
class ToxicCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        dataframe: Table with a ``comment_text`` column. Every column from
            the third one onward is read as a numeric label column; a frame
            with only two columns therefore yields zero-length label vectors.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        # Missing comments load from CSV as NaN (a float), which the tokenizer
        # cannot encode; treat them as empty strings instead.
        self.texts = dataframe['comment_text'].fillna('').astype(str).tolist()
        # Assuming labels start from the 3rd column. Cast once to float32 so
        # non-numeric columns fail here rather than in the middle of training.
        self.labels = dataframe.iloc[:, 2:].to_numpy(dtype='float32')
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors taken
            from the tokenizer output, and a float32 ``labels`` tensor holding
            the row's label columns (empty when the frame has none).
        """
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

# Sequence length and batch size used by both splits
max_len = 256  # You can adjust this
batch_size = 8  # Adjust based on your GPU memory

# Wrap each dataframe in the tokenizing dataset
train_dataset = ToxicCommentsDataset(dataframe=train_df, tokenizer=tokenizer, max_len=max_len)
test_dataset = ToxicCommentsDataset(dataframe=test_df, tokenizer=tokenizer, max_len=max_len)

# Batch the datasets: training batches are shuffled, test batches keep file order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# ToxicCommentsDataset supplies the targets as a float vector with one entry
# per label column, so declare the task as multi-label. Transformers then
# computes BCEWithLogitsLoss over the six logits instead of relying on the
# model's default loss selection.
model = DebertaForSequenceClassification.from_pretrained(
    'microsoft/deberta-base',
    num_labels=6,
    problem_type='multi_label_classification',
).to(device)

In [None]:
# Optimizer and a linearly decaying learning-rate schedule (no warmup)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3  # Adjust as necessary
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

checkpoint_path = 'deberta_checkpoint.pth'
start_epoch = 0
start_step = 0  # last step already completed in `start_epoch` (0 = none)

# Check if checkpoint exists
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint written on a GPU be restored on whatever
    # device this session is using.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Checkpoints written before the scheduler state was stored lack this key;
    # for those the schedule restarts from its initial learning rate.
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch']
    start_step = checkpoint['step']
    print(f"Resuming training from epoch {start_epoch}, step {start_step}")
else:
    print("No checkpoint found, starting training from scratch.")

for epoch in range(start_epoch, num_epochs):
    model.train()
    for step, batch in enumerate(train_loader, start=1):
        # The checkpoint is written after `step` has been applied, so resume
        # with the step that follows it.
        # NOTE(review): train_loader shuffles, so the batches skipped here are
        # not necessarily the ones consumed before the interruption.
        if step <= start_step:
            continue
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            # Save checkpoint
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'epoch': epoch,
                'step': step
            }
            torch.save(checkpoint, checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch}, step {step}")

    # Only the resumed epoch skips steps; later epochs start from step 1.
    start_step = 0