In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt


In [4]:
# Load data
combined_df = pd.read_csv('dataset/CS-Abstract-Combined-Preprocessed-Dataset.csv')
combined_df = combined_df[['text', 'label']]

# Split data
train_size = int(0.8 * len(combined_df))
val_size = int(0.1 * len(combined_df))
test_size = len(combined_df) - train_size - val_size

train_df, val_df, test_df = np.split(combined_df.sample(frac=1), [train_size, train_size + val_size])


In [5]:
# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize
def tokenize_data(df):
    inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt", max_length=512)
    labels = torch.tensor(df['label'].tolist())
    return TensorDataset(inputs.input_ids, inputs.attention_mask, labels)

train_dataset = tokenize_data(train_df)
val_dataset = tokenize_data(val_df)
test_dataset = tokenize_data(test_df)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [6]:
# Initialize model
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
learning_rate = 5e-4
weight_decay = 1e-3
epochs = 5

# Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)




In [7]:
# Initialize lists to store metrics
train_loss_list, val_loss_list = [], []
train_acc_list, val_acc_list = [], []

# Cross-entropy loss
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Training
    model.train()
    train_loss, train_corrects = 0, 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to("cuda" if torch.cuda.is_available() else "cpu") for x in batch]
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        
        # Loss calculation
        loss = criterion(outputs.logits.view(-1, 2), labels.view(-1))
        
        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()
        train_corrects += (torch.argmax(outputs.logits.view(-1, 2), dim=1) == labels.view(-1)).sum().item()
        
    train_acc = train_corrects / len(train_loader.dataset)
    train_loss_list.append(train_loss / len(train_loader))
    train_acc_list.append(train_acc)
    
    # Validation
    model.eval()
    val_loss, val_corrects = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to("cuda" if torch.cuda.is_available() else "cpu") for x in batch]
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            
            loss = criterion(outputs.logits.view(-1, 2), labels.view(-1))
            
            val_loss += loss.item()
            val_corrects += (torch.argmax(outputs.logits.view(-1, 2), dim=1) == labels.view(-1)).sum().item()
    
    val_acc = val_corrects / len(val_loader.dataset)
    val_loss_list.append(val_loss / len(val_loader))
    val_acc_list.append(val_acc)
    
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss / len(train_loader)}, Train Acc: {train_acc}, Val Loss: {val_loss / len(val_loader)}, Val Acc: {val_acc}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB (GPU 0; 21.96 GiB total capacity; 9.37 GiB already allocated; 319.81 MiB free; 9.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Plotting loss and accuracy
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(train_loss_list, label='Train Loss')
plt.plot(val_loss_list, label='Val Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss per Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_acc_list, label='Train Acc')
plt.plot(val_acc_list, label='Val Acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy per Epoch')
plt.legend()

plt.show()


In [None]:
# Test set evaluation
model.eval()
y_true, y_pred, y_prob = [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to("cuda" if torch.cuda.is_available() else "cpu") for x in batch]
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        
        preds = torch.argmax(outputs.logits.view(-1, 2), dim=1)
        
        y_true.extend(labels.view(-1).cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(torch.softmax(outputs.logits.view(-1, 2), dim=1)[:, 1].cpu().numpy())

# Calculate Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auroc = roc_auc_score(y_true, y_prob)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, AUROC: {auroc}")

# ROC Curve
fpr, tpr, _ = roc_curve(y_true, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {auroc})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Calculate false positive rate and false negative rate
fpr = fp / (fp + tn)
fnr = fn / (fn + tp)

# Plot DET curve
plt.figure()
plt.plot(fpr, fnr, label='DET curve')
plt.xlabel('False Positive Rate')
plt.ylabel('False Negative Rate')
plt.title('Detection Error Tradeoff')
plt.legend(loc='upper right')
plt.show()
