In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch


In [6]:
from pathlib import Path
# Directory holding the input CSV files, resolved against the current working directory
DS_PATH = Path.cwd().joinpath('data')

In [7]:

# Read the labelled reports from the data directory
df = pd.read_csv(DS_PATH.joinpath('synth-itr1-n166.csv'))

# Show how many rows carry each label before any balancing
print(df.loc[:, 'label'].value_counts())


label
Negative      122
Positive       34
Not Stated     10
Name: count, dtype: int64


In [8]:
# Balance the dataset
def balance_dataset(df, random_state=42, label_col='label'):
    """Downsample every class to the size of the rarest class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the class column named by ``label_col``.
    random_state : int, numpy random generator/state, or None, default 42
        Seed forwarded to the per-class sampling so repeated runs pick the
        same rows. Pass ``None`` to get a different draw on every call.
    label_col : str, default 'label'
        Name of the column holding the class of each row.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns, the same number of rows for
        every class, rows ordered by class, and a fresh 0..n-1 index.
        An empty input yields an empty frame.
    """
    # An empty frame has no classes; value_counts().min() would be NaN and
    # the sampling call below would fail, so return early.
    if df.empty:
        return df.reset_index(drop=True)

    min_class_count = int(df[label_col].value_counts().min())

    # GroupBy.sample draws rows per class directly and keeps the label column,
    # unlike groupby(...).apply(...), whose handling of grouping columns is
    # deprecated in recent pandas releases.
    balanced_df = (
        df.groupby(label_col)
        .sample(n=min_class_count, random_state=random_state)
        .reset_index(drop=True)
    )
    return balanced_df

balanced_df = balance_dataset(df)
print(balanced_df['label'].value_counts())

# Bar chart of the per-class counts after balancing, written to disk
fig, ax = plt.subplots(figsize=(10, 5))
balanced_df['label'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Balanced Class Distribution')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
fig.savefig('balanced_class_distribution.png')
plt.close(fig)

# Map each label string to the integer class id used by the model
label_encoding = {'Positive': 0, 'Negative': 1, 'Not Stated': 2}
balanced_df['label_encoded'] = balanced_df['label'].map(label_encoding)

# 70% train, then the remaining 30% is halved into validation and test;
# both splits are stratified on the label and use a fixed seed
train_df, temp_df = train_test_split(
    balanced_df,
    test_size=0.3,
    stratify=balanced_df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# Custom dataset
class ReportDataset(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    Each item is built from the row's ``report_text`` and ``label_encoded``
    columns. The tokenizer is called with truncation and ``max_length``
    padding, and its ``input_ids`` / ``attention_mask`` outputs are flattened.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the source frame, the tokenizer and the token length."""
        self.data, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the model inputs and label tensor for the row at position ``idx``."""
        row = self.data.iloc[idx]
        text, label = row['report_text'], row['label_encoded']

        tokens = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each item is a 1-D tensor
        item = {name: tokens[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Seed torch's global RNG before anything random is created: the classification
# head of the model is newly initialised (not loaded from the checkpoint) and the
# training DataLoader shuffles, so without a seed the losses differ on every run.
torch.manual_seed(42)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' can be applied by the tokenizer.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
# Keep the model's padding id in sync with the tokenizer's choice above.
model.config.pad_token_id = model.config.eos_token_id

# Create datasets and dataloaders
max_length = 128  # token length every report is padded/truncated to
batch_size = 16

train_dataset = ReportDataset(train_df, tokenizer, max_length)
val_dataset = ReportDataset(val_df, tokenizer, max_length)
test_dataset = ReportDataset(test_df, tokenizer, max_length)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training function
def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    With an ``optimizer`` the model is switched to training mode and updated
    after every batch. Without one the model is switched to eval mode and
    gradients are disabled. Returns NaN when the loader yields no batches.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()

    total_loss = 0.0
    num_batches = 0

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if is_training:
                optimizer.zero_grad()

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if is_training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            num_batches += 1

    # Avoid ZeroDivisionError on an empty loader (e.g. a very small split).
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches


def train(model, train_loader, val_loader, optimizer, num_epochs, device):
    """Fine-tune ``model`` and track the loss on both loaders per epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=..., labels=...)``; the
        returned object must expose a ``loss`` attribute.
    train_loader, val_loader : iterable of dict
        Each batch provides ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'`` tensors.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters, stepped once per batch.
    num_epochs : int
        Number of full passes over ``train_loader``.
    device : torch.device
        Device the model and every batch are moved to.

    Returns
    -------
    (list of float, list of float)
        Mean per-batch training and validation loss for each epoch. An entry
        is NaN when the corresponding loader produced no batches.
    """
    model.to(device)

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        avg_train_loss = _run_epoch(model, train_loader, device, optimizer)
        train_losses.append(avg_train_loss)

        # No optimizer: evaluation pass only, the weights are not changed.
        avg_val_loss = _run_epoch(model, val_loader, device)
        val_losses.append(avg_val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    return train_losses, val_losses

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# deprecated class's defaults (1e-6 and 0.0) so the hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

# Returns one mean training loss and one mean validation loss per epoch.
train_losses, val_losses = train(model, train_loader, val_loader, optimizer, num_epochs, device)

# Draw both per-epoch loss curves on one axis and save the figure to disk
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
fig.savefig('loss_plot.png')
plt.close(fig)

# Score the fine-tuned model on the held-out test split
model.eval()
test_loss, correct, total = 0, 0, 0

with torch.no_grad():
    for batch in test_loader:
        # Move the three tensors of the batch onto the training device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        test_loss += loss.item()
        # Predicted class = index of the largest logit in each row
        predicted = logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Loss is averaged over batches; accuracy is a percentage over all samples
avg_test_loss = test_loss / len(test_loader)
accuracy = 100 * correct / total

print(f'Test Loss: {avg_test_loss:.4f}, Accuracy: {accuracy:.2f}%')

label
Negative      10
Not Stated    10
Positive      10
Name: count, dtype: int64


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Train Loss: 5.9279, Val Loss: 4.3045
Epoch 2/3, Train Loss: 2.5090, Val Loss: 1.5259
Epoch 3/3, Train Loss: 0.8627, Val Loss: 0.6143
Test Loss: 0.6424, Accuracy: 60.00%
