In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [None]:
train = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
train.head()

In [2]:
train = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

In [None]:
def create_input_text(row):
    return f"<s>Prompt: {row['prompt']}</s><s>Option A: {row['response_a']}</s><s>Option B: {row['response_b']}</s>"

# Add special tokens to clearly delineate sections
train["input_text"] = train.apply(create_input_text, axis=1)

In [None]:
test = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
# Combine prompt and responses into a single input text
test["input_text"] = test.apply(create_input_text, axis=1)

# Print sample test data
print(test.head())

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_input(row, tokenizer, max_length=512):
    """Prepare single input row for model"""
    input_text = f"<s>Prompt: {row['prompt']}</s><s>Option A: {row['response_a']}</s><s>Option B: {row['response_b']}</s>"
    
    encoding = tokenizer(
        input_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_dict = {
        'input_ids': encoding['input_ids'].squeeze(),
        'attention_mask': encoding['attention_mask'].squeeze(),
    }
    
    if 'winner_model_a' in row:
        input_dict['target'] = torch.tensor([
            row['winner_model_a'],
            row['winner_model_a'],
            row['winner_tie']
        ], dtype=torch.float)
    
    return input_dict

def create_dataloader(df, tokenizer, batch_size=16, shuffle=True):
    """Create dataloader from dataframe"""
    dataset = [prepare_input(row, tokenizer) for _, row in df.iterrows()]
    
    # Collate function to handle batching
    def collate_fn(batch):
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        
        if 'target' in batch[0]:
            targets = torch.stack([item['target'] for item in batch])
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'target': targets
            }
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
    
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

def augment_data(df):
    """Augment training data by swapping responses"""
    augmented_rows = []
    
    for _, row in df.iterrows():
        # Original row
        augmented_rows.append({
            'prompt': row['prompt'],
            'response_a': row['response_a'],
            'response_b': row['response_b'],
            'winner_model_a': row['winner_model_a'],
            'winner_model_b': row['winner_model_b'],
            'winner_tie': row['winner_tie']
        })
        
        # Swapped version
        augmented_rows.append({
            'prompt': row['prompt'],
            'response_a': row['response_b'],
            'response_b': row['response_a'],
            'winner_model_a': row['winner_model_b'],
            'winner_model_b': row['winner_model_a'],
            'winner_tie': row['winner_tie']
        })
    
    return pd.DataFrame(augmented_rows)

def create_model(model_name='roberta-base'):
    """Create model with custom head"""
    roberta = RobertaModel.from_pretrained(model_name)
    
    # Add custom layers
    attention_pooling = nn.Sequential(
        nn.Linear(768, 1),
        nn.Softmax(dim=1)
    )
    
    classifier = nn.Sequential(
        nn.Linear(768, 512),
        nn.LayerNorm(512),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(512, 128),
        nn.LayerNorm(128),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(128, 3)
    )
    
    return roberta, attention_pooling, classifier

def forward_pass(batch, roberta, attention_pooling, classifier, device):
    """Forward pass through the model"""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    outputs = roberta(input_ids, attention_mask=attention_mask)
    hidden_states = outputs.last_hidden_state
    
    # Attention pooling
    attention_weights = attention_pooling(hidden_states)
    pooled = torch.sum(attention_weights * hidden_states, dim=1)
    
    # Classification
    logits = classifier(pooled)
    return torch.softmax(logits, dim=1)

def compute_loss(predictions, targets):
    """Compute loss with label smoothing"""
    targets = targets * 0.9 + 0.033  # Label smoothing
    loss = -torch.sum(targets * torch.log(predictions + 1e-7), dim=1)
    return loss.mean()

def train_epoch(train_loader, roberta, attention_pooling, classifier, optimizer, scheduler, device):
    """Train for one epoch"""
    roberta.train()
    attention_pooling.train()
    classifier.train()
    
    total_loss = 0
    progress_bar = tqdm(train_loader, desc='Training')
    
    for batch in progress_bar:
        outputs = forward_pass(batch, roberta, attention_pooling, classifier, device)
        targets = batch['target'].to(device)
        
        loss = compute_loss(outputs, targets)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(roberta.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{total_loss/len(train_loader):.3f}'})
    
    return total_loss / len(train_loader)

def evaluate(val_loader, roberta, attention_pooling, classifier, device):
    """Evaluate model"""
    roberta.eval()
    attention_pooling.eval()
    classifier.eval()
    
    predictions = []
    targets = []
    
    with torch.no_grad():
        for batch in val_loader:
            outputs = forward_pass(batch, roberta, attention_pooling, classifier, device)
            predictions.extend(outputs.cpu().numpy())
            targets.extend(batch['target'].numpy())
    
    predictions = np.array(predictions)
    targets = np.array(targets)
    
    pred_labels = np.argmax(predictions, axis=1)
    target_labels = np.argmax(targets, axis=1)
    
    return {
        'accuracy': accuracy_score(target_labels, pred_labels),
        'f1': f1_score(target_labels, pred_labels, average='weighted')
    }

def predict(test_loader, roberta, attention_pooling, classifier, device):
    """Generate predictions for test data"""
    roberta.eval()
    attention_pooling.eval()
    classifier.eval()
    
    predictions = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting'):
            outputs = forward_pass(batch, roberta, attention_pooling, classifier, device)
            predictions.extend(outputs.cpu().numpy())
    
    return np.array(predictions)

def train_model(train_loader, val_loader, device, num_epochs=3):
    """Main training function"""
    roberta, attention_pooling, classifier = create_model()
    roberta = roberta.to(device)
    attention_pooling = attention_pooling.to(device)
    classifier = classifier.to(device)
    
    # Optimizer
    optimizer = torch.optim.AdamW([
        {'params': roberta.parameters(), 'lr': 2e-5},
        {'params': attention_pooling.parameters(), 'lr': 1e-4},
        {'params': classifier.parameters(), 'lr': 1e-4}
    ])
    
    # Scheduler
    num_training_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_training_steps * 0.1,
        num_training_steps=num_training_steps
    )
    
    best_f1 = 0
    best_state = None
    
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        
        train_loss = train_epoch(
            train_loader, roberta, attention_pooling, classifier,
            optimizer, scheduler, device
        )
        
        metrics = evaluate(val_loader, roberta, attention_pooling, classifier, device)
        print(f'Validation Accuracy: {metrics["accuracy"]:.3f}')
        print(f'Validation F1: {metrics["f1"]:.3f}')
        
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            best_state = {
                'roberta': roberta.state_dict(),
                'attention_pooling': attention_pooling.state_dict(),
                'classifier': classifier.state_dict()
            }
    
    return best_state

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


# Augment training data
train_df = augment_data(train)

# Create dataloaders
train_loader = create_dataloader(train_df, tokenizer, batch_size=16, shuffle=True)
test_loader = create_dataloader(test, tokenizer, batch_size=32, shuffle=False)

# Train model
best_state = train_model(train_loader, train_loader, device)  # Using train as validation for example

# Load best model
roberta, attention_pooling, classifier = create_model()
roberta.load_state_dict(best_state['roberta'])
attention_pooling.load_state_dict(best_state['attention_pooling'])
classifier.load_state_dict(best_state['classifier'])
    
roberta = roberta.to(device)
attention_pooling = attention_pooling.to(device)
classifier = classifier.to(device)

# Generate predictions
predictions = predict(test_loader, roberta, attention_pooling, classifier, device)
pred_labels = np.argmax(predictions, axis=1)

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'prediction': ['a' if p == 0 else 'b' if p == 1 else 'tie' for p in pred_labels]
})

submission.to_csv('submission.csv', index=False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training:   6%|▌         | 403/7185 [5:05:43<88:08:28, 46.79s/it, loss=0.062]