In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/h2oai-predict-the-llm/sample_submission.csv
/kaggle/input/h2oai-predict-the-llm/train.csv
/kaggle/input/h2oai-predict-the-llm/test.csv


In [3]:
# -*- coding: utf-8 -*-
"""
H2O.ai - LLM Science Exam: Predict the LLM
-------------------------------------------

This script provides an end-to-end solution for the Kaggle competition "H2O.ai - LLM Science Exam".
The goal is to identify which of 7 Large Language Models (LLMs) generated a given response
to a question.

Approach:
1.  **Model Selection**: We use a pre-trained `microsoft/deberta-v3-base` model, a powerful
    transformer architecture well-suited for understanding nuanced text. We add a classification
    head on top to predict one of the 7 LLM classes.

2.  **Input Formatting**: The `Question` and `Response` texts are concatenated into a single
    input sequence for the model, separated by a special `[SEP]` token. This allows the model
    to learn the relationship between the prompt and the generated answer.
    Example: `[CLS] What is the capital of France? [SEP] The capital of France is Paris. [SEP]`

3.  **Frameworks**: The solution is built using PyTorch for model training and inference, and the
    Hugging Face `transformers` library for easy access to the pre-trained model and tokenizer.

4.  **Training**: The script trains the model on the full training dataset for a few epochs.
    This is a common and effective strategy for competitions, especially when building a baseline.
    The evaluation metric is Log Loss, so the model is trained to output class probabilities using
    a standard Cross-Entropy Loss function.

5.  **Inference & Submission**: After training, the script runs inference on the test set,
    calculates the probabilities for each of the 7 classes using a Softmax function, and
    formats the results into the required `submission.csv` file.

This script is self-contained and ready to run in a Kaggle environment with GPU acceleration.
"""

# =====================================================================================
# 1. SETUP AND IMPORTS
# =====================================================================================
import os
import gc
import warnings
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW # <<< CORRECTED IMPORT
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

# Suppress warnings
warnings.filterwarnings("ignore")

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
class CFG:
    """
    Configuration class for all hyperparameters and settings.
    """
    # General
    seed = 42
    num_workers = os.cpu_count() or 2
    
    # Paths
    data_dir = "/kaggle/input/h2oai-predict-the-llm/"
    
    # Model: DeBERTa-v3 is a strong baseline for text classification tasks.
    model_name = "microsoft/deberta-v3-base"
    
    # Training Parameters
    epochs = 3
    train_batch_size = 8
    eval_batch_size = 16
    lr = 2e-5
    max_length = 512  # Max sequence length for the transformer
    num_labels = 7    # Corresponds to the 7 different LLM models
    
    # Hardware
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================================================================================
# 3. UTILITY FUNCTIONS
# =====================================================================================
def set_seed(seed_value):
    """
    Sets the seed for reproducibility in numpy and torch.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Seed set to {seed_value}")

# =====================================================================================
# 4. DATASET PREPARATION
# =====================================================================================
class LLMDataset(Dataset):
    """
    Custom PyTorch Dataset for loading and tokenizing the data.
    """
    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        # Combine 'Question' and 'Response' into a single text entry
        self.texts = (df['Question'] + self.tokenizer.sep_token + df['Response']).tolist()
        if not self.is_test:
            self.labels = df['target'].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Tokenize the combined text
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # Squeeze to remove the batch dimension from tokenization
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)
        
        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
                'token_type_ids': token_type_ids
            }
        else:
            return {
                'ids': ids,
                'mask': mask,
                'token_type_ids': token_type_ids,
                'labels': torch.tensor(self.labels[idx], dtype=torch.long)
            }

# =====================================================================================
# 5. TRAINING AND INFERENCE LOGIC
# =====================================================================================
def train_one_epoch(model, train_loader, optimizer, scheduler, device):
    """
    Performs one full epoch of training.
    """
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        # Move batch to device
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")
        
    avg_loss = total_loss / len(train_loader)
    return avg_loss

def predict_probabilities(model, test_loader, device):
    """
    Runs inference on the test set and returns class probabilities.
    """
    model.eval()
    all_preds = []
    
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Predicting", leave=False)
        for batch in progress_bar:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids
            )
            
            # Get logits and apply softmax to convert to probabilities
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            all_preds.append(probs)
            
    return np.vstack(all_preds)

# =====================================================================================
# 6. MAIN EXECUTION SCRIPT
# =====================================================================================
def main():
    print("--- Starting LLM Identification Program ---")
    
    # Set seed for deterministic results
    set_seed(CFG.seed)
    
    print(f"Using device: {CFG.device}")
    
    # Load data
    print("\nLoading data...")
    train_df = pd.read_csv(os.path.join(CFG.data_dir, "train.csv"))
    test_df = pd.read_csv(os.path.join(CFG.data_dir, "test.csv"))
    submission_df = pd.read_csv(os.path.join(CFG.data_dir, "sample_submission.csv"))
    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    
    # Initialize tokenizer
    print(f"\nInitializing tokenizer: {CFG.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
    
    # Create datasets and dataloaders
    print("\nCreating datasets and dataloaders...")
    train_dataset = LLMDataset(train_df, tokenizer, CFG.max_length, is_test=False)
    test_dataset = LLMDataset(test_df, tokenizer, CFG.max_length, is_test=True)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.train_batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.eval_batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    # Initialize model
    print(f"\nInitializing model: {CFG.model_name}")
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_name, 
        num_labels=CFG.num_labels
    )
    model.to(CFG.device)
    
    # Setup optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=CFG.lr)
    num_training_steps = len(train_loader) * CFG.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0, # No warmup steps for this baseline
        num_training_steps=num_training_steps
    )
    
    # --- Training Loop ---
    # For a more robust solution, one would use K-Fold Cross-Validation.
    # For this script, we train on the full training data.
    print("\n--- Starting Training ---")
    for epoch in range(CFG.epochs):
        print(f"\n--- Epoch {epoch + 1}/{CFG.epochs} ---")
        avg_loss = train_one_epoch(model, train_loader, optimizer, scheduler, CFG.device)
        print(f"Epoch {epoch + 1} - Average Training Loss: {avg_loss:.4f}")
    
    print("\n--- Training Finished ---")
    
    # --- Inference ---
    print("\n--- Starting Inference on Test Data ---")
    predictions = predict_probabilities(model, test_loader, CFG.device)
    
    # --- Create Submission File ---
    print("\nGenerating submission file...")
    target_cols = [f'target_{i}' for i in range(CFG.num_labels)]
    submission_df[target_cols] = predictions
    
    submission_df.to_csv('submission.csv', index=False)
    print("\nSubmission file 'submission.csv' created successfully!")
    print(submission_df.head())
    
    # Clean up GPU memory
    del model, train_loader, test_loader, train_dataset, test_dataset
    gc.collect()
    torch.cuda.empty_cache()
    
    print("\n--- Program Finished ---")

if __name__ == "__main__":
    main()

--- Starting LLM Identification Program ---
Seed set to 42
Using device: cuda

Loading data...
Training data shape: (3976, 3)
Test data shape: (1001, 3)

Initializing tokenizer: microsoft/deberta-v3-base


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


Creating datasets and dataloaders...

Initializing model: microsoft/deberta-v3-base


2025-06-29 15:10:04.436314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751209804.652546      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751209804.711599      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Training ---

--- Epoch 1/3 ---


Training:   0%|          | 0/497 [00:00<?, ?it/s]

TypeError: Caught TypeError in DataLoader worker process 1.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_35/4187207570.py", line 126, in __getitem__
    inputs = self.tokenizer.encode_plus(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py", line 3073, in encode_plus
    return self._encode_plus(
           ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
    batched_output = self._batch_encode_plus(
                     ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
    encodings = self._tokenizer.encode_batch(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
