In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory,
# then print one path per line.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# =========================================================================================
# === 1. SETUP AND CONFIGURATION ========================================================
# =========================================================================================

import os
import pandas as pd
import numpy as np
import torch
import random

# Scikit-learn for metrics and data splitting
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    roc_auc_score, 
    log_loss
)

# Transformers and Datasets libraries from Hugging Face
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, load_metric

# Set a seed for reproducibility
def set_seed(seed_value=42):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        seed_value: Integer seed handed to every generator (defaults to 42).
    """
    # Same order as always: stdlib random, NumPy's global RNG, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return

    # Seed the generator of every visible CUDA device.
    torch.cuda.manual_seed_all(seed_value)

# Configuration class to hold all hyperparameters
class CFG:
    """Single namespace holding every path and hyperparameter used by the notebook."""

    # --- Reproducibility ---------------------------------------------------------------
    seed = 42

    # --- Kaggle filesystem locations ---------------------------------------------------
    data_path = "/kaggle/input/h2oai-predict-the-llm"
    output_path = "/kaggle/working/"

    # --- Pretrained checkpoint to fine-tune --------------------------------------------
    model_name = "microsoft/deberta-v3-base"

    # --- Cross-validation and optimisation settings ------------------------------------
    n_splits = 5                # number of stratified folds
    epochs = 3                  # training epochs per fold
    max_length = 512            # token budget for a tokenized Question/Response pair
    train_batch_size = 8        # per-device batch size while training
    eval_batch_size = 16        # per-device batch size while evaluating / predicting
    learning_rate = 2e-5
    weight_decay = 0.01

    # --- Task definition ---------------------------------------------------------------
    num_classes = 7
    # One submission column per class: target_0 ... target_6
    target_cols = list(map("target_{}".format, range(num_classes)))

# Apply the seed
set_seed(CFG.seed)

# =========================================================================================
# === 2. LOAD AND PREPARE DATA ==========================================================
# =========================================================================================

print("Loading data...")
# Read the three competition CSVs (train, test, sample submission) from CFG.data_path
train_df, test_df, submission_df = (
    pd.read_csv(f"{CFG.data_path}/{csv_stem}.csv")
    for csv_stem in ("train", "test", "sample_submission")
)

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# The Trainer API expects the target column to be called 'label', so rename it.
# Reassigning (rather than inplace=True) keeps this cell safe to re-run.
train_df = train_df.rename(columns={'target': 'label'})

# Wrap both frames as Hugging Face Dataset objects
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))

# =========================================================================================
# === 3. TOKENIZATION ===================================================================
# =========================================================================================

print("Initializing tokenizer...")
# Load the tokenizer published under CFG.model_name, so the token ids produced here
# come from the same checkpoint name the model is loaded from later in the notebook.
# DeBERTa-v3 uses a SentencePiece-based tokenizer
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

def tokenize_function(examples):
    """
    Tokenizes the input text by treating Question and Response as a pair.
    The tokenizer will automatically format this as: [CLS] Question [SEP] Response [SEP]

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; must provide the
            'Question' and 'Response' columns.

    Returns:
        The tokenizer output for the batch, truncated to ``CFG.max_length`` tokens
        and deliberately left unpadded.
    """
    return tokenizer(
        examples['Question'],
        examples['Response'],
        truncation=True,
        max_length=CFG.max_length,
        # No padding here: padding every example to max_length at map time would make
        # each batch 512 tokens wide and turn the DataCollatorWithPadding passed to the
        # Trainer into a no-op. Leaving sequences unpadded lets that collator pad each
        # batch only to its own longest sequence, which saves compute and memory.
    )

print("Tokenizing datasets...")
# Run tokenize_function over both splits (train first, then test).
# batched=True hands the tokenizer whole batches of rows at once for faster processing.
tokenized_train_ds, tokenized_test_ds = (
    split_ds.map(tokenize_function, batched=True)
    for split_ds in (train_dataset, test_dataset)
)

# Collator handed to the Trainer: it pads the examples of a batch to a common length
# (the longest sequence in that batch) when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# =========================================================================================
# === 4. METRICS COMPUTATION ============================================================
# =========================================================================================

def compute_metrics(eval_pred):
    """
    Computes and returns a dictionary of evaluation metrics.
    This function is passed to the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is a 2-D array of raw
            (pre-softmax) scores with one column per class; ``labels`` holds the
            integer class id of every row.

    Returns:
        dict with the keys 'accuracy', 'log_loss', 'f1_macro', 'f1_weighted',
        'roc_auc_ovr' and 'roc_auc_ovr_weighted'. The two ROC AUC values are NaN
        when scikit-learn cannot compute them for the given evaluation set.
    """
    # Unpack predictions and true labels
    logits, labels = eval_pred

    # Work in float64: mixed-precision training can hand back half-precision logits,
    # and a low-precision softmax may not sum to 1 closely enough for scikit-learn.
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels).astype(np.int64)

    # Column i of the logits corresponds to class id i. Passing the full class list
    # explicitly keeps the metrics well defined even if a class is absent from `labels`.
    class_ids = np.arange(logits.shape[-1])

    # Get probabilities by applying softmax to logits
    probabilities = torch.nn.functional.softmax(torch.from_numpy(logits), dim=-1).numpy()

    # Get predicted class by finding the index of the max logit
    predictions = np.argmax(logits, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    # For multi-class ROC AUC, we need probabilities and 'ovr' (one-vs-rest) strategy
    try:
        roc_auc_ovr = roc_auc_score(
            labels, probabilities, multi_class='ovr', labels=class_ids
        )
        roc_auc_ovr_weighted = roc_auc_score(
            labels, probabilities, multi_class='ovr', average='weighted', labels=class_ids
        )
    except ValueError as err:
        # ROC AUC is undefined when a class has no positive sample in this split.
        # It is only a reporting metric here, so report NaN instead of aborting training.
        print(f"ROC AUC could not be computed: {err}")
        roc_auc_ovr = roc_auc_ovr_weighted = float('nan')

    # Log loss requires probabilities
    loss = log_loss(labels, probabilities, labels=class_ids)

    return {
        'accuracy': accuracy,
        'log_loss': loss,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'roc_auc_ovr': roc_auc_ovr,
        'roc_auc_ovr_weighted': roc_auc_ovr_weighted,
    }

# =========================================================================================
# === 5. CROSS-VALIDATION TRAINING AND INFERENCE ========================================
# =========================================================================================

# Stratified K-fold cross-validation. For every fold a fresh model is fine-tuned on the
# training split; its softmax probabilities for the held-out split are stored as
# out-of-fold (OOF) predictions, and its test-set probabilities are added to a running
# average over folds.
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

# Arrays to store out-of-fold (OOF) and test predictions.
# oof_predictions / test_predictions: one row per example, one column per class.
oof_predictions = np.zeros((len(train_df), CFG.num_classes))
test_predictions = np.zeros((len(test_df), CFG.num_classes))
# NOTE(review): np.zeros defaults to float64, so the class ids copied in below are
# stored as floats — confirm downstream consumers are fine with float labels.
oof_labels = np.zeros(len(train_df))

# Start the cross-validation loop; folds are stratified on the 'label' column
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print("-" * 50)
    print(f"=============== FOLD {fold + 1}/{CFG.n_splits} ===============")
    print("-" * 50)
    
    # --- Create datasets for the current fold ---
    # train_idx / val_idx are positional row indices, used both on the tokenized
    # dataset (select) and on train_df (iloc), so both are assumed to share row order.
    train_fold_ds = tokenized_train_ds.select(train_idx)
    val_fold_ds = tokenized_train_ds.select(val_idx)
    
    # Store true labels for the validation set for final OOF evaluation
    oof_labels[val_idx] = train_df['label'].iloc[val_idx].values
    
    # --- Initialize Model ---
    # Reloaded from CFG.model_name inside the loop, so every fold starts from the
    # pretrained checkpoint rather than from the previous fold's weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_name, 
        num_labels=CFG.num_classes
    )
    
    # --- Define Training Arguments ---
    training_args = TrainingArguments(
        # Each fold writes its checkpoints to its own sub-directory
        output_dir=f"{CFG.output_path}/fold_{fold}",
        # Training Strategy
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        fp16=True,  # Use mixed precision for speed and memory efficiency
        
        # Evaluation and Checkpointing
        # Evaluate and save on the same (per-epoch) schedule
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,  # Load the best model based on the metric
        metric_for_best_model="log_loss", # The 'log_loss' key returned by compute_metrics
        greater_is_better=False,       # Lower log_loss is better
        
        # Logging and Reporting
        logging_strategy="epoch",
        report_to="none", # Disable reporting to external services like wandb
        # NOTE(review): presumably the best checkpoint is retained in addition to the
        # most recent one when load_best_model_at_end is set — confirm the disk budget
        # across all folds.
        save_total_limit=1,
    )
    
    # --- Initialize Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_fold_ds,
        eval_dataset=val_fold_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # --- Train the model ---
    print("Starting training for fold...", fold+1)
    trainer.train()
    
    # --- Generate Predictions ---
    print("Generating OOF predictions for fold...", fold+1)
    # Get predictions on the validation set for this fold.
    # .predictions is passed through softmax, so it is treated as raw logits and the
    # OOF array ends up holding probabilities.
    val_preds = trainer.predict(val_fold_ds)
    oof_predictions[val_idx] = torch.nn.functional.softmax(torch.from_numpy(val_preds.predictions), dim=-1).numpy()
    
    print("Generating Test predictions for fold...", fold+1)
    # Get predictions on the test set
    test_preds = trainer.predict(tokenized_test_ds)
    # Each fold contributes 1/n_splits of its probabilities, so after the last fold
    # test_predictions is the mean over folds
    test_predictions += torch.nn.functional.softmax(torch.from_numpy(test_preds.predictions), dim=-1).numpy() / CFG.n_splits
    
    # --- Clean up resources for the next fold ---
    # Drop the references to this fold's model and trainer, then ask PyTorch to
    # release its cached CUDA memory before the next model is loaded
    del model, trainer
    torch.cuda.empty_cache()

# =========================================================================================
# === 6. FINAL EVALUATION AND SUBMISSION ================================================
# =========================================================================================

print("\n" + "=" * 50)
print("====== CROSS-VALIDATION FINISHED ======")
print("=" * 50)

# Calculate final OOF metrics on the entire training set
print("Calculating final OOF metrics...")
# compute_metrics() applies softmax to its first argument, but oof_predictions already
# holds probabilities. Passing them directly would apply softmax twice and flatten every
# distribution, giving a wrong log-loss and AUC. Feeding log-probabilities instead makes
# that softmax the identity: softmax(log p) = p / sum(p) = p.
# Clipping to the smallest positive float avoids log(0) = -inf.
oof_log_probs = np.log(
    np.clip(oof_predictions, np.finfo(oof_predictions.dtype).tiny, None)
)
# oof_labels was allocated with np.zeros (float64); metrics expect integer class ids.
final_oof_metrics = compute_metrics((oof_log_probs, oof_labels.astype(int)))

print("\nFinal OOF Metrics:")
for metric, value in final_oof_metrics.items():
    print(f"- {metric}: {value:.5f}")

# --- Create submission file ---
print("\nCreating submission file...")
# One probability column per class (CFG.target_cols), averaged over all folds
submission_df[CFG.target_cols] = test_predictions
submission_df.to_csv("submission.csv", index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print(submission_df.head())