In [None]:
# -------- Cell 2: Install + Imports --------

# Core imports
import os
import sys
import pandas as pd
import torch
from sklearn.model_selection import KFold
import warnings

# HuggingFace imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers

# Keep cell output readable: mute Python warnings and restrict the
# transformers logger to errors only.
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

# Report the runtime environment so results can be tied to library versions.
print(f"Torch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# -------- Cell 3: Load Spelling Corrections + Normalize Function --------

# Load spelling corrections
spelling_path = "/kaggle/input/more-data-map/spelling_corrections_v1.csv"
spelling_df = pd.read_csv(spelling_path)

# Lookup table: misspelled token -> corrected token (a later duplicate row
# overwrites an earlier one).
spelling_dict = {
    wrong: right
    for wrong, right in zip(spelling_df['misspelled'], spelling_df['correct'])
}

print(f"Loaded {len(spelling_dict)} spelling corrections")

def normalize_text(text, corrections=None):
    """Replace known misspellings in ``text``, token by token.

    The text is split on whitespace, each token is looked up verbatim
    (case-sensitive, punctuation included) in the corrections mapping, and
    the tokens are re-joined with single spaces. Runs of whitespace are
    therefore collapsed and leading/trailing whitespace is dropped.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()``; missing values (``None``, ``NaN``, ``pd.NA``) yield "".
        corrections: Optional mapping of misspelled token -> replacement.
            Defaults to the notebook-level ``spelling_dict`` so existing
            one-argument calls behave exactly as before.

    Returns:
        The normalized string ("" for missing input).
    """
    if pd.isna(text):
        return ""

    # Resolve the default lazily so the function can be used (and tested)
    # with an explicit mapping even when the global table is not loaded.
    if corrections is None:
        corrections = spelling_dict

    return " ".join(corrections.get(word, word) for word in str(text).split())

In [None]:
# -------- Cell 4: Load Data + Normalize + Build Text Column --------

# Load train_73k_normalized.csv from Kaggle input
train_path = "/kaggle/input/more-data-map/train_73k_normalized.csv"
train = pd.read_csv(train_path, low_memory=False)

print(f"Loaded {len(train)} samples")

# Model input = question + chosen answer + student explanation, separated by
# single spaces; missing pieces become empty strings before concatenation.
question_part = train["QuestionText_Norm"].fillna("").astype(str)
answer_part = train["MC_Answer_Norm"].fillna("").astype(str)
explanation_part = train["StudentExplanation_Norm"].fillna("").astype(str)
train["text"] = (question_part + " " + answer_part + " " + explanation_part).str.strip()

# Integer-encode the target: ids follow the sorted order of the distinct
# target values, and id2label is the inverse mapping.
unique_labels = sorted(train["target"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

train["label"] = train["target"].map(label2id)

print(f"Total unique target labels: {len(unique_labels)}")
print(f"Sample labels: {unique_labels[:5]}")

train.head()

In [None]:
# -------- Cell 5: Tokenizer + Dataset Class + KFold --------

# --- 1. Load Tokenizer ---
# MODEL_NAME is the checkpoint identifier passed to from_pretrained() here;
# create_model() passes the same identifier when it builds the classifier.
MODEL_NAME = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizer loaded!")

# --- 2. Dataset Class ---
class TargetDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one row's text on access.

    Args:
        df: DataFrame providing a "text" column (model input) and a "label"
            column (integer class id). Both are copied to plain lists.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``;
            its result must expose "input_ids" and "attention_mask".
        max_len: Length every example is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" (a scalar
            ``torch.long`` tensor).
        """
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # The tokenizer is called on a single text, so its tensors carry a
        # leading batch axis of size 1 (assumed shape: (1, max_len)). Remove
        # only that axis: a bare squeeze() would also drop the sequence axis
        # when max_len == 1 and hand 0-d tensors to the DataLoader.
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# --- 3. K-Fold Setup ---
# Shuffled splitter with a fixed random_state, so fold membership is the same
# on every run of the notebook.
# NOTE(review): plain KFold does not stratify by label — confirm that every
# class is adequately represented in each fold.
NUM_FOLDS = 5
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

print("KFold ready!")

In [None]:
# -------- Cell 6: Model Definition + Device --------

# Create a fresh model for each fold
def create_model():
    """Return a new sequence-classification model loaded from MODEL_NAME.

    The classification head is sized to ``len(unique_labels)`` and the
    label <-> id mappings are stored in the model config. Every call loads
    a separate model instance, so each fold starts from the same checkpoint.
    """
    label_config = {
        "num_labels": len(unique_labels),
        "id2label": id2label,
        "label2id": label2id,
    }
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)

# Check device
# Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

: 

In [None]:
# -------- Cell 7: Full K-Fold Training Loop --------

EPOCHS = 5
PATIENCE = 2          # consecutive non-improving epochs tolerated per fold
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
SEED = 42             # base seed; each fold uses SEED + fold index
LOG_EVERY = 500       # print a progress line every N batches


def _run_epoch(model, loader, device, optimizer=None, phase="Training"):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given the model is put in train mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled. ``phase`` is only used in the progress messages.

    The loss is averaged per sample (each batch loss is weighted by its batch
    size) so a short final batch does not skew the epoch mean.
    NOTE(review): this assumes ``outputs.loss`` is a per-batch mean, which is
    the usual Hugging Face default for single-label classification — confirm.
    """
    is_train = optimizer is not None
    model.train(is_train)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(is_train):
        for i, batch in enumerate(loader):
            if i % LOG_EVERY == 0:
                print(f"  {phase} batch {i}/{len(loader)}")

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            if is_train:
                outputs.loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            batch_size = batch["labels"].size(0)
            loss_sum += outputs.loss.item() * batch_size

            preds = torch.argmax(outputs.logits, dim=1)
            n_correct += (preds == batch["labels"]).sum().item()
            n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen * 100


# Best validation loss reached in each fold, for the summary at the end.
fold_best_losses = []

# Pre-bind so the per-fold release below is valid on the very first fold.
model = None
optimizer = None

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train)):
    print("\n==============================")
    print(f"===== FOLD {fold+1}/{NUM_FOLDS} =====")
    print("==============================")

    # Drop the previous fold's model and optimizer state BEFORE building the
    # new ones; otherwise both generations are alive while the new model is
    # moved to the device, roughly doubling peak memory.
    model = None
    optimizer = None

    # Seed torch per fold so shuffling order and the freshly initialised
    # classification head are reproducible across runs.
    torch.manual_seed(SEED + fold)

    # ---- Split dataframe ----
    train_df = train.iloc[train_idx].reset_index(drop=True)
    valid_df = train.iloc[valid_idx].reset_index(drop=True)

    # ---- Datasets and dataloaders ----
    train_ds = TargetDataset(train_df, tokenizer)
    valid_ds = TargetDataset(valid_df, tokenizer)

    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False)

    print(f"Train batches: {len(train_loader)}, Valid batches: {len(valid_loader)}")

    # ---- Fresh model and optimizer for this fold ----
    model = create_model()
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # ---- Best model tracking ----
    best_val_loss = float("inf")
    bad_epochs = 0

    for epoch in range(EPOCHS):
        print(f"\n----- Epoch {epoch+1}/{EPOCHS} -----")

        # ===== TRAINING =====
        avg_train_loss, accuracy = _run_epoch(
            model, train_loader, device, optimizer, phase="Training"
        )
        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Train Accuracy: {accuracy:.2f}%")

        # ===== VALIDATION =====
        avg_val_loss, val_accuracy = _run_epoch(
            model, valid_loader, device, phase="Validation"
        )
        print(f"Valid Loss: {avg_val_loss:.4f}")
        print(f"Valid Accuracy: {val_accuracy:.2f}%")

        # ===== CHECK IF BEST MODEL =====
        if avg_val_loss < best_val_loss:
            print("🔥 New best model! Saving checkpoint...")
            best_val_loss = avg_val_loss
            bad_epochs = 0

            # Save the weights together with the label mappings so the
            # checkpoint can be decoded without re-deriving them.
            save_dict = {
                'model_state_dict': model.state_dict(),
                'label2id': label2id,
                'id2label': id2label,
                'num_labels': len(unique_labels)
            }

            save_path = f"/kaggle/working/deberta_combined_fold{fold+1}.pt"
            torch.save(save_dict, save_path)
            print(f"Model saved to {save_path}")
        else:
            bad_epochs += 1
            print(f"No improvement ({bad_epochs}/{PATIENCE} bad epochs)")

            # ===== EARLY STOPPING =====
            if bad_epochs >= PATIENCE:
                print("⛔ Early stopping triggered — stopping training for this fold.")
                break

    fold_best_losses.append(best_val_loss)

print("\n🔥 All folds completed! Models saved in /kaggle/working/")

# Cross-validation summary: best validation loss per fold and their mean.
for fold_number, fold_loss in enumerate(fold_best_losses, start=1):
    print(f"Fold {fold_number} best valid loss: {fold_loss:.4f}")
if fold_best_losses:
    print(f"Mean best valid loss: {sum(fold_best_losses) / len(fold_best_losses):.4f}")