In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv


In [2]:
# -------- Cell 2: Install + Imports --------

# Install HuggingFace Transformers (Kaggle often has an old version)
#!pip install -q transformers accelerate

# Core imports
import os
import sys
import pandas as pd
import torch
from sklearn.model_selection import KFold
from tqdm import tqdm
import warnings

# HuggingFace imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers

# Silence annoying warnings
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())

Torch version: 2.6.0+cu124
Transformers version: 4.53.3
CUDA available: True


In [3]:
# -------- Cell 3: Preprocess Functions --------
import pandas as pd
import re

def clean_text(s):
    """Basic cleaning for TF-IDF."""
    if pd.isna(s):
        return ""
    s = str(s)

    # remove punctuation that doesn't help TF-IDF
    s = re.sub(r'[,!?;"\'\[\]\{\}]', ' ', s)

    # collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s


def build_text_column(df):
    """
    Combined text for TF-IDF (not used for DeBERTa but kept for completeness).
    """
    q = df["QuestionText"].fillna("").apply(clean_text)
    a = df["MC_Answer"].fillna("").apply(clean_text)
    e = df["StudentExplanation"].fillna("").apply(clean_text)

    df["text"] = q + " " + a + " " + e
    return df


def build_text_columns_bert(df):
    """
    Builds the combined 'text' column for transformer models (DeBERTa/BERT).
    No cleaning is applied â€” transformers want raw text.
    """
    df["Misconception"] = df["Misconception"].fillna("NA").astype(str)

    q = df["QuestionText"].fillna("").astype(str)
    a = df["MC_Answer"].fillna("").astype(str)
    e = df["StudentExplanation"].fillna("").astype(str)

    df["text"] = (q + " " + a + " " + e).str.strip()
    return df

In [4]:
# -------- Cell 4: Load Data + Preprocess + Label Encoding --------

# Load train.csv from Kaggle input
train_path = "/kaggle/input/map-charting-student-math-misunderstandings/train.csv"
train = pd.read_csv(train_path)

# Build the combined text column for transformers (raw text)
train = build_text_columns_bert(train)

# Label encoding for Category (6 labels)
unique_labels = sorted(train["Category"].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

train["label"] = train["Category"].map(label2id)

print("Unique labels:", unique_labels)
print("Label2id:", label2id)

train.head()

Unique labels: ['False_Correct', 'False_Misconception', 'False_Neither', 'True_Correct', 'True_Misconception', 'True_Neither']
Label2id: {'False_Correct': 0, 'False_Misconception': 1, 'False_Neither': 2, 'True_Correct': 3, 'True_Misconception': 4, 'True_Neither': 5}


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,text,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,What fraction of the shape is not shaded? Give...,3
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,What fraction of the shape is not shaded? Give...,3
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,What fraction of the shape is not shaded? Give...,5
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,What fraction of the shape is not shaded? Give...,5
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,What fraction of the shape is not shaded? Give...,3


In [5]:
# -------- Cell 5: Tokenizer, Dataset Class, KFold --------

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import KFold
from tqdm import tqdm

# --- 1. Load Tokenizer (from HuggingFace) ---
MODEL_NAME = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizer loaded!")

# --- 2. Dataset Class (same as your local version) ---
class MAPDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# --- 3. K-Fold Setup ---
NUM_FOLDS = 5
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

print("KFold ready:", kfold)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Tokenizer loaded!
KFold ready: KFold(n_splits=5, random_state=42, shuffle=True)


In [6]:
# -------- Cell 6: Model Definition + Device --------

# Create a fresh model for each fold
def create_model():
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(unique_labels),
        id2label=id2label,
        label2id=label2id
    )
    return model

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [7]:
# -------- Cell 7: Full K-Fold Training Loop --------

EPOCHS = 8
PATIENCE = 2

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train)):
    print(f"\n==============================")
    print(f"===== FOLD {fold+1}/{NUM_FOLDS} =====")
    print(f"==============================")

    # ---- Split dataframe ----
    train_df = train.iloc[train_idx]
    valid_df = train.iloc[valid_idx]

    # ---- Create datasets ----
    train_ds = MAPDataset(train_df, tokenizer)
    valid_ds = MAPDataset(valid_df, tokenizer)

    # ---- Dataloaders ----
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=8, shuffle=False)

    print(f"Train batches: {len(train_loader)}, Valid batches: {len(valid_loader)}")

    # ---- Fresh model for this fold ----
    model = create_model()
    model.to(device)

    # ---- Optimizer ----
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # ---- Best model tracking ----
    best_val_loss = float("inf")
    bad_epochs = 0

    # =====================================
    #           TRAINING EPOCHS
    # =====================================
    for epoch in range(EPOCHS):
        print(f"\n----- Epoch {epoch+1}/{EPOCHS} -----")

        # ===== TRAINING =====
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Training Fold {fold+1}, Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Train Loss: {avg_train_loss:.4f}")

        # ===== VALIDATION =====
        model.eval()
        valid_loss = 0

        with torch.no_grad():
            for batch in tqdm(valid_loader, desc="Validating"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                valid_loss += outputs.loss.item()

        avg_val_loss = valid_loss / len(valid_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # ===== CHECK IF BEST MODEL =====
        if avg_val_loss < best_val_loss:
            print("ðŸ”¥ New best model! Saving checkpoint...")
            best_val_loss = avg_val_loss
            bad_epochs = 0

            save_path = f"/kaggle/working/deberta_category_fold{fold+1}.pt"
            torch.save(model.state_dict(), save_path)
            print(f"Model saved to {save_path}")
        else:
            bad_epochs += 1
            print(f"No improvement ({bad_epochs}/{PATIENCE} bad epochs)")

        # ===== EARLY STOPPING =====
        if bad_epochs >= PATIENCE:
            print("â›” Early stopping triggered â€” stopping training for this fold.")
            break

print("\nðŸ”¥ All folds completed! Models saved in /kaggle/working/")


===== FOLD 1/5 =====
Train batches: 3670, Valid batches: 918


2025-12-11 11:34:58.530487: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765452898.679772      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765452898.720793      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]


----- Epoch 1/8 -----


Training Fold 1, Epoch 1:   0%|          | 1/3670 [00:00<50:03,  1.22it/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Training Fold 1, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:31<00:00,  6.42it/s]


Train Loss: 0.6757


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.54it/s]


Validation Loss: 0.4479
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold1.pt

----- Epoch 2/8 -----


Training Fold 1, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:29<00:00,  6.44it/s]


Train Loss: 0.4217


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:43<00:00, 21.17it/s]


Validation Loss: 0.3836
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold1.pt

----- Epoch 3/8 -----


Training Fold 1, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:29<00:00,  6.44it/s]


Train Loss: 0.3497


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.46it/s]


Validation Loss: 0.3625
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold1.pt

----- Epoch 4/8 -----


Training Fold 1, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:29<00:00,  6.44it/s]


Train Loss: 0.2955


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.42it/s]


Validation Loss: 0.3930
No improvement (1/2 bad epochs)

----- Epoch 5/8 -----


Training Fold 1, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:29<00:00,  6.44it/s]


Train Loss: 0.2469


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.51it/s]


Validation Loss: 0.3635
No improvement (2/2 bad epochs)
â›” Early stopping triggered â€” stopping training for this fold.

===== FOLD 2/5 =====
Train batches: 3670, Valid batches: 918

----- Epoch 1/8 -----


Training Fold 2, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:30<00:00,  6.43it/s]


Train Loss: 0.6516


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:43<00:00, 21.27it/s]


Validation Loss: 0.4269
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold2.pt

----- Epoch 2/8 -----


Training Fold 2, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.48it/s]


Train Loss: 0.4206


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.70it/s]


Validation Loss: 0.3887
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold2.pt

----- Epoch 3/8 -----


Training Fold 2, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.48it/s]


Train Loss: 0.3569


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.71it/s]


Validation Loss: 0.3812
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold2.pt

----- Epoch 4/8 -----


Training Fold 2, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:25<00:00,  6.49it/s]


Train Loss: 0.3088


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.55it/s]


Validation Loss: 0.3789
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold2.pt

----- Epoch 5/8 -----


Training Fold 2, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.48it/s]


Train Loss: 0.2657


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.70it/s]


Validation Loss: 0.3662
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold2.pt

----- Epoch 6/8 -----


Training Fold 2, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:25<00:00,  6.49it/s]


Train Loss: 0.2314


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.77it/s]


Validation Loss: 0.3737
No improvement (1/2 bad epochs)

----- Epoch 7/8 -----


Training Fold 2, Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.48it/s]


Train Loss: 0.1949


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:43<00:00, 21.29it/s]


Validation Loss: 0.3946
No improvement (2/2 bad epochs)
â›” Early stopping triggered â€” stopping training for this fold.

===== FOLD 3/5 =====
Train batches: 3670, Valid batches: 918

----- Epoch 1/8 -----


Training Fold 3, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:30<00:00,  6.43it/s]


Train Loss: 0.6581


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.68it/s]


Validation Loss: 0.4491
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold3.pt

----- Epoch 2/8 -----


Training Fold 3, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:27<00:00,  6.47it/s]


Train Loss: 0.4209


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.41it/s]


Validation Loss: 0.3964
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold3.pt

----- Epoch 3/8 -----


Training Fold 3, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:27<00:00,  6.47it/s]


Train Loss: 0.3602


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.77it/s]


Validation Loss: 0.3825
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold3.pt

----- Epoch 4/8 -----


Training Fold 3, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:25<00:00,  6.49it/s]


Train Loss: 0.3074


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.70it/s]


Validation Loss: 0.3806
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold3.pt

----- Epoch 5/8 -----


Training Fold 3, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.47it/s]


Train Loss: 0.2667


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.59it/s]


Validation Loss: 0.4001
No improvement (1/2 bad epochs)

----- Epoch 6/8 -----


Training Fold 3, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.45it/s]


Train Loss: 0.2239


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.64it/s]


Validation Loss: 0.4504
No improvement (2/2 bad epochs)
â›” Early stopping triggered â€” stopping training for this fold.

===== FOLD 4/5 =====
Train batches: 3670, Valid batches: 918

----- Epoch 1/8 -----


Training Fold 4, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.45it/s]


Train Loss: 0.6624


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.60it/s]


Validation Loss: 0.4437
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold4.pt

----- Epoch 2/8 -----


Training Fold 4, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.45it/s]


Train Loss: 0.4142


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.56it/s]


Validation Loss: 0.3990
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold4.pt

----- Epoch 3/8 -----


Training Fold 4, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.46it/s]


Train Loss: 0.3441


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.61it/s]


Validation Loss: 0.3932
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold4.pt

----- Epoch 4/8 -----


Training Fold 4, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.45it/s]


Train Loss: 0.2944


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.51it/s]


Validation Loss: 0.3532
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold4.pt

----- Epoch 5/8 -----


Training Fold 4, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.46it/s]


Train Loss: 0.2508


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.70it/s]


Validation Loss: 0.3912
No improvement (1/2 bad epochs)

----- Epoch 6/8 -----


Training Fold 4, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:28<00:00,  6.46it/s]


Train Loss: 0.2112


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.66it/s]


Validation Loss: 0.4078
No improvement (2/2 bad epochs)
â›” Early stopping triggered â€” stopping training for this fold.

===== FOLD 5/5 =====
Train batches: 3670, Valid batches: 918

----- Epoch 1/8 -----


Training Fold 5, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:27<00:00,  6.47it/s]


Train Loss: 0.6857


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.76it/s]


Validation Loss: 0.4800
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold5.pt

----- Epoch 2/8 -----


Training Fold 5, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:25<00:00,  6.49it/s]


Train Loss: 0.4176


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.75it/s]


Validation Loss: 0.4424
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold5.pt

----- Epoch 3/8 -----


Training Fold 5, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:25<00:00,  6.49it/s]


Train Loss: 0.3587


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.76it/s]


Validation Loss: 0.4291
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold5.pt

----- Epoch 4/8 -----


Training Fold 5, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:26<00:00,  6.48it/s]


Train Loss: 0.3101


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.66it/s]


Validation Loss: 0.3705
ðŸ”¥ New best model! Saving checkpoint...
Model saved to /kaggle/working/deberta_category_fold5.pt

----- Epoch 5/8 -----


Training Fold 5, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:27<00:00,  6.46it/s]


Train Loss: 0.2655


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.60it/s]


Validation Loss: 0.3730
No improvement (1/2 bad epochs)

----- Epoch 6/8 -----


Training Fold 5, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3670/3670 [09:27<00:00,  6.46it/s]


Train Loss: 0.2309


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 918/918 [00:42<00:00, 21.55it/s]

Validation Loss: 0.4067
No improvement (2/2 bad epochs)
â›” Early stopping triggered â€” stopping training for this fold.

ðŸ”¥ All folds completed! Models saved in /kaggle/working/



