In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv


In [2]:
# -------- Cell 2: Install + Imports --------

#!pip install -q transformers accelerate

import os
import sys
import pandas as pd
import torch
from sklearn.model_selection import KFold
from tqdm import tqdm
import warnings

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers

warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())

Torch: 2.6.0+cu124
Transformers: 4.53.3
CUDA available: True


In [3]:
# -------- Cell 3: Preprocess Functions --------
import pandas as pd
import re

def clean_text(s):
    """Basic cleaning for TF-IDF (unused for transformers but included for completeness)."""
    if pd.isna(s):
        return ""
    s = str(s)

    s = re.sub(r'[,!?;"\'\[\]\{\}]', ' ', s)
    s = re.sub(r"\s+", " ", s).strip()

    return s


def build_text_column(df):
    """
    (Not used for DeBERTa)
    """
    q = df["QuestionText"].fillna("").apply(clean_text)
    a = df["MC_Answer"].fillna("").apply(clean_text)
    e = df["StudentExplanation"].fillna("").apply(clean_text)

    df["text"] = q + " " + a + " " + e
    return df


def build_text_columns_bert(df):
    """
    Builds the combined 'text' column for transformer models (DeBERTa/BERT).
    Raw text only â€” no cleaning to avoid losing information.
    """
    df["Misconception"] = df["Misconception"].fillna("NA").astype(str)

    q = df["QuestionText"].fillna("").astype(str)
    a = df["MC_Answer"].fillna("").astype(str)
    e = df["StudentExplanation"].fillna("").astype(str)

    df["text"] = (q + " " + a + " " + e).str.strip()
    return df

In [4]:
# -------- Cell 4: Load Data + Preprocess + Label Encoding --------

# Load train.csv from Kaggle dataset
train_path = "/kaggle/input/map-charting-student-math-misunderstandings/train.csv"
train = pd.read_csv(train_path)

# Build combined text for transformers
train = build_text_columns_bert(train)

# Misconception: fill NA â†’ then remove NA rows
train["Misconception"] = train["Misconception"].fillna("NA").astype(str)
train = train[train["Misconception"] != "NA"]   # remove NA rows

# Sort unique label names (~32 labels)
unique_labels = sorted(train["Misconception"].unique())

label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Add numeric label column
train["label"] = train["Misconception"].map(label2id)

print("Total misconception labels:", len(unique_labels))
train.head()

Total misconception labels: 35


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,text,label
107,107,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),3 out of 9 parts aren't shaded.,True_Misconception,Incomplete,What fraction of the shape is not shaded? Give...,12
120,120,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),3 out of 9 triangles are not shaded.,True_Misconception,Incomplete,What fraction of the shape is not shaded? Give...,12
123,123,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),3 out of nine triangles are not shaded,True_Misconception,Incomplete,What fraction of the shape is not shaded? Give...,12
265,265,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),3/9 of the shape isn't shaded,True_Misconception,Incomplete,What fraction of the shape is not shaded? Give...,12
518,518,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),Because there are 6 triangles and 3 are white....,True_Misconception,WNB,What fraction of the shape is not shaded? Give...,29


In [5]:
# -------- Cell 5: Tokenizer, Dataset Class, KFold --------

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import KFold
from tqdm import tqdm

# --- 1. Load Tokenizer (from HuggingFace) ---
MODEL_NAME = "microsoft/deberta-v3-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded!")

# --- 2. Dataset Class ---
class MAPDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# --- 3. K-Fold Setup ---
NUM_FOLDS = 5
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

print("KFold ready:", kfold)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Tokenizer loaded!
KFold ready: KFold(n_splits=5, random_state=42, shuffle=True)


In [6]:
# -------- Cell 6: Model Creation + Device --------

def create_model():
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(unique_labels),   # ~32 misconception classes
        id2label=id2label,
        label2id=label2id
    )
    return model

# Device selection (GPU on Kaggle)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [7]:
# -------- Cell 7: Full K-Fold Training Loop (Misconception) --------

EPOCHS = 8
PATIENCE = 2   # stop after 2 bad epochs (no improvement)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train)):
    print(f"\n==============================")
    print(f"===== MISCONCEPTION FOLD {fold+1}/{NUM_FOLDS} =====")
    print(f"==============================")

    # ---- Split dataframe ----
    train_df = train.iloc[train_idx]
    valid_df = train.iloc[valid_idx]

    # ---- Create datasets ----
    train_ds = MAPDataset(train_df, tokenizer)
    valid_ds = MAPDataset(valid_df, tokenizer)

    # ---- Dataloaders ----
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=8, shuffle=False)

    print(f"Train batches: {len(train_loader)}, Valid batches: {len(valid_loader)}")

    # ---- Fresh model for this fold ----
    model = create_model()
    model.to(device)

    # ---- Optimizer ----
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    best_val_loss = float("inf")
    bad_epochs = 0

    # =====================================
    #           TRAINING EPOCHS
    # =====================================
    for epoch in range(EPOCHS):
        print(f"\n----- Epoch {epoch+1}/{EPOCHS} -----")

        # ===== TRAINING =====
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Training Fold {fold+1}, Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Train Loss: {avg_train_loss:.4f}")

        # ===== VALIDATION =====
        model.eval()
        valid_loss = 0

        with torch.no_grad():
            for batch in tqdm(valid_loader, desc="Validating"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                valid_loss += outputs.loss.item()

        avg_val_loss = valid_loss / len(valid_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # ===== BEST MODEL SAVE =====
        if avg_val_loss < best_val_loss:
            print("ðŸ”¥ New BEST model for this fold! Saving...")
            best_val_loss = avg_val_loss
            bad_epochs = 0

            save_path = f"/kaggle/working/deberta_misconception_fold{fold+1}.pt"
            torch.save(model.state_dict(), save_path)
            print(f"Saved checkpoint: {save_path}")

        else:
            bad_epochs += 1
            print(f"No improvement ({bad_epochs}/{PATIENCE})")

        # ===== EARLY STOPPING =====
        if bad_epochs >= PATIENCE:
            print("â›” Early stopping triggered for this fold!")
            break

print("\nðŸ”¥ ALL MISCONCEPTION FOLDS COMPLETED!")
print("Your model files are in /kaggle/working/")


===== MISCONCEPTION FOLD 1/5 =====
Train batches: 986, Valid batches: 247


2025-12-11 12:05:35.795751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765454736.105622      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765454736.200258      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]


----- Epoch 1/8 -----


Training Fold 1, Epoch 1:   0%|          | 0/986 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Training Fold 1, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:34<00:00,  6.38it/s]


Train Loss: 1.0120


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.31it/s]


Validation Loss: 0.3286
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold1.pt

----- Epoch 2/8 -----


Training Fold 1, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.2498


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.54it/s]


Validation Loss: 0.2140
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold1.pt

----- Epoch 3/8 -----


Training Fold 1, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.1615


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.55it/s]


Validation Loss: 0.1461
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold1.pt

----- Epoch 4/8 -----


Training Fold 1, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.1114


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.42it/s]


Validation Loss: 0.1234
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold1.pt

----- Epoch 5/8 -----


Training Fold 1, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.0906


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.62it/s]


Validation Loss: 0.1281
No improvement (1/2)

----- Epoch 6/8 -----


Training Fold 1, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.0734


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.60it/s]


Validation Loss: 0.1284
No improvement (2/2)
â›” Early stopping triggered for this fold!

===== MISCONCEPTION FOLD 2/5 =====
Train batches: 986, Valid batches: 247

----- Epoch 1/8 -----


Training Fold 2, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.44it/s]


Train Loss: 1.0174


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.21it/s]


Validation Loss: 0.4003
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold2.pt

----- Epoch 2/8 -----


Training Fold 2, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.2735


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.62it/s]


Validation Loss: 0.2133
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold2.pt

----- Epoch 3/8 -----


Training Fold 2, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.44it/s]


Train Loss: 0.1738


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.53it/s]


Validation Loss: 0.1488
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold2.pt

----- Epoch 4/8 -----


Training Fold 2, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.1241


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.38it/s]


Validation Loss: 0.1303
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold2.pt

----- Epoch 5/8 -----


Training Fold 2, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.44it/s]


Train Loss: 0.1016


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.50it/s]


Validation Loss: 0.1255
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold2.pt

----- Epoch 6/8 -----


Training Fold 2, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.45it/s]


Train Loss: 0.0800


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.56it/s]


Validation Loss: 0.1476
No improvement (1/2)

----- Epoch 7/8 -----


Training Fold 2, Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.43it/s]


Train Loss: 0.0627


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.23it/s]


Validation Loss: 0.1639
No improvement (2/2)
â›” Early stopping triggered for this fold!

===== MISCONCEPTION FOLD 3/5 =====
Train batches: 986, Valid batches: 247

----- Epoch 1/8 -----


Training Fold 3, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.43it/s]


Train Loss: 1.0515


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.54it/s]


Validation Loss: 0.3777
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 2/8 -----


Training Fold 3, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.44it/s]


Train Loss: 0.3117


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.43it/s]


Validation Loss: 0.1773
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 3/8 -----


Training Fold 3, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.43it/s]


Train Loss: 0.1700


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.53it/s]


Validation Loss: 0.1345
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 4/8 -----


Training Fold 3, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.43it/s]


Train Loss: 0.1309


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.45it/s]


Validation Loss: 0.1607
No improvement (1/2)

----- Epoch 5/8 -----


Training Fold 3, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.46it/s]


Train Loss: 0.0980


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.74it/s]


Validation Loss: 0.1251
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 6/8 -----


Training Fold 3, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.46it/s]


Train Loss: 0.0816


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.53it/s]


Validation Loss: 0.1226
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 7/8 -----


Training Fold 3, Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.0686


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.66it/s]


Validation Loss: 0.1159
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold3.pt

----- Epoch 8/8 -----


Training Fold 3, Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.0518


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.60it/s]


Validation Loss: 0.1309
No improvement (1/2)

===== MISCONCEPTION FOLD 4/5 =====
Train batches: 986, Valid batches: 247

----- Epoch 1/8 -----


Training Fold 4, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.47it/s]


Train Loss: 1.0171


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.76it/s]


Validation Loss: 0.3356
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 2/8 -----


Training Fold 4, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.47it/s]


Train Loss: 0.2762


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.75it/s]


Validation Loss: 0.2005
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 3/8 -----


Training Fold 4, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.46it/s]


Train Loss: 0.1630


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.82it/s]


Validation Loss: 0.1632
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 4/8 -----


Training Fold 4, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.1212


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.73it/s]


Validation Loss: 0.1292
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 5/8 -----


Training Fold 4, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.47it/s]


Train Loss: 0.0890


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.80it/s]


Validation Loss: 0.1137
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 6/8 -----


Training Fold 4, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.0745


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.76it/s]


Validation Loss: 0.1130
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 7/8 -----


Training Fold 4, Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.47it/s]


Train Loss: 0.0611


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.78it/s]


Validation Loss: 0.1031
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold4.pt

----- Epoch 8/8 -----


Training Fold 4, Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.0406


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.75it/s]


Validation Loss: 0.1099
No improvement (1/2)

===== MISCONCEPTION FOLD 5/5 =====
Train batches: 986, Valid batches: 247

----- Epoch 1/8 -----


Training Fold 5, Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.47it/s]


Train Loss: 1.0756


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.73it/s]


Validation Loss: 0.3078
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 2/8 -----


Training Fold 5, Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.49it/s]


Train Loss: 0.2708


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.52it/s]


Validation Loss: 0.1787
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 3/8 -----


Training Fold 5, Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.1677


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.76it/s]


Validation Loss: 0.1690
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 4/8 -----


Training Fold 5, Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.46it/s]


Train Loss: 0.1222


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.71it/s]


Validation Loss: 0.1446
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 5/8 -----


Training Fold 5, Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.48it/s]


Train Loss: 0.0978


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.76it/s]


Validation Loss: 0.1319
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 6/8 -----


Training Fold 5, Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:32<00:00,  6.46it/s]


Train Loss: 0.0814


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.22it/s]


Validation Loss: 0.1903
No improvement (1/2)

----- Epoch 7/8 -----


Training Fold 5, Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.42it/s]


Train Loss: 0.0666


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.20it/s]


Validation Loss: 0.1247
ðŸ”¥ New BEST model for this fold! Saving...
Saved checkpoint: /kaggle/working/deberta_misconception_fold5.pt

----- Epoch 8/8 -----


Training Fold 5, Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 986/986 [02:33<00:00,  6.43it/s]


Train Loss: 0.0523


Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 247/247 [00:11<00:00, 21.21it/s]

Validation Loss: 0.1396
No improvement (1/2)

ðŸ”¥ ALL MISCONCEPTION FOLDS COMPLETED!
Your model files are in /kaggle/working/



