In [None]:
# ============================================================
# 0.  Imports & global switches
# ============================================================
import sys, os, json, warnings, math
import pandas as pd
import numpy  as np
import torch

from datasets            import Dataset
from huggingface_hub     import login
from sklearn.metrics     import accuracy_score
from transformers        import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    TrainerCallback,
    EarlyStoppingCallback,
)

# -- disable W&B completely: the switch is set right after the imports, before
#    any Trainer is built. The training cells additionally pass
#    report_to=["none"] to TrainingArguments.
os.environ["WANDB_DISABLED"] = "true"
# Optional: uncomment the next line to silence all Python warnings.
#warnings.filterwarnings("ignore")

In [None]:
# ============================================================
# 1.  Config
# ============================================================
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no credential is ever stored in (or committed with) the notebook.
# The value is None when the variable is unset or empty.
# NOTE(review): a real token used to be hard-coded on this line -- it must be
# revoked in the Hugging Face account settings.
ACCESS_TOKEN   = os.environ.get("HF_TOKEN") or None
MODEL_NAME     = "microsoft/deberta-v3-large"
LABELS_CSV     = "/projects/pdd/IS2025_Podcast_Challenge/Labels/labels_consensus.csv"
TRANS_DIR      = "/projects/pdd/IS2025_Podcast_Challenge/Transcripts"

TEST_SPLIT     = 0.20   # fraction of the data held out for validation
MAX_LEN        = 128    # tokenizer max_length (tokens per transcript)
BATCH_SIZE     = 16     # per-device train batch size
ACCUM_STEPS    = 8      # gradient-accumulation steps
LR             = 1e-6   # learning rate
EPOCHS         = 100    # upper bound on the number of training epochs
SAVE_DIR       = "./saved_deberta_model"  # where to save

In [None]:
# ============================================================
# 2.  Emotion mapping
# ============================================================
# One-letter emotion codes, listed in class-id order (position == class id).
_EMOTION_CODES = "ASHUFDCNOX"

# code -> integer class id
EMOTION2ID = {code: idx for idx, code in enumerate(_EMOTION_CODES)}
# integer class id -> code (inverse of EMOTION2ID)
ID2EMOTION = dict(enumerate(_EMOTION_CODES))

In [None]:
# ============================================================
# 3.  Utilities
# ============================================================
def load_transcript(fname:str) -> str:
    """Return the transcript text that belongs to an audio file name.

    The transcript is looked up inside ``TRANS_DIR`` under ``fname`` with
    ``.wav`` replaced by ``.txt`` and is decoded as UTF-8.

    Args:
        fname: Audio file name, e.g. ``"clip.wav"``.

    Returns:
        The file content with leading/trailing whitespace stripped, or ``""``
        when the file cannot be opened or decoded (a warning is printed).
    """
    txt_path = os.path.join(TRANS_DIR, fname.replace(".wav", ".txt"))
    try:
        # The context manager closes the handle right away; this function is
        # applied once per CSV row, so unclosed handles would pile up.
        with open(txt_path, encoding="utf-8") as fh:
            return fh.read().strip()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: includes
        # UnicodeDecodeError for files that are not valid UTF-8.
        print("Warrrrning", e)
        return ""

# ------------------------------------------------------------
class SaveOnImprove(TrainerCallback):
    """Save model + tokenizer whenever the evaluation loss reaches a new minimum.

    Each improvement is written to ``<out_dir>/<tag>_<epoch>``; the epochs at
    which a checkpoint was written are collected in ``saved_epochs``.

    Args:
        tokenizer: Tokenizer saved next to every model checkpoint.
        out_dir: Parent directory for the checkpoints (created if missing).
        tag: Prefix of the per-epoch checkpoint directory name.
    """
    def __init__(self, tokenizer, out_dir=SAVE_DIR, tag="FT6Large_epoch"):
        self.best = math.inf          # lowest eval_loss seen so far
        self.tok  = tokenizer
        self.dir  = out_dir
        self.tag  = tag
        self.saved_epochs = []        # epochs (int) at which a checkpoint was saved
        os.makedirs(out_dir, exist_ok=True)

    def on_evaluate(self, args, state, control, metrics=None, **kw):
        """Persist the model if ``metrics["eval_loss"]`` beats the best so far.

        Does nothing when no metrics, no ``eval_loss`` or no ``model`` keyword
        argument is supplied. Always returns ``control`` unchanged.
        """
        # `metrics` defaults to None, so it must not be dereferenced blindly.
        loss  = (metrics or {}).get("eval_loss")
        model = kw.get("model")
        if loss is None or model is None or not loss < self.best:
            return control

        # state.epoch can be None when an evaluation runs before any training.
        epoch = int(state.epoch) if state.epoch is not None else 0
        self.best = loss
        ckpt_dir  = os.path.join(self.dir, f"{self.tag}_{epoch}")
        model.save_pretrained(ckpt_dir)
        self.tok.save_pretrained(ckpt_dir)
        self.saved_epochs.append(epoch)
        print(f"✅  Saved new best checkpoint at epoch {epoch} (eval_loss {loss:.4f})")
        return control

def compute_metrics(pred):
    """Accuracy metric for the Trainer's evaluation loop.

    Args:
        pred: A ``(logits, labels)`` pair. The predicted class of each row is
            the arg-max over the last axis of ``logits``.

    Returns:
        ``{"accuracy": <accuracy_score of labels vs. predicted classes>}``.
    """
    scores, gold = pred
    predicted = scores.argmax(-1)
    return dict(accuracy=accuracy_score(gold, predicted))

In [None]:
# ============================================================
# 4.  Data pipeline
# ============================================================
def build_datasets():
    """Build the train/validation datasets from the labels CSV.

    Rows whose ``Split_Set`` (whitespace-stripped) equals ``"Development"`` are
    kept, each ``FileName`` is resolved to its transcript via
    ``load_transcript``, ``EmoClass`` is converted to an integer ``labels``
    column through ``EMOTION2ID``, and the result is split with
    ``test_size=TEST_SPLIT`` and a fixed seed of 42.

    Returns:
        ``(train_ds, val_ds)`` -- two ``datasets.Dataset`` objects with the
        columns ``transcription`` and ``labels``.

    Raises:
        KeyError: if an ``EmoClass`` value is not a key of ``EMOTION2ID``.
    """
    df   = pd.read_csv(LABELS_CSV)
    df   = df[df["Split_Set"].str.strip()=="Development"].copy()
    df["transcription"] = df["FileName"].apply(load_transcript)

    # preserve_index=False: after the row filter the frame no longer has a
    # default RangeIndex, and from_pandas would otherwise carry it along as a
    # stray "__index_level_0__" column.
    ds   = Dataset.from_pandas(df[["transcription","EmoClass"]], preserve_index=False)

    def add_label(example):
        example["labels"] = EMOTION2ID[ example["EmoClass"] ]
        return example

    # Label once before splitting; the split only depends on row count + seed.
    ds = ds.map(add_label).remove_columns("EmoClass")

    splits = ds.train_test_split(test_size=TEST_SPLIT, seed=42)
    return splits["train"], splits["test"]

In [None]:
# ============================================================
# 5.  Main training part with gradient accumulation = 6
# ============================================================
def main(accum_steps: int = 6, ckpt_tag: str = "FT7Large_epoch"):
    """Fine-tune ``MODEL_NAME`` for emotion classification (accumulation = 6 run).

    Pipeline: optional Hub login, dataset construction, tokenisation, training
    with per-epoch evaluation and early stopping (patience 5 on ``eval_loss``),
    then a final evaluation of the best model reloaded by the Trainer.

    Args:
        accum_steps: Gradient-accumulation steps passed to ``TrainingArguments``.
        ckpt_tag: Directory prefix handed to ``SaveOnImprove``. It differs from
            that callback's default (``"FT6Large_epoch"``) so this run does not
            overwrite the checkpoints of the accumulation-8 run; the demo cell
            loads this run from ``FT7Large_epoch_*``.
    """
    # --- login & device
    # The model is fetched by name; only log in when a token is configured.
    if ACCESS_TOKEN:
        login(token=ACCESS_TOKEN)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Using device:", device)

    # --- data
    train_ds, val_ds = build_datasets()

    # --- tokenizer & tokenisation
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, token=ACCESS_TOKEN)

    def tok_fn(ex):
        return tokenizer(
            ex["transcription"], truncation=True, padding="max_length", max_length=MAX_LEN
        )

    train_ds = train_ds.map(tok_fn, batched=True)
    val_ds   = val_ds.map(tok_fn, batched=True)

    # --- model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(EMOTION2ID)
    ).to(device)

    # --- training args (hyper-parameters come from the config cell)
    args = TrainingArguments(
        output_dir       = "./tmp_runs",
        num_train_epochs = EPOCHS,
        per_device_train_batch_size = BATCH_SIZE,
        gradient_accumulation_steps = accum_steps,
        learning_rate    = LR,
        warmup_ratio     = 0.1,
        weight_decay     = 0.01,
        lr_scheduler_type= "cosine",
        fp16             = use_cuda,     # mixed precision only when a GPU is present
        logging_strategy = "epoch",
        eval_strategy    = "epoch",
        save_strategy    = "epoch",      # must match eval_strategy for load_best_model_at_end
        metric_for_best_model="eval_loss",
        load_best_model_at_end= True,
        greater_is_better=False,
        report_to        = ["none"],
    )

    # --- trainer
    saver_cb = SaveOnImprove(tokenizer, tag=ckpt_tag)
    trainer  = Trainer(
        model          = model,
        args           = args,
        train_dataset  = train_ds,
        eval_dataset   = val_ds,
        data_collator  = DataCollatorWithPadding(tokenizer),
        compute_metrics= compute_metrics,
        callbacks      = [saver_cb, EarlyStoppingCallback(early_stopping_patience=5)],
    )
    trainer.train()

    # --- recap
    print("\nTraining stopped at epoch", int(trainer.state.epoch))
    if saver_cb.saved_epochs:
        print("Best checkpoint epoch :", saver_cb.saved_epochs[-1])
    print("Evaluate final model :", trainer.evaluate())

if __name__ == "__main__":
    main()

In [None]:
# ============================================================
# 6.  Main training part with gradient accumulation = 8
# ============================================================
def main(accum_steps: int = ACCUM_STEPS, ckpt_tag: str = "FT6Large_epoch"):
    """Fine-tune ``MODEL_NAME`` for emotion classification (accumulation = 8 run).

    Pipeline: optional Hub login, dataset construction, tokenisation, training
    with per-epoch evaluation and early stopping (patience 5 on ``eval_loss``),
    then a final evaluation of the best model reloaded by the Trainer.

    Note: this definition replaces any earlier ``main`` in the notebook.

    Args:
        accum_steps: Gradient-accumulation steps passed to ``TrainingArguments``;
            defaults to the config constant ``ACCUM_STEPS``.
        ckpt_tag: Directory prefix handed to ``SaveOnImprove``
            (checkpoints land in ``<SAVE_DIR>/<ckpt_tag>_<epoch>``).
    """
    # --- login & device
    # The model is fetched by name; only log in when a token is configured.
    if ACCESS_TOKEN:
        login(token=ACCESS_TOKEN)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Using device:", device)

    # --- data
    train_ds, val_ds = build_datasets()

    # --- tokenizer & tokenisation
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, token=ACCESS_TOKEN)

    def tok_fn(ex):
        return tokenizer(
            ex["transcription"], truncation=True, padding="max_length", max_length=MAX_LEN
        )

    train_ds = train_ds.map(tok_fn, batched=True)
    val_ds   = val_ds.map(tok_fn, batched=True)

    # --- model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(EMOTION2ID)
    ).to(device)

    # --- training args (hyper-parameters come from the config cell)
    args = TrainingArguments(
        output_dir       = "./tmp_runs",
        num_train_epochs = EPOCHS,
        per_device_train_batch_size = BATCH_SIZE,
        gradient_accumulation_steps = accum_steps,
        learning_rate    = LR,
        warmup_ratio     = 0.1,
        weight_decay     = 0.01,
        lr_scheduler_type= "cosine",
        fp16             = use_cuda,     # mixed precision only when a GPU is present
        logging_strategy = "epoch",
        eval_strategy    = "epoch",      # same keyword as the accumulation-6 cell
        save_strategy    = "epoch",      # Trainer checkpoints each epoch (needed for
                                         # load_best_model_at_end); SaveOnImprove
                                         # additionally exports the best models
        metric_for_best_model="eval_loss",
        load_best_model_at_end= True,
        greater_is_better=False,
        report_to        = ["none"],
    )

    # --- trainer
    saver_cb = SaveOnImprove(tokenizer, tag=ckpt_tag)
    trainer  = Trainer(
        model          = model,
        args           = args,
        train_dataset  = train_ds,
        eval_dataset   = val_ds,
        data_collator  = DataCollatorWithPadding(tokenizer),
        compute_metrics= compute_metrics,
        callbacks      = [saver_cb, EarlyStoppingCallback(early_stopping_patience=5)],
    )
    trainer.train()

    # --- recap
    print("\nTraining stopped at epoch", int(trainer.state.epoch))
    if saver_cb.saved_epochs:
        print("Best checkpoint epoch :", saver_cb.saved_epochs[-1])
    print("Evaluate final model :", trainer.evaluate())

if __name__ == "__main__":
    main()

In [None]:
# ------------------------------------------------------------------
# Demo of DeBERTa
# ------------------------------------------------------------------

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ------------------------------------------------------------------
# 0.  Define all checkpoints you want to compare
# ------------------------------------------------------------------
# Display name -> directory of a fine-tuned checkpoint to compare.
CHECKPOINTS = dict([
    ("DeBERTa v3 base: batch_size=16, gradient_accum=8, lr=1e-6",
     "./saved_deberta_model/FT6_epoch_22"),
    ("DeBERTa v3 large: batch_size = 16, gradient_accumu=6, lr = 1e-6,",
     "./saved_deberta_model/FT7Large_epoch_9"),
    ("DeBERTa v3 large: batch_size=16, gradient_accum=8, lr=1e-6",
     "./saved_deberta_model/FT6Large_epoch_10"),
])

# Tokenizer max_length at inference time; keep equal to the training value.
MAX_LEN = 128
# Class id -> one-letter emotion code (position in the string == class id).
ID2EMOTION = dict(enumerate("ASHUFDCNOX"))

# ------------------------------------------------------------------
# 1.  Load each (tokenizer, model) pair exactly once and keep it around
# ------------------------------------------------------------------
def _load_pair(ckpt_path):
    """Return ``(tokenizer, model)`` for one checkpoint; the model is in eval mode."""
    tokenizer_ = AutoTokenizer.from_pretrained(ckpt_path, use_fast=False)
    model_ = AutoModelForSequenceClassification.from_pretrained(ckpt_path)
    return tokenizer_, model_.eval()

models = {label: _load_pair(ckpt_path) for label, ckpt_path in CHECKPOINTS.items()}

# ------------------------------------------------------------------
# 2.  Prediction helper
# ------------------------------------------------------------------
def predict(text: str, tok, mdl):
    """Classify one sentence with a tokenizer/model pair.

    Args:
        text: Sentence to classify.
        tok: Tokenizer; called with ``max_length=MAX_LEN`` padding/truncation
            and ``return_tensors="pt"``.
        mdl: Model; called with the tokenizer output, must expose ``.logits``.

    Returns:
        dict with ``pred_emotion`` (code from ``ID2EMOTION`` for the most
        probable class), ``confidence`` (that class's soft-max probability)
        and ``all_probabilities`` (the full probability list).
    """
    encoded = tok(
        text,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        output = mdl(**encoded)

    probabilities = F.softmax(output.logits, dim=-1).squeeze().tolist()
    top_class = int(np.argmax(probabilities))
    return {
        "pred_emotion"    : ID2EMOTION[top_class],
        "confidence"      : float(max(probabilities)),
        "all_probabilities": probabilities,
    }


In [None]:
# 3. Present Results
# Sentence to classify -- edit freely.
sentence = "This is so frustrating—I’m tired of repeating myself!"

rule = "-" * 85   # horizontal separator between table rows

# Table header
print(f"\n{'Model':<60} | {'Emotion':<7} | {'Confidence':<10}")
print(rule)

# One summary row per cached model, followed by its full distribution.
for label, (tokenizer_, model_) in models.items():
    result = predict(sentence, tokenizer_, model_)
    emotion, confidence = result["pred_emotion"], result["confidence"]
    print(f"{label:<60} | {emotion:<7} | {confidence:<10.4f}")
    print("  Probabilities by emotion:")
    for class_id, probability in enumerate(result["all_probabilities"]):
        print(f"    {ID2EMOTION[class_id]}: {probability:.4f}")
    print(rule)
