DAC ITS - 2025 viva LnT

In [None]:
!unzip /content/data-analysis-competition-2025.zip

Archive:  /content/data-analysis-competition-2025.zip
  inflating: Test/claude.csv         
  inflating: Test/deepseek.csv       
  inflating: Test/gemini.csv         
  inflating: Test/gpt.csv            
  inflating: Test/grok.csv           
  inflating: Test/perplexity.csv     
  inflating: Train/claude.csv        
  inflating: Train/deepseek.csv      
  inflating: Train/gemini.csv        
  inflating: Train/gpt.csv           
  inflating: Train/grok.csv          
  inflating: Train/perplexity.csv    
  inflating: sample_submission.csv   


In [None]:
import os, glob, re, random, gc
from pathlib import Path
import numpy as np, pandas as pd
import torch, torch.nn as nn, torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding,
                          EarlyStoppingCallback)

# Config: model checkpoint, output locations, data globs and reproducibility settings.
MODEL_NAME = "microsoft/deberta-v3-base"
OUT_DIR = "/content/deberta_max_t4_opt"  # trainer output dir; the final model and tokenizer are saved here
PRED_DIR = "/content/predictions_per_model_max_opt"  # one prediction CSV per test file is written here
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(PRED_DIR, exist_ok=True)
TRAIN_GLOBS = ["/content/Train/*.csv"]
TEST_GLOBS  = ["/content/Test/*.csv"]
MAX_LEN = 128            # tokenizer truncation length (reduced from 160 to 128)
RND = 42
# Seed the Python, NumPy and PyTorch random number generators with the same value.
random.seed(RND); np.random.seed(RND); torch.manual_seed(RND)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): cudnn.benchmark enables cuDNN kernel auto-tuning, which may make
# runs non-deterministic despite the seeds above; confirm this trade-off is intended.
torch.backends.cudnn.benchmark = True

def find_csvs(globs):
    """Expand every glob pattern and return the matching CSV paths, sorted.

    A path is kept only if it ends in ".csv" (case-insensitive) and its file
    stem does not contain "sample" (case-insensitive).
    """
    matches = [hit for pattern in globs for hit in glob.glob(pattern)]
    kept = []
    for candidate in matches:
        if not candidate.lower().endswith(".csv"):
            continue
        if "sample" in Path(candidate).stem.lower():
            continue
        kept.append(candidate)
    kept.sort()
    return kept

def detect_text_col(df):
    """Pick the column that holds the free text.

    Preference order: the first column whose lower-cased name is a known text
    name, then the first object-dtype column, then the first column.
    """
    known_names = ("comment", "text", "review", "body", "content", "message")
    named = next((col for col in df.columns if col.lower() in known_names), None)
    if named is not None:
        return named
    for col in df.columns:
        if df[col].dtype == object:
            return col
    return df.columns[0]

def detect_label_col(df):
    """Pick the column that holds the class label.

    A column whose lower-cased name is "sentiment", "label" or "target" wins.
    Otherwise the first integer-typed column is used, skipping columns that
    are named like an identifier: an integer ID column (e.g. "CommentId") is
    never a valid label and would otherwise be picked first.

    Raises:
        ValueError: if no suitable column exists.
    """
    label_names = ("sentiment", "label", "target")
    id_names = ("commentid", "id", "comment_id", "comment id")
    for c in df.columns:
        if str(c).lower() in label_names:
            return c
    for c in df.columns:
        if str(c).lower() in id_names:
            continue
        # is_integer_dtype also understands pandas extension dtypes (nullable
        # Int64, string dtypes), on which np.issubdtype raises TypeError.
        if pd.api.types.is_integer_dtype(df[c].dtype):
            return c
    raise ValueError("No label column found")

def detect_id_col(df):
    """Return the first column named like a comment identifier, or None if there is none."""
    id_aliases = ("commentid", "id", "comment_id", "comment id")
    return next((col for col in df.columns if col.lower() in id_aliases), None)

# Load every training CSV, keep only its text and label columns under the
# canonical names "text" / "label", and stack the files into one frame.
train_paths = find_csvs(TRAIN_GLOBS)
if len(train_paths)==0:
    raise FileNotFoundError("No train CSVs found")
dfs=[]
for p in train_paths:
    df = pd.read_csv(p)
    tcol = detect_text_col(df)
    lcol = detect_label_col(df)
    df = df[[tcol, lcol]].rename(columns={tcol:"text", lcol:"label"})
    dfs.append(df)
# Drop rows that miss either field: a NaN label would otherwise make the
# integer cast below raise.
all_train = pd.concat(dfs, ignore_index=True).dropna(subset=["text", "label"]).reset_index(drop=True)
all_train["text"] = all_train["text"].astype(str)
all_train["label"] = all_train["label"].astype(int)

# Patterns used by clean_text, applied in the order listed there.
_url_re = re.compile(r"https?://\S+|www\.\S+")
_mention_re = re.compile(r"@\w+")
_nonprint_re = re.compile(r"[\r\n\t]")
_multi_space = re.compile(r"\s+")
_repeat_punct = re.compile(r"([!?\.]){2,}")

def clean_text(s):
    """Normalise one comment string.

    URLs, @mentions and CR/LF/TAB characters become a space, runs of two or
    more of "!", "?" or "." collapse to the last character of the run, all
    whitespace runs collapse to one space, and the result is stripped.
    A falsy input (None or "") yields "".
    """
    cleaned = s if s else ""
    substitutions = (
        (_url_re, " "),
        (_mention_re, " "),
        (_nonprint_re, " "),
        (_repeat_punct, r"\1"),
        (_multi_space, " "),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

all_train["text"] = all_train["text"].map(clean_text)

VAL_FRAC = 0.10
MAX_TRAIN = 50000 # total row budget for train + validation (reduced from 65000)
max_val_rows = int(VAL_FRAC * MAX_TRAIN)
max_train_rows = MAX_TRAIN - max_val_rows

def _stratified_cap(frame, max_rows):
    """Down-sample `frame` to about `max_rows` rows while keeping the label proportions."""
    if len(frame) <= max_rows:
        return frame
    frac = max_rows / len(frame)
    # GroupBy.sample replaces groupby(...).apply(lambda g: g.sample(...)), whose
    # use of the grouping column inside apply is deprecated in pandas.
    return frame.groupby("label", group_keys=False).sample(frac=frac, random_state=RND)

# Hold out the validation rows BEFORE any augmentation. Oversampling with
# replacement and synthetic rows built from training texts would otherwise put
# copies (or fragments) of training rows into validation and inflate its scores.
base_train_df, val_df = train_test_split(all_train, test_size=VAL_FRAC, stratify=all_train["label"], random_state=RND)
val_df = _stratified_cap(val_df, max_val_rows).reset_index(drop=True)

# Per-class views of the training portion only; synth_mixed reads df0 and df2.
df0 = base_train_df[base_train_df["label"]==0]
df1 = base_train_df[base_train_df["label"]==1]
df2 = base_train_df[base_train_df["label"]==2]

def synth_mixed(n):
    """Create `n` synthetic label-1 rows by joining one random label-2 text and
    one random label-0 text with " // ", in random order.

    Returns an empty frame with the columns of all_train when either class
    has no rows.
    """
    out=[]
    if len(df0)==0 or len(df2)==0:
        return pd.DataFrame([], columns=all_train.columns)
    for _ in range(n):
        a = df2.sample(1).iloc[0]["text"]
        b = df0.sample(1).iloc[0]["text"]
        txt = (a + " // " + b) if random.random()<0.5 else (b + " // " + a)
        out.append({"text": txt, "label": 1})
    return pd.DataFrame(out)

n_synth = min(1500, max(100, int(0.025 * len(base_train_df)))) # Fewer synthetic
synth_df = synth_mixed(n_synth)

# Oversample label 1 up to 12% of the training portion. When no real label-1
# rows exist, the synthetic rows are the sampling pool instead.
target_neutral = int(0.12 * len(base_train_df))
if len(df1) < target_neutral:
    need = target_neutral - len(df1)
    pool = df1 if len(df1)>0 else synth_df
    if len(pool) > 0:
        ups = pool.sample(need, replace=True, random_state=RND)
        df1_aug = pd.concat([df1, ups], ignore_index=True)
    else:
        # Nothing to sample from (no label-1 rows and no synthetic rows).
        df1_aug = df1
else:
    df1_aug = df1

aug_df = pd.concat([df0, df1_aug, df2, synth_df], ignore_index=True).sample(frac=1.0, random_state=RND).reset_index(drop=True)
# These counts describe the augmented training portion only.
print("Counts after augmentation:", aug_df["label"].value_counts().to_dict())

aug_df = _stratified_cap(aug_df, max_train_rows)

# Re-shuffle because the stratified cap returns rows grouped by label.
train_df = aug_df.sample(frac=1.0, random_state=RND).reset_index(drop=True)

print("Initializing tokenizer...", MODEL_NAME)
# Load the fast tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_texts(texts):
    """Tokenize a list of strings, truncating each to MAX_LEN tokens.

    No padding is applied here; sequences keep their individual lengths.
    """
    return tokenizer(texts, truncation=True, padding=False, max_length=MAX_LEN) # padding=False saves compute & memory

# Pre-tokenize both splits once, up front.
train_enc = tokenize_texts(train_df["text"].tolist())
val_enc   = tokenize_texts(val_df["text"].tolist())

class TorchDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings with optional labels.

    Items are plain dicts of un-padded per-example values; tensor conversion
    and padding are left to the data collator.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {}
        for field in self.encodings:
            sample[field] = self.encodings[field][idx]
        if self.labels is None:
            return sample
        # Labels are stored under "labels" as plain Python ints.
        sample["labels"] = int(self.labels[idx])
        return sample

# Wrap the tokenized splits together with their integer labels.
train_dataset = TorchDataset(train_enc, train_df["label"].tolist())
val_dataset = TorchDataset(val_enc, val_df["label"].tolist())

train_labels = np.array(train_df["label"].tolist())
# The number of classes is inferred as (largest training label + 1).
num_labels = int(train_labels.max()) + 1
# "balanced" class weights computed from the training labels.
cw = compute_class_weight(class_weight="balanced", classes=np.arange(num_labels), y=train_labels)
cw = np.maximum(cw, 1e-6)  # floor every weight at 1e-6
print("Class weights:", cw.round(4).tolist())

class FocalLoss(nn.Module):
    """Multi-class focal loss: (1 - p_t) ** gamma * CE, optionally class-weighted.

    Args:
        gamma: focusing exponent; 0 reduces the loss to plain cross-entropy.
        alpha: optional per-class weights (sequence or tensor indexed by class id).
        reduction: "mean", "sum" or "none" (per-example losses).

    Raises:
        ValueError: if `reduction` is not one of the three supported values.
    """
    def __init__(self, gamma=2.0, alpha=None, reduction="mean"):
        super().__init__()
        # Reject unknown values up front: previously anything other than
        # "mean" (including "none" and typos) silently produced the sum.
        if reduction not in ("mean", "sum", "none"):
            raise ValueError(f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}")
        self.gamma = gamma
        self.reduction = reduction
        self.alpha = torch.as_tensor(alpha, dtype=torch.float) if alpha is not None else None

    def forward(self, logits, targets):
        """Return the focal loss for `logits` (N, C) and integer class `targets` (N,)."""
        ce = F.cross_entropy(logits, targets, reduction="none")
        p_t = torch.exp(-ce)  # probability assigned to the true class
        loss = ((1 - p_t) ** self.gamma) * ce
        if self.alpha is not None:
            at = self.alpha.to(loss.device)[targets]
            loss = at * loss
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

class MaxTrainer(Trainer):
    """Trainer with a selectable loss: focal loss, class-weighted cross-entropy,
    or plain cross-entropy.

    The loss options are accepted in addition to the regular Trainer arguments,
    which are forwarded unchanged to the parent constructor.
    """

    def __init__(self, use_focal=False, focal_gamma=2.0, focal_alpha=None, use_weighted_ce=False, weight_tensor=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_focal = use_focal
        self.focal_gamma = focal_gamma
        self.focal_alpha = focal_alpha
        self.use_weighted_ce = use_weighted_ce
        if weight_tensor is None:
            self.weight_tensor = None
        else:
            self.weight_tensor = torch.tensor(weight_tensor, dtype=torch.float)
        if self.use_focal:
            self.focal = FocalLoss(gamma=self.focal_gamma, alpha=self.focal_alpha)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the configured loss for one batch.

        "labels" is removed from `inputs` before the forward pass. Returns the
        loss, or (loss, outputs) when `return_outputs` is true.

        Raises:
            KeyError: if the batch carries no "labels" entry.
        """
        labels = inputs.pop("labels", None)
        if labels is None:
            raise KeyError("No labels found")

        device = model.device
        if isinstance(labels, torch.Tensor):
            labels = labels.to(device)
        else:
            labels = torch.tensor(labels, device=device)
        if labels.dtype != torch.long:
            labels = labels.long()

        # Move tensor inputs to the model's device; pass anything else through.
        model_inputs = {}
        for key, value in inputs.items():
            model_inputs[key] = value.to(device) if isinstance(value, torch.Tensor) else value
        outputs = model(**model_inputs)
        logits = outputs.logits

        if self.use_focal:
            loss = self.focal(logits, labels)
        else:
            class_weights = None
            if self.use_weighted_ce and (self.weight_tensor is not None):
                class_weights = self.weight_tensor.to(logits.device)
            loss = F.cross_entropy(logits, labels, weight=class_weights)

        if return_outputs:
            return (loss, outputs)
        return loss

def compute_metrics(eval_pred):
    """Print an evaluation report and return macro / per-class F1.

    Args:
        eval_pred: (logits, labels) pair; predictions are the argmax over the
            last axis of the logits.

    Returns:
        dict with keys "macro_f1", "f1_0", "f1_1" and "f1_2".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    class_ids = [0, 1, 2]
    # Every metric is pinned to the same class list, so macro F1 is always the
    # mean of the three per-class scores returned below, even if a class is
    # missing from this evaluation set. zero_division=0 keeps the score of an
    # absent class at 0 without emitting a warning.
    per_class = f1_score(labels, preds, average=None, labels=class_ids, zero_division=0)
    macro = float(f1_score(labels, preds, average="macro", labels=class_ids, zero_division=0))
    # Passing labels= keeps target_names aligned with the classes, so the
    # report no longer needs an exception fallback.
    crep = classification_report(labels, preds, labels=class_ids, digits=4,
                                 target_names=["0_neg","1_neu/mix","2_pos"], zero_division=0)
    cm = confusion_matrix(labels, preds, labels=class_ids)
    print("\n===== Evaluation report (epoch) =====")
    print(crep)
    print("Per-class F1 (0,1,2):", np.round(per_class,4).tolist())
    print("Macro F1:", round(macro,4))
    print("3x3 Confusion matrix (rows=true, cols=pred):")
    print(cm)
    print("====================================\n")
    return {"macro_f1": macro, "f1_0": float(per_class[0]), "f1_1": float(per_class[1]), "f1_2": float(per_class[2])}

def safe_model_init(enable_checkpointing=False):
    """Load a fresh sequence-classification model from MODEL_NAME with num_labels outputs.

    Args:
        enable_checkpointing: when true, try to turn on gradient checkpointing.
            This stays best-effort: a failure is reported and the model is
            returned without checkpointing.
    """
    m = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, ignore_mismatched_sizes=True, low_cpu_mem_usage=True)
    if enable_checkpointing:
        try:
            m.gradient_checkpointing_enable()
        except Exception as exc:
            # Report instead of silently swallowing, so a run that was meant to
            # use checkpointing does not lose it unnoticed.
            print("Warning: could not enable gradient checkpointing; continuing without it:", repr(exc))
    return m

# Batch sizes and epoch count. fallback_batch is the smaller batch size used
# when a run reports a CUDA out-of-memory error.
preferred_batch = 8
fallback_batch = 4
effective_epochs = 2  # epochs for the final training run

# Hyper-parameters shared by the trial run and the final run.
best_cfg = {"learning_rate":2e-5, "per_device_train_batch_size":preferred_batch, "gradient_accumulation_steps":1}
per_device_train_batch_size = best_cfg["per_device_train_batch_size"]
# Evaluation batch: twice the train batch, but never below 16.
per_device_eval_batch_size = max(16, per_device_train_batch_size*2)

# Arguments for the one-epoch trial run: evaluate at the end of the epoch and
# save no checkpoints. fp16 is only switched on when CUDA is available.
hpo_args = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, "hpo"),
    num_train_epochs=1,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=best_cfg["gradient_accumulation_steps"],
    learning_rate=best_cfg["learning_rate"],
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=200,
    weight_decay=0.01,
    fp16=True if DEVICE=="cuda" else False,
    dataloader_num_workers=4,
    report_to="none"
)

# Pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)

def _make_hpo_trainer(args):
    """Build the trial-run trainer (weighted cross-entropy, no gradient checkpointing)."""
    return MaxTrainer(
        model_init=lambda: safe_model_init(enable_checkpointing=False),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        use_focal=False,
        use_weighted_ce=True,
        weight_tensor=cw.tolist()
    )

hpo_trainer = _make_hpo_trainer(hpo_args)

print("Running quick HPO-validation pass to catch OOMs and check metrics")
hpo_oom = False
try:
    hpo_trainer.train()
    hpo_res = hpo_trainer.evaluate()
    print("HPO eval:", hpo_res)
except RuntimeError as e:
    # Only record what happened here. Recovery runs after the except block,
    # because the live exception keeps the failed step's stack frames (and
    # their tensors) referenced, so memory cannot be released from in here.
    # Non-OOM errors are reported and the notebook carries on, as before.
    msg = str(e)
    print("HPO error:", msg[:300])
    hpo_oom = "out of memory" in msg.lower()

if hpo_oom:
    print("OOM during HPO: lowering batch size and retrying with batch", fallback_batch)
    # Drop the failed trainer (model + optimizer state) before freeing the
    # cache; otherwise its GPU memory is still held while the retry starts.
    del hpo_trainer
    gc.collect(); torch.cuda.empty_cache()
    best_cfg["per_device_train_batch_size"] = fallback_batch
    per_device_train_batch_size = fallback_batch
    per_device_eval_batch_size = max(8, per_device_train_batch_size*2)
    # Update the batch sizes on the existing arguments object. The previous
    # rebuild via to_sanitized_dict() fed the constructor derived keys such as
    # train_batch_size that it does not accept.
    hpo_args.per_device_train_batch_size = per_device_train_batch_size
    hpo_args.per_device_eval_batch_size = per_device_eval_batch_size
    hpo_trainer = _make_hpo_trainer(hpo_args)
    hpo_trainer.train(); hpo_res = hpo_trainer.evaluate(); print("HPO eval (retry):", hpo_res)

# The trial model is discarded; release its memory before the final run.
del hpo_trainer
gc.collect(); torch.cuda.empty_cache()

# Arguments for the final run: evaluate and checkpoint every epoch, then
# reload the checkpoint with the highest "macro_f1" when training ends.
final_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=effective_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=best_cfg["gradient_accumulation_steps"],
    learning_rate=best_cfg["learning_rate"],
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    greater_is_better=True,
    fp16=True if DEVICE=="cuda" else False,
    dataloader_num_workers=4,
    report_to="none"
)

def run_final(enable_checkpointing):
    """Run the final training with final_args and evaluate on the validation set.

    Args:
        enable_checkpointing: forwarded to safe_model_init to request gradient
            checkpointing on the freshly created model.

    Returns:
        (trainer, metrics) where metrics is the result of trainer.evaluate().
    """
    def _fresh_model():
        return safe_model_init(enable_checkpointing=enable_checkpointing)

    trainer_kwargs = dict(
        model_init=_fresh_model,
        args=final_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        use_focal=False,
        use_weighted_ce=True,
        weight_tensor=cw.tolist(),
    )
    trainer = MaxTrainer(**trainer_kwargs)
    # Early stopping is registered after construction, with a patience of 2 evaluations.
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))
    trainer.train()
    return trainer, trainer.evaluate()

print("Starting final training with gradient checkpointing enabled (safer memory use).")
retry_reason = None
try:
    final_trainer, final_res = run_final(enable_checkpointing=True)
except RuntimeError as e:
    # Classify the failure only. The retry happens after this block: while the
    # exception is alive its traceback keeps the failed run's frames (trainer,
    # model, activations) referenced, so GPU memory cannot be reclaimed here.
    msg = str(e)
    print("Final training error:", msg[:300])
    if "backward through the graph a second time" in msg or "retain_graph" in msg:
        retry_reason = "checkpointing"
    elif "out of memory" in msg.lower():
        retry_reason = "oom"
    else:
        raise

if retry_reason is not None:
    # Collect garbage first so the failed run's tensors are actually freed,
    # then hand the cached blocks back to the device.
    gc.collect(); torch.cuda.empty_cache()
    if retry_reason == "checkpointing":
        print("Retrying final training WITHOUT gradient checkpointing (workaround).")
    else:
        print("OOM on final training: lowering batch size and retrying.")
        final_args.per_device_train_batch_size = fallback_batch
        final_args.per_device_eval_batch_size = max(8, fallback_batch*2)
    # Both retry paths run without gradient checkpointing, as before.
    final_trainer, final_res = run_final(enable_checkpointing=False)

print("Final eval:", final_res)
# Persist the model and its tokenizer side by side in OUT_DIR.
final_trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
test_paths = find_csvs(TEST_GLOBS)
if len(test_paths) == 0:
    print("No test CSVs found; skipping prediction.")
else:
    def predict_on_file(path, batch=64):
        """Predict a sentiment class for every row of one test CSV.

        Args:
            path: CSV file to score.
            batch: number of texts tokenized and scored per forward pass.

        Returns:
            DataFrame with columns "CommentId" and "Sentiment". When the file
            has no ID column, IDs are generated as "<file stem>_<row number>"
            with rows numbered from 1.
        """
        df = pd.read_csv(path)
        tcol = detect_text_col(df)
        idcol = detect_id_col(df)
        # Apply the same normalisation the training and validation text went
        # through; scoring raw text would feed the model a different input
        # distribution than the one it was trained on.
        texts = [clean_text(t) for t in df[tcol].fillna("").astype(str).tolist()]
        ids = df[idcol].astype(str).tolist() if idcol else [f"{Path(path).stem}_{i+1}" for i in range(len(texts))]
        model = final_trainer.model
        model.eval()  # make sure dropout is off for inference
        device = model.device
        preds = []
        # Honour the `batch` argument (chunks were previously fixed at 512).
        for start in range(0, len(texts), batch):
            chunk = texts[start:start+batch]
            enc = tokenizer(chunk, truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt")
            input_ids = enc["input_ids"].to(device)
            attention_mask = enc["attention_mask"].to(device)
            with torch.no_grad():
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds.extend(logits.detach().cpu().numpy().argmax(axis=-1).tolist())
        return pd.DataFrame({"CommentId": ids, "Sentiment": preds})

    # Score every test file and write one "<stem>_preds.csv" per file.
    saved = {}
    for p in test_paths:
        stem = Path(p).stem.lower()
        out_df = predict_on_file(p, batch=64)
        out_path = os.path.join(PRED_DIR, f"{stem}_preds.csv")
        out_df.to_csv(out_path, index=False)
        saved[stem] = out_path

    print("Saved per-model prediction CSVs:")
    for k,v in saved.items():
        print(" -", k, "->", v)

# Score the validation set once more with the final trainer and print a
# stand-alone report built from the argmax predictions.
preds = final_trainer.predict(val_dataset)
y_pred = preds.predictions.argmax(axis=-1)
y_true = np.array(val_df["label"].tolist())
print("\nFINAL Validation report:")
print(classification_report(y_true, y_pred, digits=4, target_names=["0_neg","1_neu/mix","2_pos"]))
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))
print("3x3 Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_true, y_pred, labels=[0,1,2]))

Counts after augmentation: {2: 107771, 0: 17520, 1: 17205}
Initializing tokenizer... microsoft/deberta-v3-base


  aug_df = aug_df.groupby("label", group_keys=False).apply(lambda g: g.sample(frac=frac, random_state=RND)).reset_index(drop=True)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Class weights: [2.711, 2.7609, 0.4407]


  super().__init__(*args, **kwargs)


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running quick HPO-validation pass to catch OOMs and check metrics


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1,F1 0,F1 1,F1 2
1,0.7824,0.809611,0.710097,0.729236,0.467327,0.933727



===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7453    0.7138    0.7292       615
   1_neu/mix     0.5813    0.3907    0.4673       604
       2_pos     0.9076    0.9614    0.9337      3781

    accuracy                         0.8620      5000
   macro avg     0.7447    0.6886    0.7101      5000
weighted avg     0.8482    0.8620    0.8522      5000

Per-class F1 (0,1,2): [0.7292, 0.4673, 0.9337]
Macro F1: 0.7101
3x3 Confusion matrix (rows=true, cols=pred):
[[ 439   74  102]
 [ 100  236  268]
 [  50   96 3635]]






===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7453    0.7138    0.7292       615
   1_neu/mix     0.5813    0.3907    0.4673       604
       2_pos     0.9076    0.9614    0.9337      3781

    accuracy                         0.8620      5000
   macro avg     0.7447    0.6886    0.7101      5000
weighted avg     0.8482    0.8620    0.8522      5000

Per-class F1 (0,1,2): [0.7292, 0.4673, 0.9337]
Macro F1: 0.7101
3x3 Confusion matrix (rows=true, cols=pred):
[[ 439   74  102]
 [ 100  236  268]
 [  50   96 3635]]

HPO eval: {'eval_loss': 0.8096112012863159, 'eval_macro_f1': 0.7100966052478, 'eval_f1_0': 0.729235880398671, 'eval_f1_1': 0.46732673267326735, 'eval_f1_2': 0.9337272026714616, 'eval_runtime': 11.0909, 'eval_samples_per_second': 450.818, 'eval_steps_per_second': 28.221, 'epoch': 1.0}
Starting final training with gradient checkpointing enabled (safer memory use).


  super().__init__(*args, **kwargs)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final training error: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time o
Retrying final training WITHOUT gradient checkpointing (workaround).


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1,F1 0,F1 1,F1 2
1,0.7318,0.779966,0.679042,0.726817,0.418478,0.891829
2,0.6751,0.88314,0.722113,0.74148,0.490821,0.934038



===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7474    0.7073    0.7268       615
   1_neu/mix     0.3548    0.5099    0.4185       604
       2_pos     0.9208    0.8646    0.8918      3781

    accuracy                         0.8024      5000
   macro avg     0.6744    0.6939    0.6790      5000
weighted avg     0.8311    0.8024    0.8144      5000

Per-class F1 (0,1,2): [0.7268, 0.4185, 0.8918]
Macro F1: 0.679
3x3 Confusion matrix (rows=true, cols=pred):
[[ 435   91   89]
 [ 104  308  192]
 [  43  469 3269]]






===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7585    0.7252    0.7415       615
   1_neu/mix     0.5893    0.4205    0.4908       604
       2_pos     0.9106    0.9587    0.9340      3781

    accuracy                         0.8650      5000
   macro avg     0.7528    0.7015    0.7221      5000
weighted avg     0.8531    0.8650    0.8568      5000

Per-class F1 (0,1,2): [0.7415, 0.4908, 0.934]
Macro F1: 0.7221
3x3 Confusion matrix (rows=true, cols=pred):
[[ 446   68  101]
 [  95  254  255]
 [  47  109 3625]]






===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7585    0.7252    0.7415       615
   1_neu/mix     0.5893    0.4205    0.4908       604
       2_pos     0.9106    0.9587    0.9340      3781

    accuracy                         0.8650      5000
   macro avg     0.7528    0.7015    0.7221      5000
weighted avg     0.8531    0.8650    0.8568      5000

Per-class F1 (0,1,2): [0.7415, 0.4908, 0.934]
Macro F1: 0.7221
3x3 Confusion matrix (rows=true, cols=pred):
[[ 446   68  101]
 [  95  254  255]
 [  47  109 3625]]

Final eval: {'eval_loss': 0.8831400275230408, 'eval_macro_f1': 0.7221128364855595, 'eval_f1_0': 0.741479634247714, 'eval_f1_1': 0.49082125603864735, 'eval_f1_2': 0.9340376191703169, 'eval_runtime': 10.972, 'eval_samples_per_second': 455.706, 'eval_steps_per_second': 28.527, 'epoch': 2.0}
Saved per-model prediction CSVs:
 - claude -> /content/predictions_per_model_max_opt/claude_preds.csv
 - deepseek -> /content/p




===== Evaluation report (epoch) =====
              precision    recall  f1-score   support

       0_neg     0.7585    0.7252    0.7415       615
   1_neu/mix     0.5893    0.4205    0.4908       604
       2_pos     0.9106    0.9587    0.9340      3781

    accuracy                         0.8650      5000
   macro avg     0.7528    0.7015    0.7221      5000
weighted avg     0.8531    0.8650    0.8568      5000

Per-class F1 (0,1,2): [0.7415, 0.4908, 0.934]
Macro F1: 0.7221
3x3 Confusion matrix (rows=true, cols=pred):
[[ 446   68  101]
 [  95  254  255]
 [  47  109 3625]]


FINAL Validation report:
              precision    recall  f1-score   support

       0_neg     0.7585    0.7252    0.7415       615
   1_neu/mix     0.5893    0.4205    0.4908       604
       2_pos     0.9106    0.9587    0.9340      3781

    accuracy                         0.8650      5000
   macro avg     0.7528    0.7015    0.7221      5000
weighted avg     0.8531    0.8650    0.8568      5000

Macro F1:

In [None]:
import pandas as pd

# Per-model prediction files, listed in the order they should appear in the
# combined submission.
file_paths = [
    "/content/predictions_per_model_max_opt/gpt_preds.csv",
    "/content/predictions_per_model_max_opt/claude_preds.csv",
    "/content/predictions_per_model_max_opt/deepseek_preds.csv",
    "/content/predictions_per_model_max_opt/gemini_preds.csv",
    "/content/predictions_per_model_max_opt/grok_preds.csv",
    "/content/predictions_per_model_max_opt/perplexity_preds.csv",
]
model_order = ["gpt", "claude", "deepseek", "gemini", "grok", "perplexity"]

processed_dfs = []

for file_path in file_paths:
    df = pd.read_csv(file_path)
    model_name = file_path.split('/')[-1].split('_')[0]
    prefix = model_name + '_'
    ids = df['CommentId'].astype(str)
    # Prefix only IDs that do not carry the model name yet: IDs generated at
    # prediction time as "<stem>_<n>" would otherwise become "<stem>_<stem>_<n>"
    # and break the numeric parse below.
    full_ids = ids.where(ids.str.startswith(prefix), prefix + ids)
    df['CommentId'] = full_ids
    # Keep the sort keys as columns instead of re-parsing them from the ID string.
    df['model_name'] = model_name
    df['numeric_id'] = full_ids.str[len(prefix):].astype(int)
    processed_dfs.append(df)

combined_df = pd.concat(processed_dfs, ignore_index=True)
# Order rows by model (in model_order) and then by numeric comment id.
combined_df['model_name'] = pd.Categorical(combined_df['model_name'], categories=model_order, ordered=True)
combined_df = combined_df.sort_values(by=['model_name', 'numeric_id']).reset_index(drop=True)
combined_df = combined_df.drop(columns=['model_name', 'numeric_id'])
combined_df.to_csv("DebertaOptimusPrime.csv", index=False)