In [1]:
# Report the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Mon Dec  1 12:50:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os

# Project root on the runtime; code and data live in sub-folders beneath it.
BASE_DIR = "/content/semeval2026-task13"

# Create both sub-folders up front (a no-op when they already exist).
os.makedirs(BASE_DIR + "/src", exist_ok=True)
os.makedirs(BASE_DIR + "/data", exist_ok=True)

# Last expression of the cell: echo the folder contents as the cell output.
os.listdir(BASE_DIR)

['src', 'data']

In [3]:
!pip install torch transformers pandas pyarrow scikit-learn tqdm



In [4]:
!pip install -q transformers accelerate scikit-learn


In [5]:
import pandas as pd

# Quick look at the Task A training file: first rows, then the column names.
df = pd.read_parquet(f"{BASE_DIR}/data/task_a_training_set_1.parquet")
# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell would be silently discarded; display it explicitly.
display(df.head())
df.columns


Index(['code', 'generator', 'label', 'language'], dtype='object')

In [7]:
# BLOCK 0: installs + imports
!pip install -q transformers accelerate scikit-learn

import os
import random
import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import Dict

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

# Project layout: everything lives under BASE_DIR, datasets under DATA_DIR.
BASE_DIR = "/content/semeval2026-task13"
DATA_DIR = BASE_DIR + "/data"

# Make sure both folders exist before anything tries to read from them.
os.makedirs(BASE_DIR + "/src", exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

print("Files in data/:", os.listdir(DATA_DIR))

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Files in data/: ['task_c_training_set_1.parquet', 'task_a_training_set_1.parquet', 'task_c_validation_set.parquet', 'task_a_validation_set.parquet', 'task_a_test_set_sample.parquet', 'task_c_test_set_sample.parquet', 'task_b_validation_set.parquet', 'task_b_training_set.parquet', 'task_b_test_set_sample.parquet']
Using device: cuda


In [8]:
# Task A: read the training parquet, build the label mapping, hold out a
# validation split and cap both splits for quick ablation runs.

a_train_path = DATA_DIR + "/task_a_training_set_1.parquet"
df_a_train = pd.read_parquet(a_train_path)

print("Task A train shape:", df_a_train.shape)
print("Task A train columns:", df_a_train.columns.tolist())
display(df_a_train.head())

# Column holding the source code and column holding the class label.
TEXT_COL  = "code"
LABEL_COL = "label"

# Keep just those two columns, then discard rows where either is missing.
df_a_train = df_a_train[[TEXT_COL, LABEL_COL]]
df_a_train = df_a_train.dropna().reset_index(drop=True)

# Map the sorted raw labels onto contiguous ids 0..num_labels-1 (and back).
labels = sorted(df_a_train[LABEL_COL].unique().tolist())
num_labels = len(labels)
label2id = dict(zip(labels, range(num_labels)))
id2label = {idx: lab for lab, idx in label2id.items()}

df_a_train["label_id"] = df_a_train[LABEL_COL].map(label2id)

print("Num labels:", num_labels)

# Hold out 10% for validation, stratified on the label id.
train_df, valid_df = train_test_split(
    df_a_train,
    test_size=0.1,
    stratify=df_a_train["label_id"],
    random_state=42,
)
print("Full Train size:", len(train_df), "Full Valid size:", len(valid_df))

# Cap both splits with a seeded random subsample (this step is not stratified).
max_train = 4000
max_valid = 1000

train_df = train_df.sample(n=max_train, random_state=42) if len(train_df) > max_train else train_df
valid_df = valid_df.sample(n=max_valid, random_state=42) if len(valid_df) > max_valid else valid_df

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print("Using subset -> Train:", len(train_df), "Valid:", len(valid_df))



Task A train shape: (500000, 4)
Task A train columns: ['code', 'generator', 'label', 'language']


Unnamed: 0,code,generator,label,language
0,"(a, b, c, d) = [int(x) for x in input().split(...",human,0,Python
1,valid version for the language; all others can...,Qwen/Qwen2.5-Coder-1.5B,1,Python
2,python\ndef min_cards_to_flip(s):\n vowels ...,Qwen/Qwen2.5-Coder-7B-Instruct,1,Python
3,T = int(input())\nfor t in range(T):\n\tcolor ...,human,0,Python
4,for i in range(int(input())):\n\tinput()\n\ta ...,human,0,Python


Num labels: 2
Full Train size: 450000 Full Valid size: 50000
Using subset -> Train: 4000 Valid: 1000


In [9]:
# Dataset + model definition

class CodeDataset(Dataset):
    """Dataset that tokenizes one code snippet per lookup.

    Texts are read from the ``TEXT_COL`` column of ``df`` and integer targets
    from its ``label_id`` column; both are copied into plain lists when the
    dataset is built, so later changes to ``df`` are not reflected.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df[TEXT_COL].tolist()
        self.labels = df["label_id"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for item ``idx`` plus a ``labels`` tensor."""
        # str() guards against non-string cells; every item is truncated/padded
        # to exactly max_len tokens.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Presumably each returned tensor carries a leading batch dimension of
        # size 1 (return_tensors="pt"); squeeze(0) drops it.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


class CodeBERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: checkpoint identifier handed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes (width of the linear head).
        dropout: dropout probability applied to the pooled vector.
    """

    def __init__(self, model_name, num_labels, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Run the encoder and classify from the hidden state at position 0.

        Args:
            input_ids: token ids, passed to the encoder unchanged.
            attention_mask: attention mask, passed to the encoder unchanged.
            labels: optional class ids; when given, a cross-entropy loss is
                computed against the logits.
            token_type_ids: optional segment ids. Accepted (and forwarded to
                the encoder only when provided) so that batches built from a
                tokenizer that emits this key can be passed as ``model(**batch)``
                without a ``TypeError``. It is last in the signature so that
                existing positional calls keep working.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"``.
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        out = self.encoder(**encoder_inputs)

        # Hidden state of the first token of every sequence (the CLS position).
        pooled = out.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}


In [10]:
# training helpers + config

@dataclass
class AblationConfig:
    """Settings for a single ablation run consumed by ``train_and_eval``."""

    # Label for the run; printed in the experiment banner and stored in the results.
    name: str
    # Checkpoint id used to load both the tokenizer and the encoder.
    model_name: str
    # Token length every example is truncated/padded to by CodeDataset.
    max_len: int = 256
    # AdamW learning rate; also used as max_lr when OneCycleLR is enabled.
    lr: float = 2e-5
    # Number of passes over the training loader.
    epochs: int = 1
    # Batch size of both the training and the validation loader.
    batch_size: int = 16
    # True: step a OneCycleLR scheduler every batch; False: constant learning rate.
    use_onecycle: bool = True
    # True: draw training samples with a class-balancing WeightedRandomSampler;
    # False: plain shuffling.
    use_balancing: bool = True
    # Dropout probability applied before the classification head.
    dropout: float = 0.1


def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and torch (CPU and all CUDA devices)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def create_dataloaders(cfg: AblationConfig, tokenizer):
    """Build the training and validation loaders for one ablation run.

    Reads the notebook-level ``train_df`` and ``valid_df`` frames, so the result
    depends on whichever task was prepared last in the kernel.

    Args:
        cfg: supplies ``max_len``, ``batch_size`` and ``use_balancing``.
        tokenizer: tokenizer handed to each ``CodeDataset``.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    train_dataset = CodeDataset(train_df, tokenizer, cfg.max_len)
    valid_dataset = CodeDataset(valid_df, tokenizer, cfg.max_len)

    if not cfg.use_balancing:
        # Ablation arm without balancing: ordinary shuffled batches.
        train_loader = DataLoader(
            train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
        )
    else:
        # Weight every sample by the inverse frequency of its class, then draw
        # len(train_df) samples with replacement.
        counts_by_class = train_df["label_id"].value_counts().to_dict()
        per_sample_weight = [1.0 / counts_by_class[lab] for lab in train_df["label_id"]]
        balanced_sampler = WeightedRandomSampler(
            weights=per_sample_weight,
            num_samples=len(per_sample_weight),
            replacement=True,
        )
        train_loader = DataLoader(
            train_dataset,
            batch_size=cfg.batch_size,
            sampler=balanced_sampler,
        )

    # Validation is never shuffled or re-weighted.
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
    )
    return train_loader, valid_loader


def evaluate(model, dataloader):
    """Compute accuracy and macro-F1 of ``model`` over ``dataloader``.

    The model is switched to eval mode and left there; gradients are disabled
    while iterating.

    Args:
        model: callable as ``model(**batch)`` returning a dict with ``"logits"``.
        dataloader: iterable of dict batches that include a ``"labels"`` entry.

    Returns:
        ``(accuracy, macro_f1)`` as floats.

    Raises:
        ValueError: if ``dataloader`` yields no examples, since both metrics
            are undefined in that case (previously this surfaced as an opaque
            ``ZeroDivisionError``).
    """
    model.eval()
    total = 0
    correct = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Labels are removed so the model only produces logits (no loss).
            labels = batch.pop("labels")
            out = model(**batch)
            logits = out["logits"]
            preds = torch.argmax(logits, dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        raise ValueError(
            "evaluate() received no examples; accuracy and F1 are undefined "
            "for an empty dataloader."
        )

    acc = correct / total
    f1_macro = f1_score(all_labels, all_preds, average="macro")
    return acc, f1_macro


def train_and_eval(cfg: AblationConfig) -> Dict:
    """Train one configuration from scratch and report its validation metrics.

    Uses the notebook-level ``num_labels`` and ``device`` and, through
    ``create_dataloaders``, the current ``train_df`` / ``valid_df``.

    Args:
        cfg: the ablation settings to run.

    Returns:
        dict with the experiment name, encoder, ``max_len``, the two ablation
        switches and the best validation accuracy / macro-F1 seen over the
        epochs (each tracked independently).
    """
    print(f"\n===== Experiment: {cfg.name} =====")
    # Re-seed for every experiment so all runs start from the same RNG state.
    set_seed(42)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    train_loader, valid_loader = create_dataloaders(cfg, tokenizer)
    steps_per_epoch = len(train_loader)

    model = CodeBERTClassifier(
        model_name=cfg.model_name,
        num_labels=num_labels,
        dropout=cfg.dropout,
    )
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)

    # None means the learning rate stays constant at cfg.lr.
    scheduler = None
    if cfg.use_onecycle:
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=cfg.lr,
            steps_per_epoch=steps_per_epoch,
            epochs=cfg.epochs,
        )

    best_acc, best_f1 = 0.0, 0.0

    for epoch in range(cfg.epochs):
        model.train()
        running_loss = 0.0

        for step, batch in enumerate(train_loader):
            inputs = {key: tensor.to(device) for key, tensor in batch.items()}
            targets = inputs.pop("labels")

            optimizer.zero_grad()
            loss = model(**inputs, labels=targets)["loss"]
            loss.backward()
            optimizer.step()
            # The scheduler, when present, advances once per batch.
            if scheduler is not None:
                scheduler.step()

            running_loss += loss.item()

            if step % 100 == 0:
                print(f"  step {step}/{steps_per_epoch}", end="\r")

        avg_loss = running_loss / max(1, steps_per_epoch)
        val_acc, val_f1 = evaluate(model, valid_loader)
        if val_acc > best_acc:
            best_acc = val_acc
        if val_f1 > best_f1:
            best_f1 = val_f1

        print(
            f"Epoch {epoch+1}/{cfg.epochs} | "
            f"Loss {avg_loss:.4f} | "
            f"Val Acc {val_acc:.4f} | Val F1 {val_f1:.4f}"
        )

    summary = {
        "experiment": cfg.name,
        "encoder": cfg.model_name,
        "max_len": cfg.max_len,
        "use_onecycle": cfg.use_onecycle,
        "use_balancing": cfg.use_balancing,
        "best_val_acc": best_acc,
        "best_val_f1": best_f1,
    }
    return summary


In [11]:
# define ablation experiments

# Encoder checkpoints under comparison.
CODEBERT = "microsoft/codebert-base"
# Smaller encoder used as a stand-in for a distilled CodeBERT.
DISTIL_CODEBERT = "huggingface/CodeBERTa-small-v1"

# Settings of the reference run; every other entry overrides exactly one of them.
_BASELINE = dict(
    model_name=CODEBERT,
    max_len=256,
    use_onecycle=True,
    use_balancing=True,
)

EXPERIMENTS = [
    # Reference: CodeBERT, 256 tokens, OneCycle schedule, balanced sampling
    AblationConfig(
        name="baseline_codebert_256_onecycle_balanced",
        **_BASELINE,
    ),
    # Encoder ablation: smaller encoder instead of CodeBERT
    AblationConfig(
        name="distilcodeberta_256_onecycle_balanced",
        **{**_BASELINE, "model_name": DISTIL_CODEBERT},
    ),
    # Input-length ablation: 128 tokens instead of 256
    AblationConfig(
        name="codebert_128_onecycle_balanced",
        **{**_BASELINE, "max_len": 128},
    ),
    # Schedule ablation: constant learning rate instead of OneCycle
    AblationConfig(
        name="codebert_256_constantLR_balanced",
        **{**_BASELINE, "use_onecycle": False},
    ),
    # Balancing ablation: plain shuffling instead of WeightedRandomSampler
    AblationConfig(
        name="codebert_256_onecycle_unbalanced",
        **{**_BASELINE, "use_balancing": False},
    ),
]


In [12]:
# Run every configuration in EXPERIMENTS and collect one result dict per run.
# NOTE: train_and_eval reads the notebook-level train_df / valid_df / num_labels,
# so this cell evaluates whichever task's data-prep cell was executed last; the
# output file name below assumes that was Task A.

all_results = []
for cfg in EXPERIMENTS:
    res = train_and_eval(cfg)
    all_results.append(res)

# One row per experiment.
results_df = pd.DataFrame(all_results)
print("\nAblation summary:")
display(results_df)

# Persist the summary table as CSV under BASE_DIR (index column omitted).
csv_path = f"{BASE_DIR}/ablation_taskA.csv"
results_df.to_csv(csv_path, index=False)
print("Saved ablation results to:", csv_path)



===== Experiment: baseline_codebert_256_onecycle_balanced =====
Epoch 1/1 | Loss 0.2704 | Val Acc 0.9550 | Val F1 0.9549

===== Experiment: distilcodeberta_256_onecycle_balanced =====


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

  step 0/250

model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Epoch 1/1 | Loss 0.2325 | Val Acc 0.9590 | Val F1 0.9589

===== Experiment: codebert_128_onecycle_balanced =====
Epoch 1/1 | Loss 0.2813 | Val Acc 0.9460 | Val F1 0.9459

===== Experiment: codebert_256_constantLR_balanced =====
Epoch 1/1 | Loss 0.1913 | Val Acc 0.9630 | Val F1 0.9628

===== Experiment: codebert_256_onecycle_unbalanced =====
Epoch 1/1 | Loss 0.2890 | Val Acc 0.9540 | Val F1 0.9538

Ablation summary:


Unnamed: 0,experiment,encoder,max_len,use_onecycle,use_balancing,best_val_acc,best_val_f1
0,baseline_codebert_256_onecycle_balanced,microsoft/codebert-base,256,True,True,0.955,0.954892
1,distilcodeberta_256_onecycle_balanced,huggingface/CodeBERTa-small-v1,256,True,True,0.959,0.958857
2,codebert_128_onecycle_balanced,microsoft/codebert-base,128,True,True,0.946,0.945938
3,codebert_256_constantLR_balanced,microsoft/codebert-base,256,False,True,0.963,0.962802
4,codebert_256_onecycle_unbalanced,microsoft/codebert-base,256,True,False,0.954,0.953845


Saved ablation results to: /content/semeval2026-task13/ablation_taskA.csv


In [14]:
# Task B: read the training parquet, build the label mapping, hold out a
# validation split and cap both splits for quick ablation runs.
# This rebinds the notebook-level train_df / valid_df / num_labels.

b_train_path = DATA_DIR + "/task_b_training_set.parquet"
df_b_train = pd.read_parquet(b_train_path)

print("Task B train shape:", df_b_train.shape)
print("Task B train columns:", df_b_train.columns.tolist())
display(df_b_train.head())

# Code and label columns; assumed to be named as in Task A (check the printed
# column list above).
TEXT_COL  = "code"
LABEL_COL = "label"

# Keep just those two columns, then discard rows where either is missing.
df_b_train = df_b_train[[TEXT_COL, LABEL_COL]]
df_b_train = df_b_train.dropna().reset_index(drop=True)

# Map the sorted raw labels onto contiguous ids 0..num_labels-1 (and back).
labels = sorted(df_b_train[LABEL_COL].unique().tolist())
num_labels = len(labels)
label2id = dict(zip(labels, range(num_labels)))
id2label = {idx: lab for lab, idx in label2id.items()}

df_b_train["label_id"] = df_b_train[LABEL_COL].map(label2id)

print("Num labels (Task B):", num_labels)

# Hold out 10% for validation, stratified on the label id.
train_df, valid_df = train_test_split(
    df_b_train,
    test_size=0.1,
    stratify=df_b_train["label_id"],
    random_state=42,
)
print("Full Train size (B):", len(train_df), "Full Valid size (B):", len(valid_df))

# Cap both splits with a seeded random subsample (this step is not stratified).
max_train = 4000
max_valid = 1000

train_df = train_df.sample(n=max_train, random_state=42) if len(train_df) > max_train else train_df
valid_df = valid_df.sample(n=max_valid, random_state=42) if len(valid_df) > max_valid else valid_df

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print("Using subset (Task B) -> Train:", len(train_df), "Valid:", len(valid_df))



Task B train shape: (500000, 4)
Task B train columns: ['code', 'generator', 'label', 'language']


Unnamed: 0,code,generator,label,language
0,"def load(config, filepath, token):\n if con...",Human,0,Python
1,"n = int(input())\narr = list(map(int, input()....",Human,0,Python
2,using Aow.Infrastructure.Domain;\nusing Aow.In...,GPT-4o,10,C#
3,"def save_data(bot, force=False):\n if bot.d...",Human,0,Python
4,"def parse_metadata(metaurl, progress=1e5):\n ...",Human,0,Python


Num labels (Task B): 11
Full Train size (B): 450000 Full Valid size (B): 50000
Using subset (Task B) -> Train: 4000 Valid: 1000


In [15]:
# Run every configuration in EXPERIMENTS for Task B, one result dict per run.
# NOTE: train_and_eval reads the notebook-level train_df / valid_df / num_labels,
# so the Task B data-prep cell must be the most recently executed prep cell.

all_results_b = []
for cfg in EXPERIMENTS:
    res = train_and_eval(cfg)
    all_results_b.append(res)

# One row per experiment.
results_b_df = pd.DataFrame(all_results_b)
print("\nTask B Ablation summary:")
display(results_b_df)

# Persist the summary table as CSV under BASE_DIR (index column omitted).
csv_path_b = f"{BASE_DIR}/ablation_taskB.csv"
results_b_df.to_csv(csv_path_b, index=False)
print("Saved Task B ablation results to:", csv_path_b)



===== Experiment: baseline_codebert_256_onecycle_balanced =====
Epoch 1/1 | Loss 1.8632 | Val Acc 0.4970 | Val F1 0.1116

===== Experiment: distilcodeberta_256_onecycle_balanced =====
Epoch 1/1 | Loss 1.5860 | Val Acc 0.5990 | Val F1 0.1673

===== Experiment: codebert_128_onecycle_balanced =====
Epoch 1/1 | Loss 1.9395 | Val Acc 0.4330 | Val F1 0.1123

===== Experiment: codebert_256_constantLR_balanced =====
Epoch 1/1 | Loss 1.6444 | Val Acc 0.5690 | Val F1 0.1479

===== Experiment: codebert_256_onecycle_unbalanced =====
Epoch 1/1 | Loss 0.7476 | Val Acc 0.9030 | Val F1 0.0863

Task B Ablation summary:


Unnamed: 0,experiment,encoder,max_len,use_onecycle,use_balancing,best_val_acc,best_val_f1
0,baseline_codebert_256_onecycle_balanced,microsoft/codebert-base,256,True,True,0.497,0.111606
1,distilcodeberta_256_onecycle_balanced,huggingface/CodeBERTa-small-v1,256,True,True,0.599,0.167291
2,codebert_128_onecycle_balanced,microsoft/codebert-base,128,True,True,0.433,0.112296
3,codebert_256_constantLR_balanced,microsoft/codebert-base,256,False,True,0.569,0.147883
4,codebert_256_onecycle_unbalanced,microsoft/codebert-base,256,True,False,0.903,0.086275


Saved Task B ablation results to: /content/semeval2026-task13/ablation_taskB.csv


In [16]:
# Task C: read the training parquet, build the label mapping, hold out a
# validation split and cap both splits for quick ablation runs.
# This rebinds the notebook-level train_df / valid_df / num_labels.

c_train_path = DATA_DIR + "/task_c_training_set_1.parquet"
df_c_train = pd.read_parquet(c_train_path)

print("Task C train shape:", df_c_train.shape)
print("Task C train columns:", df_c_train.columns.tolist())
display(df_c_train.head())

# Column holding the source code and column holding the class label.
TEXT_COL  = "code"
LABEL_COL = "label"

# Keep just those two columns, then discard rows where either is missing.
df_c_train = df_c_train[[TEXT_COL, LABEL_COL]]
df_c_train = df_c_train.dropna().reset_index(drop=True)

# Map the sorted raw labels onto contiguous ids 0..num_labels-1 (and back).
labels = sorted(df_c_train[LABEL_COL].unique().tolist())
num_labels = len(labels)
label2id = dict(zip(labels, range(num_labels)))
id2label = {idx: lab for lab, idx in label2id.items()}

df_c_train["label_id"] = df_c_train[LABEL_COL].map(label2id)

print("Num labels (Task C):", num_labels)

# Hold out 10% for validation, stratified on the label id.
train_df, valid_df = train_test_split(
    df_c_train,
    test_size=0.1,
    stratify=df_c_train["label_id"],
    random_state=42,
)
print("Full Train size (C):", len(train_df), "Full Valid size (C):", len(valid_df))


# Cap both splits with a seeded random subsample (this step is not stratified).
max_train = 4000
max_valid = 1000

train_df = train_df.sample(n=max_train, random_state=42) if len(train_df) > max_train else train_df
valid_df = valid_df.sample(n=max_valid, random_state=42) if len(valid_df) > max_valid else valid_df

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print("Using subset (Task C) -> Train:", len(train_df), "Valid:", len(valid_df))



Task C train shape: (900000, 4)
Task C train columns: ['code', 'generator', 'label', 'language']


Unnamed: 0,code,generator,label,language
0,"import React, { useState, useEffect } from 're...",GPT-4o,1,JavaScript
1,<?php\n\nuse Fedeisas\LaravelJsRoutes\Commands...,Human,0,PHP
2,const math = require('mathjs');\n\n/**\n * Cal...,google/codegemma-7b-it,1,JavaScript
3,import org.junit.jupiter.params.provider.Argum...,GPT-4o,1,Java
4,// CodeForces\n\n// C. Alice and the Cake\n\n\...,Human,0,Java


Num labels (Task C): 4
Full Train size (C): 810000 Full Valid size (C): 90000
Using subset (Task C) -> Train: 4000 Valid: 1000


In [17]:
# Run every configuration in EXPERIMENTS for Task C, one result dict per run.
# NOTE: train_and_eval reads the notebook-level train_df / valid_df / num_labels,
# so the Task C data-prep cell must be the most recently executed prep cell.

all_results_c = []
for cfg in EXPERIMENTS:
    res = train_and_eval(cfg)
    all_results_c.append(res)

# One row per experiment.
results_c_df = pd.DataFrame(all_results_c)
print("\nTask C Ablation summary:")
display(results_c_df)

# Persist the summary table as CSV under BASE_DIR (index column omitted).
csv_path_c = f"{BASE_DIR}/ablation_taskC.csv"
results_c_df.to_csv(csv_path_c, index=False)
print("Saved Task C ablation results to:", csv_path_c)



===== Experiment: baseline_codebert_256_onecycle_balanced =====
Epoch 1/1 | Loss 1.2107 | Val Acc 0.6560 | Val F1 0.5457

===== Experiment: distilcodeberta_256_onecycle_balanced =====
Epoch 1/1 | Loss 1.0704 | Val Acc 0.6980 | Val F1 0.6079

===== Experiment: codebert_128_onecycle_balanced =====
Epoch 1/1 | Loss 1.2552 | Val Acc 0.6250 | Val F1 0.5016

===== Experiment: codebert_256_constantLR_balanced =====
Epoch 1/1 | Loss 1.1573 | Val Acc 0.6320 | Val F1 0.5377

===== Experiment: codebert_256_onecycle_unbalanced =====
Epoch 1/1 | Loss 0.9836 | Val Acc 0.7130 | Val F1 0.3928

Task C Ablation summary:


Unnamed: 0,experiment,encoder,max_len,use_onecycle,use_balancing,best_val_acc,best_val_f1
0,baseline_codebert_256_onecycle_balanced,microsoft/codebert-base,256,True,True,0.656,0.545658
1,distilcodeberta_256_onecycle_balanced,huggingface/CodeBERTa-small-v1,256,True,True,0.698,0.607938
2,codebert_128_onecycle_balanced,microsoft/codebert-base,128,True,True,0.625,0.501627
3,codebert_256_constantLR_balanced,microsoft/codebert-base,256,False,True,0.632,0.537707
4,codebert_256_onecycle_unbalanced,microsoft/codebert-base,256,True,False,0.713,0.392768


Saved Task C ablation results to: /content/semeval2026-task13/ablation_taskC.csv
