In [None]:
!pip -q install transformers datasets accelerate

In [None]:
import numpy as np
from datasets import Dataset
from pathlib import Path
import os
import pandas as pd
import torch
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

In [None]:
# Locate the task data. The data may sit directly under ROOT or inside a nested
# "SemEval-2026-Task13" folder; the nested folder wins when it exists.
ROOT = Path("/content/semeval_task13")

if (ROOT / "SemEval-2026-Task13").exists():
    BASE_DIR = ROOT / "SemEval-2026-Task13"
else:
    BASE_DIR = ROOT

TASK_DIR = BASE_DIR / "task_b"

print("BASE_DIR:", BASE_DIR)
print("TASK_DIR:", TASK_DIR)

# Fail early with an actionable message instead of a bare traceback from iterdir().
if not TASK_DIR.is_dir():
    raise FileNotFoundError(
        f"Task directory not found: {TASK_DIR}. "
        "Place the task data under ROOT before running this notebook."
    )

print("Files in task_b:")
# Path.iterdir() yields entries in arbitrary order; sort for a stable listing.
for p in sorted(TASK_DIR.iterdir()):
    print(" -", p.name)

In [None]:

# Paths of the three Task B parquet files.
train_path = TASK_DIR / "task_b_training_set.parquet"
val_path = TASK_DIR / "task_b_validation_set.parquet"
test_path = TASK_DIR / "task_b_test_set_sample.parquet"

# Read the splits in train / val / test order.
df_train, df_val, df_test = (
    pd.read_parquet(split_path) for split_path in (train_path, val_path, test_path)
)

# Quick overview: shape of every split, then its column names.
for caption, frame in (
    ("Train shape:", df_train),
    ("Val shape  :", df_val),
    ("Test shape :", df_test),
):
    print(caption, frame.shape)

for caption, frame in (
    ("\nTrain columns:", df_train),
    ("Val columns  :", df_val),
    ("Test columns :", df_test),
):
    print(caption, frame.columns.tolist())

# Last expression of the cell: rendered as a table by the notebook.
df_train.head()


In [None]:
# Label mapping
# The sorted original label values of the training split are mapped to the
# contiguous indices 0..num_labels-1; the inverse map is kept so predictions can
# be converted back to original label values later.
unique_labels = sorted(df_train["label"].unique())
label2id_orig_to_idx = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label_idx_to_orig = {i: lbl for lbl, i in label2id_orig_to_idx.items()}
num_labels = len(unique_labels)

print("Number of labels:", num_labels)
print("Original label -> index mapping:", label2id_orig_to_idx)

# The mapping is built from the training split only. A validation label that never
# occurs in training would map to NaN and make `.astype(int)` fail with an opaque
# casting error, so report the offending values explicitly instead.
unseen_val_labels = df_val.loc[~df_val["label"].isin(unique_labels), "label"].unique().tolist()
if unseen_val_labels:
    raise ValueError(
        f"Validation labels missing from the training set: {unseen_val_labels}"
    )

# Work on copies so the new "labels" column is not written into the loaded frames.
df_train = df_train.copy()
df_val   = df_val.copy()
df_test  = df_test.copy()

df_train["labels"] = df_train["label"].map(label2id_orig_to_idx).astype(int)
df_val["labels"]   = df_val["label"].map(label2id_orig_to_idx).astype(int)


# Tokenizer + HF Datasets
MODEL_NAME = "microsoft/unixcoder-base"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Every code snippet is padded/truncated to this many tokens.
MAX_LENGTH = 256

def make_hf_datasets(df_train_local, df_val_local, df_test_local, max_length=256):
    """Tokenize the three splits and return them as torch-formatted HF datasets.

    Args:
        df_train_local: Frame with a "code" column and a "labels" column.
        df_val_local: Frame with a "code" column and a "labels" column.
        df_test_local: Frame with a "code" column (no labels are read).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(train, val, test)`` of tokenized datasets. The train and val
        datasets expose "input_ids", "attention_mask" and "labels"; the test
        dataset exposes "input_ids" and "attention_mask".
    """
    def _encode(examples):
        # Uses the module-level `tokenizer`; fixed-length padding to `max_length`.
        return tokenizer(
            examples["code"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    def _prepare(frame, with_labels):
        # Shared pipeline for one split: select columns -> tokenize -> torch format.
        label_cols = ["labels"] if with_labels else []
        encoded = Dataset.from_pandas(frame[["code"] + label_cols]).map(
            _encode, batched=True, remove_columns=["code"]
        )
        encoded.set_format(
            type="torch", columns=["input_ids", "attention_mask"] + label_cols
        )
        return encoded

    return (
        _prepare(df_train_local, with_labels=True),
        _prepare(df_val_local, with_labels=True),
        _prepare(df_test_local, with_labels=False),
    )

train_tok, val_tok, test_tok = make_hf_datasets(
    df_train,
    df_val,
    df_test,
    max_length=MAX_LENGTH,
)

# Metric function
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a batch of predictions.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "macro_f1": f1_score(gold, predicted, average="macro"),
    }

# Load the model
# Label names are stored in the model config as strings: index -> name and name -> index.
id2label_cfg = {i: str(id2label_idx_to_orig[i]) for i in id2label_idx_to_orig}
label2id_cfg = {str(v): k for k, v in id2label_idx_to_orig.items()}

# Trainer only seeds the RNGs when it is constructed, which happens after the model
# has been instantiated. Seed here so that any parameters from_pretrained() has to
# initialise randomly (e.g. a new classification head) are reproducible across runs.
MODEL_INIT_SEED = 42  # same value as the TrainingArguments default `seed`
torch.manual_seed(MODEL_INIT_SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label_cfg,
    label2id=label2id_cfg,
)

# TrainingArguments
batch_size = 8

training_args = TrainingArguments(
    output_dir="/content/task_b_unixcoder_runs",   # temporary
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=200,
    # No checkpoints are wanted; say so explicitly instead of relying on a
    # save interval that is larger than the number of training steps.
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    # `processing_class` supersedes the deprecated `tokenizer` argument
    # (requires transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then report the metrics on the validation split.
trainer.train()
eval_results = trainer.evaluate(eval_dataset=val_tok)
print("UniXcoder Dev metrics:", eval_results)

# Per-class report on the validation split; classes are the 0..N-1 label indices.
dev_logits = trainer.predict(val_tok).predictions
dev_preds = dev_logits.argmax(axis=-1)
y_true = df_val["labels"].to_numpy()

print("\nClassification report (UniXcoder, index labels):")
print(classification_report(y_true, dev_preds, digits=3))

# Test predictions (as label indices).
test_logits = trainer.predict(test_tok).predictions
test_preds_idx = test_logits.argmax(axis=-1).astype(int)

print("Number of test predictions:", len(test_preds_idx))
print("First 10 test_preds (indices):", test_preds_idx[:10])

# Convert the predicted indices back to the original label IDs (for submission).
test_labels_mapped = list(
    int(id2label_idx_to_orig[pred_idx]) for pred_idx in test_preds_idx
)
print("First 10 mapped labels (original IDs):", test_labels_mapped[:10])



In [None]:
from google.colab import files
import pandas as pd

# Ask the user for the sample submission; its rows are reused as the template.
print("upload the sample submission file")
uploaded = files.upload()

# The first uploaded file is taken as the sample submission.
sample_filename = next(iter(uploaded))
print("Loaded file:", sample_filename)

sample_sub = pd.read_csv(sample_filename)

print("Sample submission shape:", sample_sub.shape)
print("First rows of sample submission:")
print(sample_sub.head())
print("Number of test predictions:", len(test_labels_mapped))

# Safety check: only write the file when there is one prediction per template row.
# NOTE(review): predictions are assigned by position, which assumes the sample
# file's rows are in the same order as the test split — TODO confirm.
if len(sample_sub) == len(test_labels_mapped):
    label_col = "label"
    sample_sub[label_col] = test_labels_mapped

    print("\nSubmission preview:")
    print(sample_sub.head())

    sub_path = "subtask_b_unixcoder.csv"
    sample_sub.to_csv(sub_path, index=False)
    print(f"\nsaved submission file: {sub_path}")
else:
    print("Length mismatch: sample_sub rows:", len(sample_sub), "| preds:", len(test_labels_mapped))


In [None]:
def _logits_to_probs(logits):
    """Return softmax probabilities over the last axis of `logits` as a NumPy array."""
    return softmax(torch.tensor(logits), dim=-1).cpu().numpy()


# Validation probabilities (for ensembling)
dev_pred = trainer.predict(val_tok)
dev_logits = dev_pred.predictions
dev_probs = _logits_to_probs(dev_logits)

# Sanity check: metrics recomputed from the probabilities.
y_true = df_val["labels"].to_numpy()
dev_preds = np.argmax(dev_probs, axis=1)

acc = accuracy_score(y_true, dev_preds)
f1 = f1_score(y_true, dev_preds, average="macro")
print("Val Accuracy:", acc)
print("Val Macro-F1:", f1)

# Test probabilities
test_pred = trainer.predict(test_tok)
test_logits = test_pred.predictions
test_probs = _logits_to_probs(test_logits)


In [None]:
from google.colab import drive

MODEL_KEY = "unixcoder"

# SAVE_ROOT points into Google Drive. If Drive is not mounted, mkdir(parents=True)
# would create the same path on the VM's local disk and the saved files would be
# lost with the session, so mount Drive first.
# NOTE(review): drive.mount is expected to be a no-op when Drive is already
# mounted — confirm against the Colab documentation.
drive.mount("/content/drive")

SAVE_ROOT = Path("/content/drive/MyDrive/semeval_task13_probs/task_b")
SAVE_ROOT.mkdir(parents=True, exist_ok=True)

# One .npy file per split, prefixed with the model key.
np.save(SAVE_ROOT / f"{MODEL_KEY}_dev_probs.npy",  dev_probs)
np.save(SAVE_ROOT / f"{MODEL_KEY}_test_probs.npy", test_probs)

print("Saved probs for", MODEL_KEY, "to", SAVE_ROOT)
