In [None]:
# Mount Google Drive at /content/drive; the later cells read the dataset from,
# and write their outputs to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import numpy as np
import pandas as pd
import torch
from pathlib import Path
import joblib

from datasets import Dataset

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


In [None]:
# Project folder on the mounted Drive.
ROOT = Path("/content/drive/MyDrive/semeval_task13")

# The data may sit one level deeper, inside a "SemEval-2026-Task13" folder.
# Use that nested folder when it exists; otherwise fall back to ROOT itself.
_nested_root = ROOT / "SemEval-2026-Task13"
if _nested_root.exists():
    BASE_DIR = _nested_root
else:
    BASE_DIR = ROOT
TASK_A_DIR = BASE_DIR / "task_a"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)

# Stop right away with a clear error when the Task A folder is missing.
if not TASK_A_DIR.exists():
    raise FileNotFoundError(f"task_a folder not found at: {TASK_A_DIR}")

# List the folder contents so the expected parquet files can be checked by eye.
print("Files in task_a:")
for entry in TASK_A_DIR.iterdir():
    print(" -", entry.name)

In [None]:
# Locations of the three Task A parquet files (train / validation / test sample).
train_path = TASK_A_DIR / "task_a_training_set_1.parquet"
val_path   = TASK_A_DIR / "task_a_validation_set.parquet"
test_path  = TASK_A_DIR / "task_a_test_set_sample.parquet"

# Read every split into its own DataFrame, in the same train/val/test order.
df_train, df_val, df_test = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [None]:
# Keep only the model input ("code") and the target ("label"), drop rows where
# either value is missing, and store the label as an integer.
df_train = df_train[["code", "label"]].dropna().astype({"label": int})
df_val   = df_val[["code", "label"]].dropna().astype({"label": int})

# Quick look at the first rows of both labelled splits.
for _frame in (df_train, df_val):
    print(_frame.head())


In [None]:
# Wrap each split in a Hugging Face Dataset without carrying over the pandas
# index. The test split only provides the "code" column (no labels).
_labelled_columns = ["code", "label"]
train_ds_ux = Dataset.from_pandas(df_train[_labelled_columns], preserve_index=False)
val_ds_ux   = Dataset.from_pandas(df_val[_labelled_columns], preserve_index=False)
test_ds_ux  = Dataset.from_pandas(df_test[["code"]], preserve_index=False)

# Print the dataset summaries (features and row counts).
for _dataset in (train_ds_ux, val_ds_ux, test_ds_ux):
    print(_dataset)

# UniXcoder tokenizer, loaded from the same checkpoint name the model uses later.
ux_model_name = "microsoft/unixcoder-base"
ux_tokenizer  = AutoTokenizer.from_pretrained(ux_model_name)

# Make sure a pad token is defined: fall back to the EOS token, or to the CLS
# token when no EOS token is set either.
if ux_tokenizer.pad_token is None:
    _fallback_token = ux_tokenizer.eos_token
    if not _fallback_token:
        _fallback_token = ux_tokenizer.cls_token
    ux_tokenizer.pad_token = _fallback_token

# Every snippet is padded/truncated to this many tokens by ux_tokenize_fn.
max_length = 256

def ux_tokenize_fn(batch):
    """Tokenize the "code" entries of a batch with the UniXcoder tokenizer.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        batch: Mapping with a "code" entry holding the source snippets.

    Returns:
        Whatever ``ux_tokenizer`` returns for the batch.
    """
    code_snippets = batch["code"]
    encoded = ux_tokenizer(
        code_snippets,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return encoded

# Tokenize every split in batches; the raw "code" text column is dropped once
# it has been encoded.
_map_options = dict(batched=True, remove_columns=["code"])
train_tok_ux = train_ds_ux.map(ux_tokenize_fn, **_map_options)
val_tok_ux   = val_ds_ux.map(ux_tokenize_fn, **_map_options)
test_tok_ux  = test_ds_ux.map(ux_tokenize_fn, **_map_options)

# Rename the target column of the labelled splits from "label" to "labels"
# (the test split has no label column to rename).
train_tok_ux = train_tok_ux.rename_column("label", "labels")
val_tok_ux   = val_tok_ux.rename_column("label", "labels")

# Have all three datasets return torch tensors.
for _tokenized in (train_tok_ux, val_tok_ux, test_tok_ux):
    _tokenized.set_format(type="torch")


In [None]:
# Persist the tokenized splits to Drive, one sub-folder per split.
SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/unixcoder_tok"
for _folder, _tokenized in (
    ("train_tok_ux", train_tok_ux),
    ("val_tok_ux", val_tok_ux),
    ("test_tok_ux", test_tok_ux),
):
    _tokenized.save_to_disk(f"{SAVE_DIR}/{_folder}")
print("tokenized datasets saved to Drive")


In [None]:
import numpy as np
import torch
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import os

# Switch off Weights & Biases via both environment variables.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "disabled"})

# Drive folder used as the training output directory
# (TrainingArguments.output_dir and the logs sub-folder point here).
SAVE_DIR = "/content/drive/MyDrive/semeval_task13_outputs_unixcoder"
os.makedirs(SAVE_DIR, exist_ok=True)
print("Saving to:", SAVE_DIR)


def compute_metrics(eval_pred):
    """Compute macro-F1 and accuracy from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last logits axis.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``"macro_f1"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "macro_f1": f1_score(labels, predicted, average="macro"),
        "accuracy": accuracy_score(labels, predicted),
    }


# UniXcoder encoder with a sequence-classification head for the two classes.
num_labels = 2
_classifier_options = dict(
    num_labels=num_labels,
    problem_type="single_label_classification",
)
ux_model = AutoModelForSequenceClassification.from_pretrained(
    ux_model_name,
    **_classifier_options,
)

# Keep the model config's pad token id in sync with the tokenizer.
ux_model.config.pad_token_id = ux_tokenizer.pad_token_id

# Batch size 8 rather than 16 because of resource limitations.
batch_size = 8


# Training configuration. Checkpoints and logs are written under SAVE_DIR and
# no external experiment tracker is used.
_logs_dir = f"{SAVE_DIR}/logs"
ux_training_args = TrainingArguments(
    # where outputs go
    output_dir=SAVE_DIR,
    logging_dir=_logs_dir,
    report_to="none",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / checkpoint cadence (in steps); keep at most two checkpoints
    logging_steps=200,
    save_steps=2000,
    save_total_limit=2,
)

# Build the Trainer around the UniXcoder classifier; validation metrics come
# from compute_metrics (macro-F1 and accuracy).
ux_trainer = Trainer(
    model=ux_model,
    args=ux_training_args,
    train_dataset=train_tok_ux,
    eval_dataset=val_tok_ux,
    tokenizer=ux_tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune on the training split.
ux_trainer.train()

# Score the fine-tuned model on the validation split.
ux_eval_results = ux_trainer.evaluate(eval_dataset=val_tok_ux)
print("UniXcoder Validation results:", ux_eval_results)

# Explicitly persist the final model (and tokenizer files) in SAVE_DIR itself.
# Without this, only the periodic checkpoint sub-folders written every
# `save_steps` steps exist, and the message below would claim a save that
# never happened at this path.
ux_trainer.save_model(SAVE_DIR)
ux_tokenizer.save_pretrained(SAVE_DIR)

print("Saved in Drive at:", SAVE_DIR)


In [None]:
# Run the fine-tuned UniXcoder on the validation split and take the argmax
# over the class axis as the predicted label.
ux_val_outputs = ux_trainer.predict(val_tok_ux)
ux_val_logits  = ux_val_outputs.predictions                # [N_val, num_labels]
ux_val_preds   = np.argmax(ux_val_logits, axis=-1)

# Ground-truth labels come from the cleaned validation DataFrame.
y_val_true = df_val["label"].astype(int).to_numpy()

# Headline metrics.
ux_val_acc = accuracy_score(y_val_true, ux_val_preds)
ux_val_f1  = f1_score(y_val_true, ux_val_preds, average="macro")
print(f"\nUniXcoder Validation Accuracy : {ux_val_acc:.4f}")
print(f"UniXcoder Validation Macro F1 : {ux_val_f1:.4f}")

# Confusion matrix, followed by the per-class report.
ux_cm = confusion_matrix(y_val_true, ux_val_preds)
print("\nUniXcoder Confusion Matrix (rows=true, cols=pred):")
print(ux_cm)

_val_report = classification_report(y_val_true, ux_val_preds, digits=4)
print("\nUniXcoder Classification Report:")
print(_val_report)

# Class probabilities: softmax over the logits along the class axis.
ux_val_probs = torch.softmax(torch.from_numpy(ux_val_logits), dim=-1).numpy()

# Preview up to five rows. The range is capped at the number of validation
# rows so a very small validation set cannot trigger an IndexError.
print("\n[UniXcoder] First 5 (true, pred, prob_0, prob_1):")
for i in range(min(5, len(y_val_true))):
    print(
        f"i={i:3d} | y={y_val_true[i]} | ŷ={ux_val_preds[i]} "
        f"| p0={ux_val_probs[i,0]:.4f} | p1={ux_val_probs[i,1]:.4f}"
    )

# Store the validation probabilities on Drive for the later ensemble step.
SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/unixcoder"
os.makedirs(SAVE_DIR, exist_ok=True)

save_path = SAVE_DIR + "/val_probs_unixcoder.npy"
np.save(save_path, ux_val_probs)

print("\nSaved for ensemble:")
print(f"  - {save_path}")


In [None]:
# Predict on the (unlabelled) test split with the fine-tuned UniXcoder.
test_tok_ux.set_format(type="torch")

print("\nRunning UniXcoder prediction on test set")
ux_test_outputs = ux_trainer.predict(test_tok_ux)
ux_test_logits  = ux_test_outputs.predictions              # [N_test, num_labels]

# Per-class probabilities (kept for the ensemble) and hard integer labels.
_test_logits_tensor = torch.from_numpy(ux_test_logits)
ux_test_probs = torch.softmax(_test_logits_tensor, dim=-1).numpy()
ux_test_preds = np.argmax(ux_test_logits, axis=-1).astype(int)

# Quick sanity output: array shapes and the first few predicted labels.
print("UniXcoder Test predictions shape:", ux_test_preds.shape)
print("UniXcoder Test probs shape      :", ux_test_probs.shape)
print("First 10 UniXcoder predictions :", ux_test_preds[:10])


import os
import numpy as np

# Write the test-set probabilities and hard predictions to Drive as .npy files.
SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/unixcoder"
os.makedirs(SAVE_DIR, exist_ok=True)

probs_path = SAVE_DIR + "/test_probs_unixcoder.npy"
preds_path = SAVE_DIR + "/test_preds_unixcoder_256_lang.npy"

np.save(probs_path, ux_test_probs)   # class probabilities
np.save(preds_path, ux_test_preds)   # argmax labels

print("\nSaved to Drive:")
for _written in (probs_path, preds_path):
    print("  -", _written)


In [None]:
# Test predictions for UniXcoder.
# NOTE(review): this repeats the inference pass of the previous test-prediction
# cell and rebinds ux_test_outputs / ux_test_logits / ux_test_preds; consider
# dropping one of the two cells.
test_tok_ux.set_format(type="torch")

print("\nRunning UniXcoder prediction on test set...")
ux_test_outputs = ux_trainer.predict(test_tok_ux)
ux_test_logits  = ux_test_outputs.predictions

# Hard label = index of the largest logit in each row.
ux_test_preds = np.argmax(ux_test_logits, axis=-1)

_first_ten = ux_test_preds[:10]
print("UniXcoder Test predictions shape:", ux_test_preds.shape)
print("First 10 UniXcoder predictions:", _first_ten)


In [None]:
# Submission template, read relative to the current working directory.
sample_filename = "sample_submission_a.csv"
sample_sub_ux = pd.read_csv(sample_filename)

SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/unixcoder"

os.makedirs(SAVE_DIR, exist_ok=True)

# Fail loudly when the template and the predictions disagree in length.
# Previously this case only printed a short message and wrote no file, so a
# full run could finish "successfully" without producing a submission.
if len(sample_sub_ux) != len(ux_test_preds):
    raise ValueError(
        "Length mismatch! "
        f"{sample_filename} has {len(sample_sub_ux)} rows but there are "
        f"{len(ux_test_preds)} test predictions; no submission file was written."
    )

# NOTE(review): predictions are assigned by position; this assumes the
# template rows are in the same order as the test set. Confirm.
sample_sub_ux["label"] = ux_test_preds

sub_path_ux = f"{SAVE_DIR}/subtask_a_unixcoder.csv"
sample_sub_ux.to_csv(sub_path_ux, index=False)

print("\nSaved UniXcoder submission file to Drive:")
print(sub_path_ux)
