In [None]:
!pip -q install transformers accelerate datasets evaluate

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import numpy as np
import torch
from datasets import Dataset
from pathlib import Path
import joblib

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


In [None]:
# The data is expected to be on disk already (downloaded by
# 01_data_overview_task_ab.ipynb).
ROOT = Path("/content/semeval_task13")

# Handle an extra "SemEval-2026-Task13" folder layer when it is present;
# otherwise the data sits directly under ROOT.
BASE_DIR = ROOT / "SemEval-2026-Task13"
if not BASE_DIR.exists():
    BASE_DIR = ROOT
TASK_A_DIR = BASE_DIR / "task_a"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)

# List what is available for Subtask A.
print("Files in task_a:")
for entry in TASK_A_DIR.iterdir():
    print(" -", entry.name)


In [None]:
# Load the Subtask A splits (train / validation / test sample) from parquet.
train_path, val_path, test_path = (
    TASK_A_DIR / file_name
    for file_name in (
        "task_a_training_set_1.parquet",
        "task_a_validation_set.parquet",
        "task_a_test_set_sample.parquet",
    )
)

df_train, df_val, df_test = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

# Quick sanity check of what was loaded.
for caption, frame in (
    ("Train shape:", df_train),
    ("Val shape  :", df_val),
    ("Test shape :", df_test),
):
    print(caption, frame.shape)
print("\nColumns:", df_train.columns.tolist())

df_train.head()


In [None]:
# Basic preprocessing for the labelled splits: keep only the code/label
# columns, drop rows with missing values, and store labels as integers.
df_train = df_train[["code", "label"]].dropna().astype({"label": int})
df_val = df_val[["code", "label"]].dropna().astype({"label": int})

print(df_train.head())
print(df_val.head())



In [None]:
# Build Hugging Face datasets from the pandas frames. The pandas index is not
# carried over; the test split has no label column.
train_ds_gcb = Dataset.from_pandas(df_train.loc[:, ["code", "label"]], preserve_index=False)
val_ds_gcb = Dataset.from_pandas(df_val.loc[:, ["code", "label"]], preserve_index=False)
test_ds_gcb = Dataset.from_pandas(df_test.loc[:, ["code"]], preserve_index=False)

for split_ds in (train_ds_gcb, val_ds_gcb, test_ds_gcb):
    print(split_ds)

# GraphCodeBERT tokenizer
gcb_model_name = "microsoft/graphcodebert-base"
gcb_tokenizer = AutoTokenizer.from_pretrained(gcb_model_name)

# Guarantee a padding token: fall back to the EOS token, or to the CLS token
# when EOS is unset/empty.
if gcb_tokenizer.pad_token is None:
    gcb_tokenizer.pad_token = (
        gcb_tokenizer.eos_token if gcb_tokenizer.eos_token else gcb_tokenizer.cls_token
    )

# Sequence length passed to the tokenizer for truncation and padding.
max_length = 256

def gcb_tokenize_fn(batch):
    """Tokenize the ``code`` entries of a batch with ``gcb_tokenizer``.

    The tokenizer is asked to truncate and to pad every sequence to the
    module-level ``max_length``.

    Args:
        batch: Mapping with a ``"code"`` entry holding the snippets to encode.

    Returns:
        Whatever ``gcb_tokenizer`` returns for the batch.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return gcb_tokenizer(batch["code"], **tokenizer_kwargs)

# Tokenize every split in batches, dropping the raw "code" text afterwards.
# The labelled splits get their target column renamed from "label" to "labels".
train_tok_gcb = train_ds_gcb.map(
    gcb_tokenize_fn, batched=True, remove_columns=["code"]
).rename_column("label", "labels")
val_tok_gcb = val_ds_gcb.map(
    gcb_tokenize_fn, batched=True, remove_columns=["code"]
).rename_column("label", "labels")
test_tok_gcb = test_ds_gcb.map(gcb_tokenize_fn, batched=True, remove_columns=["code"])

# Have all three datasets return torch tensors.
for tokenized_split in (train_tok_gcb, val_tok_gcb, test_tok_gcb):
    tokenized_split.set_format(type="torch")


In [None]:
# Set the Weights & Biases environment switches to their "off" values.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "disabled"})

# Two-class, single-label sequence classifier initialised from the
# GraphCodeBERT checkpoint.
num_labels = 2
gcb_model = AutoModelForSequenceClassification.from_pretrained(
    gcb_model_name,
    problem_type="single_label_classification",
    num_labels=num_labels,
)

# Keep the model's pad token id in sync with the tokenizer's.
gcb_model.config.pad_token_id = gcb_tokenizer.pad_token_id

def compute_metrics(eval_pred):
    """Compute macro F1 and accuracy from a (logits, labels) pair.

    Macro F1 comes from scikit-learn's ``f1_score``, which this notebook
    already imports, so no additional metric package or metric download is
    required.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            ``[example, class]``; ``labels`` holds the true class ids.

    Returns:
        dict with ``"macro_f1"`` and ``"accuracy"`` as plain Python floats.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the class axis.
    preds = np.argmax(logits, axis=-1)
    macro_f1 = f1_score(labels, preds, average="macro")
    accuracy = (preds == labels).mean()
    # Cast numpy scalars to built-in floats so the results log/serialize cleanly.
    return {"macro_f1": float(macro_f1), "accuracy": float(accuracy)}

# Per-device batch size shared by training and evaluation.
batch_size = 16

# Fine-tuning configuration.
gcb_training_args = TrainingArguments(
    output_dir="task_a_graphcodebert",  # temporary folder
    logging_dir="logs_graphcodebert",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=200,
    save_steps=2000,
)

gcb_trainer = Trainer(
    model=gcb_model,
    args=gcb_training_args,
    tokenizer=gcb_tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_tok_gcb,
    eval_dataset=val_tok_gcb,
)

# Fine-tune the model.
gcb_trainer.train()

# Score the fine-tuned model on the validation split.
gcb_eval_results = gcb_trainer.evaluate(eval_dataset=val_tok_gcb)
print("GraphCodeBERT Validation results:", gcb_eval_results)


In [None]:
# Detailed validation results: accuracy, macro F1, confusion matrix,
# per-class report, and class probabilities saved for later ensembling.
gcb_val_outputs = gcb_trainer.predict(val_tok_gcb)
gcb_val_logits  = gcb_val_outputs.predictions             # [N_val, num_labels]
gcb_val_preds   = np.argmax(gcb_val_logits, axis=-1)      # predicted class ids

# Ground-truth labels taken from the preprocessed validation frame.
y_val_true = df_val["label"].astype(int).values

gcb_val_acc = accuracy_score(y_val_true, gcb_val_preds)
gcb_val_f1  = f1_score(y_val_true, gcb_val_preds, average="macro")

print(f"\nGraphCodeBERT Validation Accuracy : {gcb_val_acc:.4f}")
print(f"GraphCodeBERT Validation Macro F1 : {gcb_val_f1:.4f}")

# Confusion matrix & report
gcb_cm = confusion_matrix(y_val_true, gcb_val_preds)
print("\nGraphCodeBERT Confusion Matrix (rows=true, cols=pred):")
print(gcb_cm)

print("\nGraphCodeBERT Classification Report:")
print(classification_report(y_val_true, gcb_val_preds, digits=4))

# Probabilities (softmax over logits)
gcb_val_probs = torch.softmax(torch.from_numpy(gcb_val_logits), dim=-1).numpy()

# Preview at most five rows; the cap avoids an IndexError on a validation set
# with fewer than five examples.
n_preview = min(5, len(y_val_true))
print("\nGraphCodeBERT First 5 (true, pred, prob_0, prob_1):")
for i in range(n_preview):
    # "\u0177" is y-hat (the prediction). It is written as an escape so the
    # label cannot be garbled by a wrong file encoding.
    print(
        f"i={i:3d} | y={y_val_true[i]} | \u0177={gcb_val_preds[i]} "
        f"| p0={gcb_val_probs[i,0]:.4f} | p1={gcb_val_probs[i,1]:.4f}"
    )


# Persist the validation probabilities for the ensemble.
np.save("val_probs_graphcodebert.npy", gcb_val_probs)
print("\nsaved for ensemble:val_probs_graphcodebert.npy")




In [None]:
# Test-set inference with the fine-tuned GraphCodeBERT model.
test_tok_gcb.set_format(type="torch")

gcb_test_outputs = gcb_trainer.predict(test_tok_gcb)
gcb_test_logits = gcb_test_outputs.predictions

# Softmax probabilities per class (kept for the ensemble); hard labels are
# taken straight from the logits.
gcb_test_probs = torch.from_numpy(gcb_test_logits).softmax(dim=-1).numpy()
gcb_test_preds = np.argmax(gcb_test_logits, axis=-1).astype(int)

for caption, value in (
    ("GraphCodeBERT Test predictions shape:", gcb_test_preds.shape),
    ("GraphCodeBERT Test probs shape      :", gcb_test_probs.shape),
    ("First 10 GraphCodeBERT predictions :", gcb_test_preds[:10]),
):
    print(caption, value)

np.save("test_probs_graphcodebert.npy", gcb_test_probs)

print("\nSaved for ensemble:test_probs_graphcodebert.npy")



In [None]:
# Build the competition submission from the sample submission template.
sample_filename = "sample_submission_a.csv"
sample_sub = pd.read_csv(sample_filename)
# Report the file only after it has actually been read.
print("Loaded file:", sample_filename)

print("Sample submission shape:", sample_sub.shape)
print("First rows of sample submission:")
print(sample_sub.head())

# The GraphCodeBERT test predictions are stored in `gcb_test_preds`; bind them
# to `test_preds`, the name the rest of this cell uses.
test_preds = gcb_test_preds
print("Number of test predictions:", len(test_preds))

# Fill the label column and save - only when the template and the predictions
# have the same number of rows; otherwise report the mismatch and write nothing.
if len(sample_sub) != len(test_preds):
    print("Length mismatch: sample_sub rows:", len(sample_sub), "| test_preds:", len(test_preds))
else:
    # Prefer a column literally named "label"; otherwise use the second column.
    label_col = "label" if "label" in sample_sub.columns else sample_sub.columns[1]

    sample_sub[label_col] = test_preds

    print("\nSubmission preview:")
    print(sample_sub.head())

    sub_path = "subtask_a_graphcodebert_final.csv"
    sample_sub.to_csv(sub_path, index=False)
    print(f"\nSaved submission file: {sub_path}")