In [None]:
# Install the Hugging Face libraries used by this notebook (-q keeps pip output quiet).
!pip -q install transformers accelerate datasets evaluate

In [None]:
# Mount Google Drive inside the Colab VM; the fine-tuned model is written
# under this mount point by a later cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
import os
import torch
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from datasets import Dataset

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


In [None]:
# The dataset is expected to be on disk already
# (downloaded by 01_data_overview_task_ab.ipynb).
ROOT = Path("/content/semeval_task13")

# Handle an optional extra "SemEval-2026-Task13" folder layer: step into it
# when it exists, otherwise use ROOT directly.
nested_root = ROOT / "SemEval-2026-Task13"
BASE_DIR = nested_root if nested_root.exists() else ROOT

TASK_A_DIR = BASE_DIR / "task_a"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)

# List the directory contents as a quick sanity check of the expected files.
print("Files in task_a:")
for entry in TASK_A_DIR.iterdir():
    print(" -", entry.name)


In [None]:
# Load the subtask A splits (training, validation, sample test) from parquet.
train_path = TASK_A_DIR / "task_a_training_set_1.parquet"
val_path   = TASK_A_DIR / "task_a_validation_set.parquet"
test_path  = TASK_A_DIR / "task_a_test_set_sample.parquet"

df_train, df_val, df_test = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

# Report the size of every split, then the column names of the training frame.
for shape_label, frame in (
    ("Train shape:", df_train),
    ("Val shape  :", df_val),
    ("Test shape :", df_test),
):
    print(shape_label, frame.shape)
print("\nColumns:", df_train.columns.tolist())

# Last expression of the cell: rendered as the cell output.
df_train.head()


In [None]:
# Basic preprocessing: keep only the "code" and "label" columns, drop rows
# with missing values, and cast the labels to int.
df_train = df_train[["code", "label"]].dropna().astype({"label": int})
df_val   = df_val[["code", "label"]].dropna().astype({"label": int})

for cleaned_frame in (df_train, df_val):
    print(cleaned_frame.head())



In [None]:
from datasets import Dataset

# Wrap the pandas frames as Hugging Face datasets. The pandas index is not
# carried over (preserve_index=False); the test split only has the "code" column.
labeled_columns = ["code", "label"]
train_ds, val_ds = (
    Dataset.from_pandas(frame[labeled_columns], preserve_index=False)
    for frame in (df_train, df_val)
)
test_ds = Dataset.from_pandas(df_test[["code"]], preserve_index=False)

for split_ds in (train_ds, val_ds, test_ds):
    print(split_ds)


In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens kept per code snippet (longer inputs are truncated
# and shorter ones padded by tokenize_fn).
max_length = 256

# Checkpoint whose tokenizer is loaded here.
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Make sure a pad token is defined: fall back to the EOS token, or to the CLS
# token when no EOS token is set.
if tokenizer.pad_token is None:
    fallback_pad_token = tokenizer.eos_token or tokenizer.cls_token
    tokenizer.pad_token = fallback_pad_token

def tokenize_fn(batch):
    """Tokenize the "code" entries of a batch.

    Every sequence is truncated and padded to the module-level ``max_length``
    using the module-level ``tokenizer``. Returns whatever the tokenizer
    returns for the batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(batch["code"], **encode_options)

# Tokenize every split in batches and drop the raw "code" text column.
train_tok, val_tok, test_tok = (
    split_ds.map(tokenize_fn, batched=True, remove_columns=["code"])
    for split_ds in (train_ds, val_ds, test_ds)
)

# Rename the target column of the labeled splits from "label" to "labels".
train_tok = train_tok.rename_column("label", "labels")
val_tok   = val_tok.rename_column("label", "labels")

# Have all three datasets return torch tensors.
for tokenized_split in (train_tok, val_tok, test_tok):
    tokenized_split.set_format(type="torch")


In [None]:
import numpy as np
import torch
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
# Disable Weights & Biases reporting via environment variables.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

# Seed python/numpy/torch BEFORE the model is created. The classification head
# on top of the pretrained encoder is freshly (randomly) initialised by
# from_pretrained, and the Trainer only applies its own seed later, when it is
# constructed. Without this call the head weights differ on every run.
from transformers import set_seed

SEED = 42  # same value as the documented TrainingArguments default seed
set_seed(SEED)

num_labels = 2
model_name = "microsoft/codebert-base"

# Pretrained encoder with a sequence-classification head for num_labels classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="single_label_classification",
)

#set pad token id (keeps the model config in sync with the tokenizer)
model.config.pad_token_id = tokenizer.pad_token_id

# F1 metric object from the `evaluate` library. compute_metrics calls it with
# average="macro" and computes accuracy directly with numpy.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute macro F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict with the keys ``"macro_f1"`` and ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    f1_result = metric.compute(
        predictions=predicted,
        references=gold,
        average="macro",
    )
    return {
        "macro_f1": f1_result["f1"],
        "accuracy": (predicted == gold).mean(),
    }

# Per-device batch size, shared by training and evaluation.
batch_size = 16

#Training arguments
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="task_a_codebert",
    logging_dir="logs",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / checkpoint cadence, in optimizer steps
    logging_steps=200,
    save_steps=2000,
)

import inspect

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# The install at the top of the notebook is unpinned, so pick whichever
# keyword the installed Trainer actually accepts.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in trainer_init_params else "tokenizer"
)

# Trainer wired with the train/validation datasets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)


In [None]:

# Fine-tune the model, then evaluate it on the validation split
# (the Trainer was built with compute_metrics as its metric callback).
trainer.train()
eval_results = trainer.evaluate(eval_dataset=val_tok)
print("Validation results:", eval_results)

In [None]:
import numpy as np
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

#Prediction on validation set
val_outputs = trainer.predict(val_tok)
val_logits  = val_outputs.predictions            # [N_val, num_labels]
val_preds   = np.argmax(val_logits, axis=-1)     # predicted class ids
# Ground truth comes from the cleaned validation frame that val_tok was built from.
y_val_true = df_val["label"].astype(int).values

val_acc = accuracy_score(y_val_true, val_preds)
val_f1  = f1_score(y_val_true, val_preds, average="macro")

print(f"Validation Accuracy : {val_acc:.4f}")
print(f"Validation Macro F1 : {val_f1:.4f}")

#confusion matrix
cm = confusion_matrix(y_val_true, val_preds)
print("\nConfusion Matrix (rows=true, cols=pred):")
print(cm)

print("\nClassification Report:")
print(classification_report(y_val_true, val_preds, digits=4))

#probabilities (softmax over logits)
val_probs = torch.softmax(torch.from_numpy(val_logits), dim=-1).numpy()

# Preview at most 5 rows; a smaller validation set must not raise IndexError.
n_preview = min(5, len(y_val_true))
print("\nFirst 5 (true, pred, prob_0, prob_1):")
for i in range(n_preview):
    # "ŷ" (y-hat) marks the prediction; the original literal was mojibake ("Å·").
    print(
        f"i={i:3d} | y={y_val_true[i]} | ŷ={val_preds[i]} "
        f"| p0={val_probs[i,0]:.4f} | p1={val_probs[i,1]:.4f}"
    )

# Persist probabilities and labels so they can be reused for the ensemble later.
np.save("val_probs_codebert.npy", val_probs)
np.save("val_labels.npy", y_val_true)

print("\nSaved for ensemble:- val_probs_codebert.npy and val_labels.npy")





In [None]:
import os
from google.colab import drive

# Mount Drive (again) so this cell can be run on its own, then persist the
# fine-tuned model and its tokenizer side by side in one Drive folder.
drive.mount('/content/drive')

save_dir = "/content/drive/MyDrive/semeval_task13_models/task_a_codebert_v1"
os.makedirs(save_dir, exist_ok=True)

# save_model writes the fine-tuned weights and configuration;
# save_pretrained writes the tokenizer files.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("model and tokenizer saved to:", save_dir)


In [None]:
# Prediction on the test split.
test_tok.set_format(type="torch")

print("Running prediction on test set...")
test_outputs = trainer.predict(test_tok)
test_logits  = test_outputs.predictions

# Class probabilities (softmax over the logits) are kept for the ensemble model;
# the hard predictions are the argmax over the class axis.
test_probs = torch.from_numpy(test_logits).softmax(dim=-1).numpy()
test_preds = test_logits.argmax(axis=-1).astype(int)

for caption, value in (
    ("Test predictions shape:", test_preds.shape),
    ("Test probs shape      :", test_probs.shape),
    ("First 10 predictions  :", test_preds[:10]),
):
    print(caption, value)

np.save("test_probs_codebert.npy", test_probs)

print("\nSaved for ensemble:test_probs_codebert.npy")


#submission into kaggle using sample submission file
sample_filename = "sample_submission_a.csv"
sample_sub = pd.read_csv(sample_filename)
# Report the file only after it has actually been read, so the message is
# never printed for a file that failed to load.
print("Loaded file:", sample_filename)

print("Sample submission shape:", sample_sub.shape)
print("First rows of sample submission:")
print(sample_sub.head())
print("Number of test predictions:", len(test_preds))

#filling and saving label column
if len(sample_sub) != len(test_preds):
    # Row counts disagree: report it and write nothing rather than a misaligned file.
    print("Length mismatch: sample_sub rows:", len(sample_sub), "| test_preds:", len(test_preds))
else:
    # Prefer an explicit "label" column; otherwise fall back to the second column.
    if "label" in sample_sub.columns:
        label_col = "label"
    else:
        label_col = sample_sub.columns[1]

    sample_sub[label_col] = test_preds

    print("\nSubmission preview:")
    print(sample_sub.head())

    sub_path = "subtask_a_codebert_final.csv"
    sample_sub.to_csv(sub_path, index=False)
    print(f"\nSaved submission file: {sub_path}")
