In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

# Switch off Weights & Biases logging through its environment variables;
# this runs before any Trainer is created further down the notebook.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "disabled"})


In [None]:
# Mount Google Drive at /content/drive so that files stored there (the
# fine-tuned checkpoint loaded in the next cell lives under this path) are
# reachable. The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")


In [None]:
# Directory on the mounted Drive that holds the fine-tuned CodeBERT checkpoint.
CODEBERT_DIR = "/content/drive/MyDrive/semeval_task13_models/task_a_codebert_v1"

# Tokenizer and bare encoder (AutoModel, i.e. no classifier head) are both
# restored from that same directory.
tokenizer = AutoTokenizer.from_pretrained(CODEBERT_DIR)
codebert_encoder = AutoModel.from_pretrained(CODEBERT_DIR)

# Batching needs a pad token: when the tokenizer has none, reuse EOS, and fall
# back to CLS if EOS is missing or empty too.
if tokenizer.pad_token is None:
    _eos = tokenizer.eos_token
    tokenizer.pad_token = _eos if _eos else tokenizer.cls_token


In [None]:
!pip -q install kaggle

import os, shutil
from google.colab import files

os.makedirs('/root/.kaggle', exist_ok=True)
print("Upload kaggle.json ")
uploaded = files.upload()

fname = next(iter(uploaded.keys()))
shutil.move(fname, '/root/.kaggle/kaggle.json')
os.chmod('/root/.kaggle/kaggle.json', 0o600)

!kaggle --version
!kaggle datasets list -s "semeval task 13" | head -n 10

In [None]:
# Kaggle dataset identifier and the local directory it is downloaded into.
SLUG = "daniilor/semeval-2026-task13"
TARGET = "/content/semeval_task13"
# Shell steps ($SLUG / $TARGET are filled in from the Python variables above):
#   1. create the target directory if needed,
#   2. download the dataset archive into it,
#   3. unzip the downloaded archive in place (-o: overwrite without prompting).
!mkdir -p "$TARGET"
!kaggle datasets download -d "$SLUG" -p "$TARGET"
!unzip -o "$TARGET"/*.zip -d "$TARGET"

In [None]:
#step 1:
from pathlib import Path

import pandas as pd
import numpy as np
import joblib

from sklearn.metrics import f1_score, accuracy_score

# Directory the Kaggle archive was extracted into.
ROOT = Path("/content/semeval_task13")

# The archive may unpack into an additional "SemEval-2026-Task13" folder;
# descend into it when it exists, otherwise work from ROOT directly.
_nested_root = ROOT / "SemEval-2026-Task13"
BASE_DIR = _nested_root if _nested_root.exists() else ROOT

TASK_A_DIR = BASE_DIR / "task_a"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)
# List the contents of task_a so the expected parquet files can be checked by eye.
print("Files in task_a:")
for entry in TASK_A_DIR.iterdir():
    print(f" - {entry.name}")


In [None]:
# Read the three subtask-A parquet files (train / validation / test sample)
# from TASK_A_DIR into DataFrames.
train_path, val_path, test_path = (
    TASK_A_DIR / fname
    for fname in (
        "task_a_training_set_1.parquet",
        "task_a_validation_set.parquet",
        "task_a_test_set_sample.parquet",
    )
)

df_train, df_val, df_test = (
    pd.read_parquet(path) for path in (train_path, val_path, test_path)
)

# Report the size of each split and the columns available in the training set.
for caption, frame in (
    ("Train shape:", df_train),
    ("Val shape  :", df_val),
    ("Test shape :", df_test),
):
    print(caption, frame.shape)
print("\nColumns:", df_train.columns.tolist())

# Last expression of the cell: rendered as a preview of the training data.
df_train.head()


In [None]:
# Keep only what the model consumes. For train/val: select code + label, drop
# rows where either is missing, and cast the label to int. The test split
# carries no labels, so only its code column is copied.
_labelled_cols = ["code", "label"]
df_train = df_train[_labelled_cols].dropna().astype({"label": int})
df_val   = df_val[_labelled_cols].dropna().astype({"label": int})
df_test  = df_test[["code"]].copy()

# Shapes after cleaning, for comparison with the raw shapes printed earlier.
for caption, frame in (
    ("Train shape:", df_train),
    ("Val shape  :", df_val),
    ("Test shape :", df_test),
):
    print(caption, frame.shape)


In [None]:
# Wrap the cleaned DataFrames as Hugging Face Datasets (pandas index dropped).
# The labelled splits get their target column renamed from "label" to "labels"
# in the same expression; the test split has no label column to rename.
train_ds = Dataset.from_pandas(df_train, preserve_index=False).rename_column("label", "labels")
val_ds   = Dataset.from_pandas(df_val,   preserve_index=False).rename_column("label", "labels")
test_ds  = Dataset.from_pandas(df_test,  preserve_index=False)

# Upper bound on tokens per example; longer inputs are truncated by tokenize_fn.
max_length = 256

def tokenize_fn(batch):
    """Tokenize the "code" field of a batch with the module-level tokenizer.

    Inputs longer than the module-level ``max_length`` are truncated. No
    padding is requested here. Returns whatever the tokenizer returns for
    the batch.
    """
    encode_options = {"truncation": True, "max_length": max_length}
    return tokenizer(batch["code"], **encode_options)

# Tokenize each split in batches and drop the raw "code" text column, which is
# no longer needed once the token ids exist.
train_tok = train_ds.map(tokenize_fn, batched=True).remove_columns(["code"])
val_tok   = val_ds.map(tokenize_fn,   batched=True).remove_columns(["code"])
test_tok  = test_ds.map(tokenize_fn,  batched=True).remove_columns(["code"])

# Have every split hand out PyTorch tensors.
for _split in (train_tok, val_tok, test_tok):
    _split.set_format(type="torch")

# Collator that pads examples when a batch is assembled, using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Show the resulting dataset summaries (features and row counts).
for _split in (train_tok, val_tok, test_tok):
    print(_split)


In [None]:
#hybrid model(codebert+BiLSTM+linear layer)
class CodeBertBiLSTMClassifier(nn.Module):
    """Transformer encoder -> (Bi)LSTM -> dropout -> linear classification head.

    The encoder's token-level hidden states are fed to an LSTM. The LSTM's
    final hidden state(s) are used as the sequence representation and
    projected to ``num_labels`` logits.

    Pooling is padding-aware: sequences are packed by their true length
    (taken from ``attention_mask``) before the LSTM runs, so padding positions
    never enter the recurrence. The representation is the final forward state
    (after the last real token) concatenated with the final backward state
    (after the first token). Simply indexing the LSTM output at position -1
    would, for every sequence shorter than the batch maximum, read a state
    computed over padding, and a backward state that has seen a single token.

    Args:
        encoder: module exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        num_labels: number of output classes.
        lstm_hidden_size: hidden size of each LSTM direction.
        lstm_num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the pooled representation.
        freeze_encoder: when True, all encoder parameters get
            ``requires_grad = False`` so only the LSTM and head are trained.
    """

    def __init__(
        self,
        encoder,
        num_labels=2,
        lstm_hidden_size=256,
        lstm_num_layers=1,
        bidirectional=True,
        dropout=0.1,
        freeze_encoder=True,
    ):
        super().__init__()
        self.encoder = encoder
        self.num_labels = num_labels
        # Remembered so forward() knows how to assemble the pooled state.
        self.bidirectional = bidirectional

        hidden_size = encoder.config.hidden_size  # 768 for base CodeBERT

        #optionally freeze encoder to train only BiLSTM + classifier
        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False

        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )

        lstm_out_dim = lstm_hidden_size * (2 if bidirectional else 1)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_out_dim, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs,
    ):
        """Compute logits (and the cross-entropy loss when ``labels`` is given).

        Args:
            input_ids: token ids, [batch, seq_len].
            attention_mask: 1 for real tokens, 0 for padding, [batch, seq_len].
                Real tokens are assumed to come first (right padding), which
                is the layout sequence packing requires. When omitted, every
                position is treated as a real token.
            labels: optional class indices, [batch].
            **kwargs: accepted and ignored, so callers may pass extra batch keys.

        Returns:
            SequenceClassifierOutput with ``loss`` (or None) and ``logits``
            of shape [batch, num_labels].
        """
        #encoder outputs(sequence of hidden states)
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        #[batch, seq_len, hidden]
        last_hidden_state = encoder_outputs.last_hidden_state
        batch_size, seq_len = last_hidden_state.shape[:2]

        # True length of every sequence. pack_padded_sequence needs the
        # lengths on the CPU and rejects zero-length rows, hence cpu()/clamp.
        if attention_mask is None:
            lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
        else:
            lengths = attention_mask.long().sum(dim=1).clamp(min=1).cpu()

        # Pack so the recurrence stops at each sequence's last real token.
        packed = nn.utils.rnn.pack_padded_sequence(
            last_hidden_state,
            lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        # h_n: [num_layers * num_directions, batch, lstm_hidden_size], returned
        # in the original batch order.
        _, (h_n, _) = self.lstm(packed)

        if self.bidirectional:
            # Last layer: forward final state is h_n[-2], backward is h_n[-1].
            pooled = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            pooled = h_n[-1]
        #[batch, lstm_out_dim]

        logits = self.classifier(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )

# Build the hybrid model on top of the loaded CodeBERT encoder. The encoder is
# frozen, so training only updates the BiLSTM and the linear head.
_hybrid_config = {
    "num_labels": 2,
    "lstm_hidden_size": 256,
    "lstm_num_layers": 1,
    "bidirectional": True,
    "dropout": 0.1,
    "freeze_encoder": True,
}
hybrid_model = CodeBertBiLSTMClassifier(encoder=codebert_encoder, **_hybrid_config)


In [None]:
#metric function
def compute_metrics(eval_pred):
    """Turn (logits, labels) into macro-F1 and accuracy.

    ``eval_pred`` is unpacked into logits and gold labels; the predicted class
    is the argmax over the last logits axis. Returns a dict with the keys
    "macro_f1" and "accuracy".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "macro_f1": f1_score(gold, predicted, average="macro"),
        "accuracy": accuracy_score(gold, predicted),
    }


In [None]:
# Training configuration and Trainer for the hybrid model.

batch_size = 16  # used for both the train and the eval per-device batch size

training_args = TrainingArguments(
    # optimisation
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # outputs: temporary folder, and no checkpoints are written
    output_dir="task_a_codebert_bilstm",
    save_strategy="no",
    # logging: every 200 steps, no external reporting integrations
    logging_steps=200,
    logging_dir="logs_codebert_bilstm",
    report_to=[],
)

# Trainer wiring: tokenized train/validation splits, the padding collator and
# the macro-F1 / accuracy metric function.
trainer_hybrid = Trainer(
    model=hybrid_model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Train the hybrid model with the settings in training_args, then score it on
# the tokenized validation split. The printed dict is what evaluate() returns;
# it is expected to contain the values produced by compute_metrics.
trainer_hybrid.train()

eval_results = trainer_hybrid.evaluate(eval_dataset=val_tok)
print("Hybrid Validation results:", eval_results)


In [None]:
# Detailed validation report: predictions, headline scores, confusion matrix
# and the per-class classification report.
hyb_val_outputs = trainer_hybrid.predict(val_tok)
hyb_val_logits  = hyb_val_outputs.predictions
hyb_val_preds   = np.argmax(hyb_val_logits, axis=-1)

# Gold labels come from the cleaned validation DataFrame, in row order.
y_val_true = df_val["label"].to_numpy().astype(int)

val_acc = accuracy_score(y_val_true, hyb_val_preds)
val_f1  = f1_score(y_val_true, hyb_val_preds, average="macro")

print(f"\nHybrid Validation Accuracy : {val_acc:.4f}")
print(f"Hybrid Validation Macro F1 : {val_f1:.4f}")

cm = confusion_matrix(y_val_true, hyb_val_preds)
print("\nHybrid Confusion Matrix (rows=true, cols=pred):", cm, sep="\n")

print("\nHybrid Classification Report:")
print(classification_report(y_val_true, hyb_val_preds, digits=4))


In [None]:
# Run the trained hybrid model over the tokenized test split and reduce the
# logits to integer class ids (argmax over the last axis).
test_outputs = trainer_hybrid.predict(test_tok)
test_logits  = test_outputs.predictions
test_preds   = np.argmax(test_logits, axis=-1).astype(int)

# Quick look at the result.
print("Hybrid test preds shape:", test_preds.shape)
print("First 10 hybrid preds:", test_preds[:10])


In [None]:
# Fill the sample submission file with the hybrid model's test predictions.
sample_filename = "sample_submission_a.csv"
sample_sub = pd.read_csv(sample_filename)

print("Sample submission shape:", sample_sub.shape)
print("Number of test predictions:", len(test_preds))

if len(sample_sub) == len(test_preds):
    # Predictions go into the "label" column when the sample has one,
    # otherwise into its second column.
    label_col = "label" if "label" in sample_sub.columns else sample_sub.columns[1]
    sample_sub[label_col] = test_preds

    sub_path = "subtask_a_codebert_bilstm.csv"
    sample_sub.to_csv(sub_path, index=False)
    print(f"\nsaved hybrid submission file: {sub_path}")
    print(sample_sub.head())
else:
    # Row counts disagree: report it and write nothing.
    print("length mismatch: sample_sub rows:", len(sample_sub), "| test_preds:", len(test_preds))
