In [None]:
!pip -q install transformers

from pathlib import Path
import numpy as np
import pandas as pd

import torch
from torch.nn.functional import normalize
from transformers import AutoTokenizer, AutoModel

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from google.colab import files


In [None]:

# Resolve the dataset root: use the nested "SemEval-2026-Task13" folder when it
# exists, otherwise fall back to ROOT itself.
ROOT = Path("/content/semeval_task13")
BASE_DIR = ROOT / "SemEval-2026-Task13"
if not BASE_DIR.exists():
    BASE_DIR = ROOT

TASK_B_DIR = BASE_DIR / "task_b"
print("TASK_B_DIR:", TASK_B_DIR)

# Train/validation files live under TASK_B_DIR; the test file is a bare
# relative path, so it is resolved against the current working directory.
train_path = TASK_B_DIR / "task_b_training_set.parquet"
val_path = TASK_B_DIR / "task_b_validation_set.parquet"
test_path = "test.parquet"

# Read the three splits in a fixed order: train, validation, test.
df_train, df_val, df_test = (
    pd.read_parquet(parquet_file)
    for parquet_file in (train_path, val_path, test_path)
)

print("Train:", df_train.shape, "Val:", df_val.shape, "Test:", df_test.shape)
print(df_train.head(2))


In [None]:
# Checkpoint whose hidden states are used as features further down.
MODEL_NAME = "microsoft/codebert-base"

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# .to() and .eval() each return the module, so the calls can be chained;
# eval mode switches off dropout for inference.
encoder = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

print("Hidden size:", encoder.config.hidden_size)


In [None]:
def encode_texts_to_embeddings(texts, batch_size=16, max_length=256):
    """Embed texts with the module-level encoder using masked mean pooling.

    Relies on the module-level ``tokenizer``, ``encoder`` and ``device``.
    Gradients are disabled for the forward pass.

    Args:
        texts: Sequence of strings (list, NumPy array, pandas Series, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated by the tokenizer.

    Returns:
        A NumPy array of shape ``(N, H)`` with one row per input text, where
        ``H`` is ``encoder.config.hidden_size``. An empty input returns an
        array of shape ``(0, H)`` instead of raising inside ``np.vstack``.
    """
    # Materialize once so slicing below is always positional, whatever
    # sequence type the caller passed in.
    texts = list(texts)
    if not texts:
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    all_embs = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        enc = tokenizer(
            batch_texts,
            # Pad only up to the longest text in this batch. Padded positions
            # are excluded by the attention mask and by the pooling below, so
            # padding every batch to max_length only adds wasted compute.
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            hidden = encoder(**enc).last_hidden_state      # (B, L, H)
            mask = enc["attention_mask"].unsqueeze(-1)     # (B, L, 1)

            # Mean pooling over real (non-padding) tokens only.
            summed = (hidden * mask).sum(dim=1)            # (B, H)
            # clamp guards against division by zero for an all-padding row.
            counts = mask.sum(dim=1).clamp(min=1)          # (B, 1)
            mean_pooled = summed / counts                  # (B, H)

        all_embs.append(mean_pooled.cpu().numpy())

    return np.vstack(all_embs)  # (N, H)


In [None]:
# Source-code strings for every split, cast to str before encoding.
X_train_texts, X_val_texts, X_test_texts = (
    frame["code"].astype(str).values
    for frame in (df_train, df_val, df_test)
)

# Integer class labels; only the train and validation frames are read for them.
y_train, y_val = (
    frame["label"].astype(int).values
    for frame in (df_train, df_val)
)

# Identical encoder settings for all three splits.
encode_kwargs = {"batch_size": 16, "max_length": 256}

print("Encoding train...")
X_train = encode_texts_to_embeddings(X_train_texts, **encode_kwargs)
print("Encoding val...")
X_val = encode_texts_to_embeddings(X_val_texts, **encode_kwargs)
print("Encoding test...")
X_test = encode_texts_to_embeddings(X_test_texts, **encode_kwargs)

print("X_train shape:", X_train.shape)
print("X_val shape  :", X_val.shape)
print("X_test shape :", X_test.shape)


In [None]:
# Logistic-regression head trained on the precomputed embeddings.
# `multi_class` is deliberately not passed: scikit-learn deprecated it in 1.5
# and passing it emits a FutureWarning (and will fail once it is removed).
# With the default lbfgs solver the multinomial objective is already what gets
# fitted when there are more than two classes.
# NOTE(review): for a strictly binary label set the default is a plain binary
# logistic model rather than a 2-class softmax — confirm the task is multi-class.
lr = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    C=1.0,
)

print("Training LR on frozen CodeBERT features")
lr.fit(X_train, y_train)
print("successfully completed")


In [None]:
# Score the classifier on the validation embeddings.
val_preds = lr.predict(X_val)

val_acc = accuracy_score(y_val, val_preds)
# Macro averaging weights every class equally, regardless of its support.
val_f1 = f1_score(y_val, val_preds, average="macro")

# The separator was a mis-decoded em dash ("â€”") in the printed summary;
# restored to the intended character.
print(f"Frozen CodeBERT+LR — validation Accuracy: {val_acc:.4f} | Macro-F1: {val_f1:.4f}")
print("\nClassification report:\n")
print(classification_report(y_val, val_preds, digits=3))


In [None]:
# Predict a class for every test embedding, then force an integer dtype so the
# labels are written to the submission as whole numbers.
test_preds = lr.predict(X_test)
test_preds = test_preds.astype(int)

# Quick sanity check: how many predictions, and what the first few look like.
print("Num test preds:", len(test_preds))
print("First 10 test labels:", test_preds[:10])


In [None]:
# Use the provided sample submission as a template and fill in predictions.
sample_filename = "sample_submission_b.csv"
sample_sub = pd.read_csv(sample_filename)
print("Sample submission shape:", sample_sub.shape)

if len(sample_sub) == len(test_preds):
    # Prefer an explicit "label" column; otherwise use the second column.
    label_col = "label" if "label" in sample_sub.columns else sample_sub.columns[1]

    # Predictions are assigned positionally.
    # NOTE(review): assumes the template rows are in the same order as df_test
    # — confirm against the ID column before submitting.
    sample_sub[label_col] = test_preds
    print("\nSubmission preview (Frozen CodeBERT + LR):")
    print(sample_sub.head())

    out_name = "subtask_b_frozen_codebert_lr.csv"
    sample_sub.to_csv(out_name, index=False)
    print(f"\n Saved submission file: {out_name}")
else:
    # Row counts differ: report it and write nothing.
    print(" Length mismatch:", len(sample_sub), "vs", len(test_preds))
