In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import os

# Set WANDB_DISABLED for this session (presumably to keep Weights & Biases
# logging switched off -- confirm against the training utilities in use).
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the CodeBERT tokenizer and encoder from the same checkpoint name and
# move the encoder to the selected device.
codebert_name = "microsoft/codebert-base"
tokenizer_cb = AutoTokenizer.from_pretrained(codebert_name)
model_cb = AutoModel.from_pretrained(codebert_name)
model_cb = model_cb.to(device)
model_cb.eval()

# Freeze the encoder: no parameter requires gradients from here on.
model_cb.requires_grad_(False)

# Settings read by the tokenization and feature-extraction cells below.
max_length = 256
batch_size = 16


In [None]:
# Wrap the pandas splits as Hugging Face Datasets without carrying the index.
# Train/validation keep "code" and "label"; the test split keeps "code" only.
# NOTE(review): df_train / df_val / df_test are not defined in the cells shown
# here -- they must already exist in the kernel.
labelled_columns = ["code", "label"]
train_ds = Dataset.from_pandas(df_train[labelled_columns], preserve_index=False)
val_ds = Dataset.from_pandas(df_val[labelled_columns], preserve_index=False)
test_ds = Dataset.from_pandas(df_test[["code"]], preserve_index=False)

#Tokenize
def tok_fn(batch):
    """Tokenize the ``"code"`` entries of a batch with ``tokenizer_cb``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and the notebook-level ``max_length``; its output is returned unchanged.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    code_snippets = batch["code"]
    return tokenizer_cb(code_snippets, **tokenizer_options)

# Tokenize each split in batches, dropping the raw "code" column. The labelled
# splits additionally get their "label" column renamed to "labels".
train_tok = (
    train_ds
    .map(tok_fn, batched=True, remove_columns=["code"])
    .rename_column("label", "labels")
)
val_tok = (
    val_ds
    .map(tok_fn, batched=True, remove_columns=["code"])
    .rename_column("label", "labels")
)
test_tok = test_ds.map(tok_fn, batched=True, remove_columns=["code"])

# Switch every tokenized split to the "torch" output format.
for tokenized_split in (train_tok, val_tok, test_tok):
    tokenized_split.set_format("torch")

print(train_tok, val_tok, test_tok)


In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

def extract_features(hf_dataset):
    """Run ``model_cb`` over ``hf_dataset`` and collect position-0 hidden states.

    Batches are read in dataset order (``shuffle=False``), ``batch_size`` rows
    at a time, under ``torch.inference_mode()``. For each batch the slice
    ``last_hidden_state[:, 0, :]`` is moved to the CPU and stored as NumPy.

    Args:
        hf_dataset: Dataset whose batches provide ``"input_ids"`` and
            ``"attention_mask"`` tensors and, optionally, ``"labels"``.

    Returns:
        ``(features, labels)`` as NumPy arrays when any batch carried a
        ``"labels"`` entry; otherwise only the ``features`` array.
    """
    feature_chunks = []
    label_chunks = []
    batches = DataLoader(hf_dataset, batch_size=batch_size, shuffle=False)

    with torch.inference_mode():
        for batch in tqdm(batches, desc="Extracting CodeBERT features"):
            # Only the two encoder inputs are moved to the model's device.
            encoder_inputs = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask")
            }
            hidden_states = model_cb(**encoder_inputs).last_hidden_state
            feature_chunks.append(hidden_states[:, 0, :].cpu().numpy())

            if "labels" in batch:
                label_chunks.append(batch["labels"].cpu().numpy())

    features = np.concatenate(feature_chunks, axis=0)
    if not label_chunks:
        # Unlabelled dataset: callers receive the bare feature matrix.
        return features
    return features, np.concatenate(label_chunks, axis=0)

# Labelled splits unpack into (features, labels); the test split has no
# "labels" column, so extract_features returns the feature matrix alone.
X_train, y_train = extract_features(train_tok)
X_val, y_val = extract_features(val_tok)
X_test = extract_features(test_tok)

print(*(features.shape for features in (X_train, X_val, X_test)))


In [None]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os

# 1) Fit a liblinear logistic regression on the training features.
print("Training Logistic Regression on CodeBERT features...")
lr_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(
    X_train, y_train
)

# 2) Predict labels for the test features.
print("Predicting...")
lr_test_preds = lr_model.predict(X_test)

# 3) Load the sample submission and set its "label" column to the predictions.
sample_file = "sample_submission_a.csv"
submission_df = pd.read_csv(sample_file).assign(label=lr_test_preds)

# 4) Create the output folder if needed and write the submission CSV there.
SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/codebert_lr_256"
os.makedirs(SAVE_DIR, exist_ok=True)

sub_path = SAVE_DIR + "/submission_codebert_lr.csv"
submission_df.to_csv(sub_path, index=False)
print("Saved:", sub_path)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import os


# Second classifier on the same features: class-balanced weights, a higher
# iteration cap, and the estimator's default solver.
lr = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
lr = lr.fit(X_train, y_train)  # fit() returns the estimator itself
print("Logistic Regression trained")


# predict_proba returns one column per entry of lr.classes_, in that order.
# The argmax over columns is therefore an index INTO classes_, not a label:
# map it back through classes_ so predictions stay correct even when the
# label values are not exactly 0..K-1.
val_probs = lr.predict_proba(X_val)           # [N_val, n_classes]
val_preds = lr.classes_[np.argmax(val_probs, axis=-1)]

val_acc = accuracy_score(y_val, val_preds)
val_f1  = f1_score(y_val, val_preds, average="macro")

print("\nCodeBERT Frozen + LR Validation Accuracy:", round(val_acc, 4))
print("\nCodeBERT Frozen + LR Validation Macro F1 :", round(val_f1, 4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_preds))

print("\nClassification Report:")
print(classification_report(y_val, val_preds, digits=4))


# Same index -> label mapping for the test split; the submission expects
# integer labels, hence the explicit cast.
test_probs = lr.predict_proba(X_test)         # [N_test, n_classes]
test_preds = lr.classes_[np.argmax(test_probs, axis=-1)].astype(int)

print("\nTest probs shape:", test_probs.shape)
print("First 10 test preds:", test_preds[:10])


# Make sure the output folder exists before anything is written to it.
SAVE_DIR = "/content/drive/MyDrive/semeval_outputs/codebert_lr_256"
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the sample submission and set its "label" column to the predictions
# from the class-balanced model.
sample_sub = pd.read_csv("sample_submission_a.csv").assign(label=test_preds)

sub_path = SAVE_DIR + "/subtask_a_codebert_lr_256.csv"
sample_sub.to_csv(sub_path, index=False)

print("\nSubmission CSV saved to Drive:", sub_path, sep="\n")
sample_sub.head()
