In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import accuracy_score, f1_score, classification_report
from google.colab import files



# Validation and test splits, stored as parquet files (downloaded from the
# SemEval 2026 Task 13 dataset).
val_path, test_path = "validation.parquet", "test.parquet"
df_val, df_test = (pd.read_parquet(path) for path in (val_path, test_path))

# Quick sanity output: frame dimensions and the validation schema.
print("Val shape :", df_val.shape)
print("Test shape:", df_test.shape)
print("Val columns:", df_val.columns.tolist())

# Ground-truth validation labels as an integer NumPy array.
y_true = df_val["label"].astype(int).to_numpy()

# Per-model probability arrays saved as .npy files: one "dev" (validation)
# array and one "test" array for each of the three models.
codebert_dev  = np.load("codebert_dev_probs_en.npy")
codebert_test = np.load("codebert_test_probs_en.npy")

graph_dev  = np.load("Graphcodebert_dev_probs_en.npy")
graph_test = np.load("Graphcodebert_test_probs_en.npy")

unix_dev  = np.load("unixcoder_dev_probs_weighted.npy")
unix_test = np.load("unixcoder_test_probs_weighted.npy")

print("Shapes:")
print("CodeBERT   validation:", codebert_dev.shape,   "test:", codebert_test.shape)
print("GraphCodeB validation:", graph_dev.shape,      "test:", graph_test.shape)
print("UniXcoder  validation:", unix_dev.shape,       "test:", unix_test.shape)

# Parallel lists: position i in each list refers to the same model.
MODEL_KEYS      = ["codebert", "graphcodebert", "unixcoder"]
dev_probs_list  = [codebert_dev, graph_dev, unix_dev]
test_probs_list = [codebert_test, graph_test, unix_test]

# Explicit checks instead of `assert` (asserts are stripped under `python -O`).
# The cross-model shape checks surface, with a readable message, the same
# mismatch that would otherwise fail later inside np.stack.
for _model_key, _dev_p, _test_p in zip(MODEL_KEYS, dev_probs_list, test_probs_list):
    if _dev_p.shape[0] != len(df_val):
        raise ValueError(
            f"{_model_key}: validation probabilities have {_dev_p.shape[0]} rows, "
            f"but the validation set has {len(df_val)}"
        )
    if _dev_p.shape != dev_probs_list[0].shape:
        raise ValueError(
            f"{_model_key}: validation probabilities have shape {_dev_p.shape}, "
            f"expected {dev_probs_list[0].shape} (same as {MODEL_KEYS[0]})"
        )
    if _test_p.shape != test_probs_list[0].shape:
        raise ValueError(
            f"{_model_key}: test probabilities have shape {_test_p.shape}, "
            f"expected {test_probs_list[0].shape} (same as {MODEL_KEYS[0]})"
        )

# Non-fatal: the test arrays are presumably aligned row-for-row with df_test;
# flag a difference early rather than only at submission time.
if test_probs_list[0].shape[0] != len(df_test):
    print(
        "WARNING: test probabilities have", test_probs_list[0].shape[0],
        "rows but the test set has", len(df_test),
    )



print("\neach model evaluation metrics")
# Validation macro-F1 of each model, in MODEL_KEYS order.
model_f1s = []

# Score every model on the validation set using its highest-probability class.
for key, dev_p in zip(MODEL_KEYS, dev_probs_list):
    preds = dev_p.argmax(axis=1)
    acc   = accuracy_score(y_true, preds)
    f1    = f1_score(y_true, preds, average="macro")
    model_f1s.append(f1)
    # The separator was mojibake ("â€”": a UTF-8 em dash decoded as cp1252).
    # Written as an escape so it cannot be corrupted by a re-encoding again.
    print(f"{key:12s} \u2014 Accuracy: {acc:.4f} | Macro-F1: {f1:.6f}")

# Weighted ensemble: each model's weight is its validation macro-F1,
# normalised so the weights sum to 1.
weights = np.array(model_f1s, dtype="float64")
_weight_total = weights.sum()
if _weight_total > 0:
    weights = weights / _weight_total
else:
    # If every macro-F1 is 0 the division yields NaN weights, and argmax over
    # NaN probabilities silently returns class 0 for every row. Use a plain
    # (uniform) average instead.
    print("WARNING: all validation macro-F1 scores are 0; using uniform weights")
    weights = np.full_like(weights, 1.0 / max(len(weights), 1))

print("\nEnsemble weights (by validation macro-F1):")
for k, w in zip(MODEL_KEYS, weights):
    print(f"  {k:12s}: {w:.4f}")

# Stack to (num_models, num_dev, num_classes).
dev_stack = np.stack(dev_probs_list, axis=0)

# Contract the model axis: weighted average of the probabilities over models.
dev_ens_probs = np.tensordot(weights, dev_stack, axes=(0, 0))
dev_ens_preds = dev_ens_probs.argmax(axis=1)

ens_acc = accuracy_score(y_true, dev_ens_preds)
ens_f1  = f1_score(y_true, dev_ens_preds, average="macro")

print("\nWeighted Ensemble (CodeBERT + GraphCodeBERT + UniXcoder)")
print(f"validation Accuracy: {ens_acc:.6f}")
print(f"validation Macro-F1: {ens_f1:.6f}")

print("\nClassification report (ensemble):")
print(classification_report(y_true, dev_ens_preds, digits=3))


# Apply the validation-derived weights to the test probabilities: stack the
# per-model arrays along a new leading model axis, then contract that axis
# against the weight vector.
test_stack = np.stack(test_probs_list)
test_ens_probs = np.tensordot(weights, test_stack, axes=([0], [0]))
test_ens_preds = np.argmax(test_ens_probs, axis=1).astype(int)

print("\nTest ensemble predictions:")
print("Num test preds:", test_ens_preds.shape[0])
print("First 10 labels:", test_ens_preds[:10])


# Labels that go into the submission file.
test_labels_for_sub = test_ens_preds
# Build the submission CSV by filling the label column of an uploaded
# sample-submission file with the ensemble's test predictions.
print("\nUploading sample submission file")
uploaded = files.upload()

# `uploaded` maps file names to contents. With nothing uploaded, taking the
# first key would raise a bare StopIteration, so fail with a clear message.
if not uploaded:
    raise RuntimeError("No sample submission file was uploaded.")

sample_filename = next(iter(uploaded))
print("Loaded file:", sample_filename)

sample_sub = pd.read_csv(sample_filename)
print("Sample submission shape:", sample_sub.shape)
print(sample_sub.head())

if len(sample_sub) != len(test_labels_for_sub):
    # Best-effort: report the mismatch and skip writing the file.
    print("Length mismatch: sample_sub rows:", len(sample_sub), "| test_preds:", len(test_labels_for_sub))
else:
    # Prefer an existing "label" column, otherwise the second column.
    # A single-column sample file has no second column, so a new "label"
    # column is added instead of raising IndexError.
    if "label" in sample_sub.columns or len(sample_sub.columns) < 2:
        label_col = "label"
    else:
        label_col = sample_sub.columns[1]

    sample_sub[label_col] = test_labels_for_sub

    print("\nSubmission preview (ensemble):")
    print(sample_sub.head())

    out_name = "subtask_b_ensemble_weighted.csv"
    sample_sub.to_csv(out_name, index=False)
    print(f"\nSaved ensemble submission file: {out_name}")
