In [None]:
import os, shutil
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib


In [None]:
# Resolve the dataset root: use the "SemEval-2026-Task13" sub-folder of TARGET
# when it exists, otherwise TARGET itself.
# NOTE(review): TARGET is not defined in any visible cell — confirm it is set
# before this cell runs on a fresh kernel.
ROOT = Path(TARGET)
_repo_dir = ROOT / "SemEval-2026-Task13"
BASE_DIR = _repo_dir if _repo_dir.exists() else ROOT

TASK_B_DIR = BASE_DIR / "task_b"
print("TASK_B_DIR:", TASK_B_DIR)

# Subtask B parquet files (train / validation / test sample).
train_path, val_path, test_path = (
    TASK_B_DIR / "task_b_training_set.parquet",
    TASK_B_DIR / "task_b_validation_set.parquet",
    TASK_B_DIR / "task_b_test_set_sample.parquet",
)

# Read the three splits in the same order as the paths above.
df_train, df_val, df_test = map(pd.read_parquet, (train_path, val_path, test_path))

print("Train shape:", df_train.shape)
print("Val shape  :", df_val.shape)
print("Test shape :", df_test.shape)
print("Train columns:", df_train.columns.tolist())

# Last expression of the cell: rich preview of the training frame.
df_train.head()

In [None]:
# Integer labels are read for the train and validation splits only;
# no label column is read from the test frame in this cell.
y_train, y_val = (
    frame["label"].astype(int).values for frame in (df_train, df_val)
)

# Raw source-code strings for every split, as NumPy arrays of str.
X_train_texts, X_val_texts, X_test_texts = (
    frame["code"].astype(str).values for frame in (df_train, df_val, df_test)
)

# Show at most the first 300 characters of the first training snippet.
print("Example code snippet:\n", X_train_texts[0][0:300])



In [None]:
# Character-level TF-IDF over 3- to 6-grams; min_df and max_features bound
# the vocabulary size.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 6), min_df=5, max_features=200_000)

print("Fitting TF-IDF on train...")
# The vocabulary is fitted on the training texts only; validation and test
# texts are projected onto that fitted vocabulary.
X_train = tfidf.fit_transform(X_train_texts)
X_val, X_test = (tfidf.transform(texts) for texts in (X_val_texts, X_test_texts))

print("X_train shape:", X_train.shape)
print("X_val shape  :", X_val.shape)
print("X_test shape :", X_test.shape)


In [None]:
# Multinomial logistic regression trained on the TF-IDF features.
# The "saga" solver shuffles the data while optimising, so random_state is
# fixed here to make Restart & Run All reproduce the same model.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+, as far as I know) — confirm the installed version before removing it.
lr = LogisticRegression(
    C=1.0,
    max_iter=200,
    n_jobs=-1,
    solver="saga",
    multi_class="multinomial",
    random_state=42,
    verbose=1,
)

print("Training Logistic Regression...")
lr.fit(X_train, y_train)

print("training successfully completed.")


In [None]:
# Score the fitted model on the validation split.
val_preds = lr.predict(X_val)

val_acc = accuracy_score(y_val, val_preds)
# Macro averaging weights every class equally, regardless of its support.
val_f1  = f1_score(y_val, val_preds, average="macro")

# The separator was a mis-decoded UTF-8 dash ("â€”"); restored to an em dash.
print(f"Validation — Accuracy: {val_acc:.4f} | Macro F1: {val_f1:.4f}")
print("\nClassification report:\n")
print(classification_report(y_val, val_preds, digits=3))


In [None]:
# Colab-only upload helper; this cell requires a Google Colab runtime.
from google.colab import files

print("Please upload your Task B sample_submission.csv file (from the competition site)...")
uploaded_sub = files.upload()

# A cancelled or empty upload would otherwise surface as a bare StopIteration
# from next(); fail with an actionable message instead.
if not uploaded_sub:
    raise ValueError(
        "No file was uploaded; re-run this cell and select the Task B sample_submission.csv."
    )

# Only the first uploaded file is used.
sample_filename = next(iter(uploaded_sub))
print("Loaded file:", sample_filename)

sample_sub = pd.read_csv(sample_filename)
print("Sample submission shape:", sample_sub.shape)
print(sample_sub.head())


In [None]:
# Predict on the test features, then cast the predicted labels to int.
test_preds = lr.predict(X_test)
test_preds = test_preds.astype(int)
print("Number of test predictions:", test_preds.shape[0])
print("First 10 test predictions:", test_preds[0:10])


In [None]:
# Fill the uploaded template with the predictions and write it out as CSV.
# NOTE(review): on a row-count mismatch only a message is printed and no file
# is written — confirm this is the intended behaviour.
if len(sample_sub) == len(test_preds):
    # Prefer a column literally named "label"; otherwise fall back to the
    # second column of the template.
    label_col = "label" if "label" in sample_sub.columns else sample_sub.columns[1]

    sample_sub[label_col] = test_preds

    print("\nSubmission preview:")
    print(sample_sub.head())

    out_name = "subtask_b_tfidf_lr.csv"
    sample_sub.to_csv(out_name, index=False)
    print(f"\nSaved submission file: {out_name}")
else:
    print("length mismatch: sample_sub rows:", len(sample_sub), "| test_preds:", len(test_preds))
