# 02_baseline_model.ipynb

## MAP - Charting Student Math Misunderstandings  
**Baseline modeling (classical ML)**

This notebook builds a strong, fast baseline using:
- text concatenation: `QuestionText + MC_Answer + StudentExplanation`
- TF‑IDF features
- Linear classifier (Logistic Regression / Linear SVM)
- validation split (stratified)
- submission file generation: a CSV with a single `Category:Misconception` prediction column


In [None]:
import sys
from pathlib import Path

# Project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to run from notebooks/, next to src/ and data/).
_working_dir = Path().resolve()
ROOT_DIR = _working_dir.parent

# Make the project root importable so `src.*` modules resolve.
sys.path.append(str(ROOT_DIR))

print("ROOT_DIR:", ROOT_DIR)


In [None]:
import numpy as np
import pandas as pd

from src.data_load import load_train, load_test

# Load both splits through the project loaders (train first, then test).
train, test = load_train(), load_test()

# Quick size check for each split.
for _label, _frame in (("train shape:", train), ("test shape :", test)):
    print(_label, _frame.shape)

train.head()


In [None]:
# --- Column normalization (robust to minor naming differences) ---
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with known column-name variants renamed.

    For each canonical name, the first listed variant found in ``df.columns``
    is renamed to it. Columns matching no variant are left untouched and the
    input frame is never modified.

    If the canonical column is already present, its variants are left as they
    are: renaming a variant onto an existing name would produce two columns
    with the same label, after which ``df[name]`` yields a DataFrame rather
    than a Series and downstream string operations break.

    Args:
        df: Frame whose columns may use alternative spellings.

    Returns:
        A new DataFrame using canonical column names wherever a variant matched.
    """
    # Canonical name -> alternative spellings, in priority order.
    candidates = {
        "StudentExplanation": ["Student Explanation", "Student_Explanation", "student_explanation"],
        "QuestionText": ["Question Text", "question_text"],
        "MC_Answer": ["MC Answer", "MCAnswer", "mc_answer"],
        "QuestionId": ["QuestionID", "question_id"],
        "Category": ["category"],
        "Misconception": ["misconception"],
    }

    rename_map = {}
    for target, variants in candidates.items():
        if target in df.columns:
            # Canonical column already exists; renaming a variant onto it
            # would create a duplicate label.
            continue
        for variant in variants:
            if variant in df.columns:
                rename_map[variant] = target
                break  # only the highest-priority variant is renamed

    # rename() returns a new frame, so the caller's frame stays untouched.
    return df.rename(columns=rename_map)

# Bring both splits onto the canonical column names.
train, test = (normalize_columns(frame) for frame in (train, test))

# Columns the text-building step reads from each split.
required_cols = ["QuestionId", "QuestionText", "MC_Answer", "StudentExplanation"]


def _missing_from(frame):
    """List the required columns absent from ``frame``, in ``required_cols`` order."""
    return [name for name in required_cols if name not in frame.columns]


missing_train = _missing_from(train)
missing_test = _missing_from(test)

# Diagnostic only: nothing is raised here when a column is missing.
print("Missing in train:", missing_train)
print("Missing in test :", missing_test)

train.columns.tolist()


In [None]:
# --- Target construction: Category:Misconception ---
def make_catmis(df: pd.DataFrame) -> pd.Series:
    """Build the combined ``"<Category>:<Misconception>"`` label for every row.

    ``Category`` is converted to text as-is. A missing ``Misconception`` value,
    or a frame without a ``Misconception`` column at all, contributes the
    literal string ``"NA"``.

    Args:
        df: Frame with a ``Category`` column and, optionally, ``Misconception``.

    Returns:
        A Series of combined labels sharing ``df``'s index.
    """
    category = df["Category"].astype(str)

    if "Misconception" not in df.columns:
        # No misconception information at all: every row gets "NA".
        misconception = pd.Series(["NA"] * len(df), index=df.index)
    else:
        raw = df["Misconception"]
        # Substitute "NA" for missing entries before converting to text.
        misconception = raw.mask(raw.isna(), "NA").astype(str)

    return category + ":" + misconception

# The supervised baseline needs the Category label to build its target.
_has_category = "Category" in train.columns
if not _has_category:
    raise ValueError("Train data must contain 'Category' for supervised baseline.")

train["CatMis"] = make_catmis(train)

# Show the ten most frequent combined labels.
_catmis_counts = train["CatMis"].value_counts()
_catmis_counts.head(10)


In [None]:
# --- Text building ---
def build_text(df: pd.DataFrame) -> pd.Series:
    """Join question, chosen answer and explanation into one tagged string per row.

    Each field is prefixed with an explicit marker (``[Q]``, ``[A]``, ``[E]``);
    missing values become empty strings and non-string values are converted
    to text.

    Args:
        df: Frame with ``QuestionText``, ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        A Series of the form ``"[Q] <question> [A] <answer> [E] <explanation>"``.
    """

    def _as_text(column: str) -> pd.Series:
        # Blank out missing values, then force everything to str.
        return df[column].fillna("").astype(str)

    # Using explicit tags often helps linear models
    question_part = "[Q] " + _as_text("QuestionText")
    answer_part = " [A] " + _as_text("MC_Answer")
    explanation_part = " [E] " + _as_text("StudentExplanation")

    return question_part + answer_part + explanation_part

# Attach the model input text to both splits (train first, then test).
for _frame in (train, test):
    _frame["text"] = build_text(_frame)

# Character-length summary of the training texts.
_train_text_lengths = train["text"].str.len()
_train_text_lengths.describe()


In [None]:
# --- Train/Validation split (stratified) ---
from sklearn.model_selection import train_test_split

# Features are the tagged texts; the target is the combined CatMis label.
X = train["text"].values
y = train["CatMis"].values

# Hold out 20% for validation, stratified on the label, with a fixed seed.
_split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **_split_options)

print("Train:", X_train.shape, "Val:", X_val.shape)


In [None]:
# --- Baseline 0: Majority class ---
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score

# Most frequent training label, predicted for every validation row.
_train_label_counts = Counter(y_train)
major = _train_label_counts.most_common(1)[0][0]
y_pred_major = np.array([major] * len(y_val))

_major_accuracy = accuracy_score(y_val, y_pred_major)
_major_macro_f1 = f1_score(y_val, y_pred_major, average="macro")

print("Majority label:", major)
print("Accuracy:", _major_accuracy)
print("Macro F1:", _major_macro_f1)


In [None]:
# --- Baseline 1: TF-IDF + Logistic Regression ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Unigram + bigram TF-IDF with sublinear term frequency; terms seen in fewer
# than 2 documents or in more than 95% of documents are dropped.
_lr_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# class_weight="balanced" helps with label imbalance.
_lr_classifier = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    class_weight="balanced",
)

tfidf_lr = Pipeline([("tfidf", _lr_vectorizer), ("clf", _lr_classifier)])

# Fit on the training split, then score on the held-out validation split.
tfidf_lr.fit(X_train, y_train)
pred_lr = tfidf_lr.predict(X_val)

_lr_accuracy = accuracy_score(y_val, pred_lr)
_lr_macro_f1 = f1_score(y_val, pred_lr, average="macro")

print("Accuracy:", _lr_accuracy)
print("Macro F1:", _lr_macro_f1)
print()
print(classification_report(y_val, pred_lr, digits=4))


In [None]:
# --- Optional: TF-IDF + Linear SVM (often strong for text) ---
from sklearn.svm import LinearSVC

# TF-IDF settings match the values used for the logistic-regression pipeline.
_svm_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
_svm_classifier = LinearSVC(class_weight="balanced")

tfidf_svm = Pipeline([("tfidf", _svm_vectorizer), ("clf", _svm_classifier)])

# Fit on the training split, then score on the held-out validation split.
tfidf_svm.fit(X_train, y_train)
pred_svm = tfidf_svm.predict(X_val)

_svm_accuracy = accuracy_score(y_val, pred_svm)
_svm_macro_f1 = f1_score(y_val, pred_svm, average="macro")

print("Accuracy:", _svm_accuracy)
print("Macro F1:", _svm_macro_f1)


In [None]:
# Pick the best model (by macro F1) for submission
f1_lr = f1_score(y_val, pred_lr, average="macro")
f1_svm = f1_score(y_val, pred_svm, average="macro")

# The SVM is chosen only when it is strictly better; ties go to logistic regression.
if f1_svm > f1_lr:
    best_model, best_name = tfidf_svm, "tfidf_svm"
else:
    best_model, best_name = tfidf_lr, "tfidf_lr"

print("Best model:", best_name, "| macro F1:", max(f1_lr, f1_svm))


In [None]:
# --- Train on full training data ---
# Refit the selected pipeline on every labelled row (training + validation).
# best_model is the same object as the pipeline it was selected from, so that
# pipeline is refitted in place.
_X_full = train["text"].values
_y_full = train["CatMis"].values
best_model.fit(_X_full, _y_full)


In [None]:
# --- Predict on test and create submission ---
_test_texts = test["text"].values
test_pred = best_model.predict(_test_texts)

# One predicted "Category:Misconception" label per test row.
sub = pd.DataFrame({"Category:Misconception": test_pred})

# If your competition expects an id column, uncomment and adjust:
# sub.insert(0, "QuestionId", test["QuestionId"].values)

# Written to the project root, without the DataFrame index.
out_path = ROOT_DIR.joinpath("submission.csv")
sub.to_csv(out_path, index=False)

out_path, sub.head()


In [None]:
# --- (Optional) Save model artifact ---
import joblib

# The fitted pipeline is stored in the project root as "<best_name>.joblib".
_artifact_name = f"{best_name}.joblib"
model_path = ROOT_DIR.joinpath(_artifact_name)
joblib.dump(best_model, model_path)

model_path


## Notes / Next improvements
- Try different text templates (e.g., `QuestionText + StudentExplanation` only).
- Add char‑level ngrams or use `HashingVectorizer` for speed.
- Move to a Transformer baseline (e.g., DeBERTa/RoBERTa) once classical baseline is established.
- Consider a **two‑stage baseline**: predict Category first, then Misconception only when Category indicates misconception.
