# 02_baseline_model.ipynb

## MAP - Charting Student Math Misunderstandings  
**Baseline modeling (classical ML)**

This notebook builds a strong, fast baseline using:
- text concatenation: `QuestionText + MC_Answer + StudentExplanation`
- TF‑IDF features
- Linear classifier (Logistic Regression / Linear SVM)
- validation split (stratified)
- submission file generation: `Category:Misconception`


In [1]:
import sys
from pathlib import Path

# Project root (assuming notebooks/ is alongside src/ and data/)
ROOT_DIR = Path().resolve().parent
sys.path.append(str(ROOT_DIR))

print("ROOT_DIR:", ROOT_DIR)


ROOT_DIR: C:\Users\USER\Desktop\HJ\NLP


In [2]:
import numpy as np
import pandas as pd

from src.data_load import load_train, load_test

train = load_train()
test = load_test()

print("train shape:", train.shape)
print("test shape :", test.shape)

train.head()


train shape: (36696, 7)
test shape : (3, 5)


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,


In [3]:
# --- Column normalization (robust to minor naming differences) ---
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    rename_map = {}

    # Common variants we might encounter
    candidates = {
        "StudentExplanation": ["Student Explanation", "Student_Explanation", "StudentExplanation", "student_explanation"],
        "QuestionText": ["QuestionText", "Question Text", "question_text"],
        "MC_Answer": ["MC_Answer", "MC Answer", "MCAnswer", "mc_answer"],
        "QuestionId": ["QuestionId", "QuestionID", "question_id"],
        "Category": ["Category", "category"],
        "Misconception": ["Misconception", "misconception"],
    }

    for target, variants in candidates.items():
        for v in variants:
            if v in df.columns:
                rename_map[v] = target
                break

    df = df.rename(columns=rename_map)
    return df

train = normalize_columns(train)
test = normalize_columns(test)

required_cols = ["QuestionId", "QuestionText", "MC_Answer", "StudentExplanation"]
missing_train = [c for c in required_cols if c not in train.columns]
missing_test  = [c for c in required_cols if c not in test.columns]

print("Missing in train:", missing_train)
print("Missing in test :", missing_test)

train.columns.tolist()


Missing in train: []
Missing in test : []


['row_id',
 'QuestionId',
 'QuestionText',
 'MC_Answer',
 'StudentExplanation',
 'Category',
 'Misconception']

In [4]:
# --- Target construction: Category:Misconception ---
# Misconception is only meaningful when Category indicates misconception; otherwise it is NA.
def make_catmis(df: pd.DataFrame) -> pd.Series:
    cat = df["Category"].astype(str)
    if "Misconception" in df.columns:
        mc = df["Misconception"]
        mc = mc.where(mc.notna(), "NA").astype(str)
    else:
        mc = pd.Series(["NA"] * len(df), index=df.index)

    return cat + ":" + mc

if "Category" not in train.columns:
    raise ValueError("Train data must contain 'Category' for supervised baseline.")

train["CatMis"] = make_catmis(train)
train["CatMis"].value_counts().head(10)


CatMis
True_Correct:NA                       14802
False_Neither:NA                       6542
True_Neither:NA                        5265
False_Misconception:Incomplete         1446
False_Misconception:Additive            891
False_Misconception:Duplication         698
False_Misconception:Subtraction         618
False_Misconception:Positive            564
False_Misconception:Wrong_term          550
False_Misconception:Wrong_fraction      412
Name: count, dtype: int64

In [5]:
# --- Text building ---
def build_text(df: pd.DataFrame) -> pd.Series:
    q = df["QuestionText"].fillna("").astype(str)
    a = df["MC_Answer"].fillna("").astype(str)
    e = df["StudentExplanation"].fillna("").astype(str)
    # Using explicit tags often helps linear models
    return ("[Q] " + q + " [A] " + a + " [E] " + e)

train["text"] = build_text(train)
test["text"]  = build_text(test)

train["text"].str.len().describe()


count    36696.000000
mean       194.643149
std         76.867902
min         68.000000
25%        127.000000
50%        186.000000
75%        248.000000
max        655.000000
Name: text, dtype: float64

In [7]:
from collections import Counter

Counter(y)

Counter({'True_Correct:NA': 14802,
         'False_Neither:NA': 6542,
         'True_Neither:NA': 5265,
         'False_Misconception:Incomplete': 1446,
         'False_Misconception:Additive': 891,
         'False_Misconception:Duplication': 698,
         'False_Misconception:Subtraction': 618,
         'False_Misconception:Positive': 564,
         'False_Misconception:Wrong_term': 550,
         'False_Misconception:Wrong_fraction': 412,
         'False_Misconception:Irrelevant': 409,
         'False_Misconception:Inversion': 409,
         'False_Misconception:Mult': 345,
         'False_Misconception:Denominator-only_change': 332,
         'False_Misconception:Whole_numbers_larger': 328,
         'False_Misconception:Adding_across': 306,
         'False_Misconception:WNB': 291,
         'False_Misconception:Unknowable': 282,
         'False_Misconception:Wrong_Fraction': 273,
         'False_Correct:NA': 227,
         'False_Misconception:SwapDividend': 198,
         'False_Misconcep

In [8]:
# --- Train/Validation split (stratified) ---

# NOTE:
# Some Category–Misconception classes appear only once in the dataset.
# Stratified splitting requires at least 2 samples per class,
# so we remove singleton classes for the baseline experiment.


from sklearn.model_selection import train_test_split
import pandas as pd

# 1. 최소 샘플 수 확인
vc = train["CatMis"].value_counts()

# 2. stratify 가능한 클래스만 남기기 (>=2)
valid_classes = vc[vc >= 2].index

train_strat = train[train["CatMis"].isin(valid_classes)].reset_index(drop=True)

# 3. split
X = train_strat["text"].values
y = train_strat["CatMis"].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, "Val:", X_val.shape)
print("Removed samples:", len(train) - len(train_strat))


Train: (29352,) Val: (7339,)
Removed samples: 5


In [9]:
# --- Baseline 0: Majority class ---
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score

major = Counter(y_train).most_common(1)[0][0]
y_pred_major = np.array([major] * len(y_val))

print("Majority label:", major)
print("Accuracy:", accuracy_score(y_val, y_pred_major))
print("Macro F1:", f1_score(y_val, y_pred_major, average="macro"))


Majority label: True_Correct:NA
Accuracy: 0.4034609619839215
Macro F1: 0.010453662842012356


In [10]:
# --- Baseline 1: TF‑IDF + Logistic Regression ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

tfidf_lr = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        n_jobs=-1,
        class_weight="balanced"  # helps with label imbalance
    ))
])

tfidf_lr.fit(X_train, y_train)
pred_lr = tfidf_lr.predict(X_val)

print("Accuracy:", accuracy_score(y_val, pred_lr))
print("Macro F1:", f1_score(y_val, pred_lr, average="macro"))
print()
print(classification_report(y_val, pred_lr, digits=4))


Accuracy: 0.42103828859517645
Macro F1: 0.3268523346672585

                                                            precision    recall  f1-score   support

                                          False_Correct:NA     0.1280    0.3556    0.1882        45
                         False_Misconception:Adding_across     0.6250    0.9836    0.7643        61
                          False_Misconception:Adding_terms     0.6207    0.9474    0.7500        19
                              False_Misconception:Additive     0.5153    0.9438    0.6667       178
                             False_Misconception:Base_rate     0.3077    1.0000    0.4706         4
                             False_Misconception:Certainty     0.0952    1.0000    0.1739         4
                            False_Misconception:Definition     0.0943    1.0000    0.1724        10
               False_Misconception:Denominator-only_change     0.3824    0.9848    0.5508        66
                              False_Mis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [11]:
import numpy as np
from collections import Counter

pred_counts = Counter(pred_lr)
true_counts = Counter(y_val)

no_pred = [c for c in true_counts if pred_counts.get(c, 0) == 0]
no_true = [c for c in pred_counts if true_counts.get(c, 0) == 0]  # 거의 없을 수도

print("Classes with NO predicted samples (but exist in val):", len(no_pred))
print(no_pred[:20])

Classes with NO predicted samples (but exist in val): 2
['True_Misconception:Inversion', 'False_Misconception:Incorrect_equivalent_fraction_addition']


In [12]:
# --- Optional: TF‑IDF + Linear SVM (often strong for text) ---
from sklearn.svm import LinearSVC

tfidf_svm = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LinearSVC(class_weight="balanced"))
])

tfidf_svm.fit(X_train, y_train)
pred_svm = tfidf_svm.predict(X_val)

print("Accuracy:", accuracy_score(y_val, pred_svm))
print("Macro F1:", f1_score(y_val, pred_svm, average="macro"))


Accuracy: 0.7090884316664395
Macro F1: 0.4446113131424601


In [13]:
# Pick the best model (by macro F1) for submission
best_model = tfidf_lr
best_name = "tfidf_lr"

f1_lr = f1_score(y_val, pred_lr, average="macro")
f1_svm = f1_score(y_val, pred_svm, average="macro")

if f1_svm > f1_lr:
    best_model = tfidf_svm
    best_name = "tfidf_svm"

print("Best model:", best_name, "| macro F1:", max(f1_lr, f1_svm))


Best model: tfidf_svm | macro F1: 0.4446113131424601


In [14]:
# --- Train on full training data ---
best_model.fit(train["text"].values, train["CatMis"].values)


0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [15]:
# --- Predict on test and create submission ---
test_pred = best_model.predict(test["text"].values)

sub = pd.DataFrame({
    "Category:Misconception": test_pred
})

# If your competition expects an id column, uncomment and adjust:
# sub.insert(0, "QuestionId", test["QuestionId"].values)

out_path = ROOT_DIR / "submission.csv"
sub.to_csv(out_path, index=False)

out_path, sub.head()


(WindowsPath('C:/Users/USER/Desktop/HJ/NLP/submission.csv'),
     Category:Misconception
 0          True_Neither:NA
 1  False_Misconception:WNB
 2          True_Neither:NA)

In [17]:
# --- (Optional) Save model artifact ---
import joblib

model_path = ROOT_DIR / f"{best_name}.joblib"
joblib.dump(best_model, model_path)

model_path


WindowsPath('C:/Users/USER/Desktop/HJ/NLP/tfidf_svm.joblib')

## Notes / Next improvements
- Try different text templates (e.g., `QuestionText + StudentExplanation` only).
- Add char‑level ngrams or use `HashingVectorizer` for speed.
- Move to a Transformer baseline (e.g., DeBERTa/RoBERTa) once classical baseline is established.
- Consider a **two‑stage baseline**: predict Category first, then Misconception only when Category indicates misconception.
