# Model Evaluation

This notebook assumes the data have been preprocessed by the notebook `notebooks/03-preprocess_data.ipynb`.

Read the processed data.

In [4]:
import pandas as pd

df_train = pd.read_csv("../data/processed/train.csv")
X_train, y_train = df_train["text"], df_train["label"]

df_test = pd.read_csv("../data/processed/test.csv")
X_test, y_test = df_test["text"], df_test["label"]

A helper function for doing cross-validation.

In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def fit_and_evaluate(model, X, y, n_splits=5):
    """Fit and evaluate each model."""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        vectorizer = TfidfVectorizer(min_df=10)
        transformed_X_train = vectorizer.fit_transform(X_train)
        transformed_X_val = vectorizer.transform(X_val)

        model.fit(transformed_X_train, y_train)
        y_pred = model.predict(transformed_X_val)
        auc_scores.append(roc_auc_score(y_val, y_pred))

    auc_scores = np.array(auc_scores)
    metrics = {
        "mean": np.mean(auc_scores),
        "std": np.std(auc_scores),
    }
    return metrics

Run a 5-fold cross-validation on different models.

In [6]:
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

model_dict = {
    "logistic-regression": LogisticRegression(),
    "k-nearest-neighbors": KNeighborsClassifier(),
    "random-forest": RandomForestClassifier(),
    "gradient-boosting-machine": GradientBoostingClassifier(),
    "support-vector-machine": LinearSVC(),
}

performance = {}
for model_name, model_class in model_dict.items():
    performance[model_name] = fit_and_evaluate(model_class, X_train, y_train)
print(json.dumps(performance, indent=2))

{
  "logistic-regression": {
    "mean": 0.7562863309593524,
    "std": 0.019642962156474545
  },
  "k-nearest-neighbors": {
    "mean": 0.6906516560367517,
    "std": 0.022377017060785254
  },
  "random-forest": {
    "mean": 0.7232899886800905,
    "std": 0.029347233658478362
  },
  "gradient-boosting-machine": {
    "mean": 0.7017718608251133,
    "std": 0.025234778387473736
  },
  "support-vector-machine": {
    "mean": 0.7236191172970615,
    "std": 0.03117893968722548
  }
}


Use the best model on the full training set and evaluate on the test set.

In [7]:
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(min_df=10), LogisticRegression())

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

roc_auc_score(y_test, y_pred)

0.6986111111111112

Save the model

In [8]:
import pickle

with open("../artifacts/model.pickle", "wb") as f:
    pickle.dump(model, f)

In [9]:
top_n = 25
features = model.steps[0][1].get_feature_names()
coefs = np.squeeze(model.steps[1][1].coef_)

ids = np.argsort(-abs(coefs))[:top_n]
top_n_features = [features[i] for i in ids]
top_n_coefs = coefs[ids]
pd.DataFrame({"feature": top_n_features, "coef": top_n_coefs})

Unnamed: 0,feature,coef
0,paper,-2.474073
1,course,2.191506
2,learn,1.917166
3,research,-1.910634
4,models,-1.838475
5,python,1.819297
6,learning,1.697943
7,papers,-1.693024
8,start,1.662301
9,regression,1.500483
