# Model Evaluation

This notebook assumes the data have been preprocessed by the notebook `notebooks/03-preprocess_data.ipynb`.

Read the processed data.

In [25]:
import pandas as pd

# Training split: the "text" column is used as the model input and the
# "label" column as the target.
df_train = pd.read_csv("../data/processed/train.csv")
X_train = df_train["text"]
y_train = df_train["label"]

# Test split, read from its own file with the same two columns.
df_test = pd.read_csv("../data/processed/test.csv")
X_test = df_test["text"]
y_test = df_test["label"]

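Define a helper that runs stratified k-fold cross-validation. The TF-IDF vectorizer is refitted on the training part of each fold, so the validation part never influences the vocabulary or the IDF weights.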

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

def _take_rows(data, positions):
    """Select entries of ``data`` by integer position, not by index label."""
    # Fold indices produced by the splitter are positions. Plain
    # ``series[positions]`` on a pandas Series is a *label* lookup, which only
    # coincides with positions when the index is exactly 0..n-1.
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def fit_and_evaluate(model, X, y, n_splits=5, min_df=10, random_state=42):
    """Score ``model`` on raw text using stratified k-fold cross-validation.

    For every fold a new ``TfidfVectorizer`` is fitted on the training part
    only and then applied to the validation part, so the validation documents
    never contribute to the vocabulary or IDF weights.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so after the call it is left fitted on the last
        fold's training part.
    X
        Documents to vectorize. A pandas Series (any index) or an object that
        supports indexing with an array of integer positions.
    y
        Labels aligned with ``X``; same container requirements as ``X``.
        Precision, recall and F1 are computed with the scorers' default
        arguments.
    n_splits : int, default 5
        Number of folds.
    min_df : int or float, default 10
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    random_state : int, default 42
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and ``"F1"``, each
        mapped to the mean of that metric over the folds.
    """
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
    }
    fold_scores = {name: [] for name in scorers}

    splitter = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    for fit_pos, val_pos in splitter.split(X, y):
        X_fit, y_fit = _take_rows(X, fit_pos), _take_rows(y, fit_pos)
        X_val, y_val = _take_rows(X, val_pos), _take_rows(y, val_pos)

        # Fit the vectorizer inside the fold to avoid leaking validation text.
        vectorizer = TfidfVectorizer(min_df=min_df)
        features_fit = vectorizer.fit_transform(X_fit)
        features_val = vectorizer.transform(X_val)

        model.fit(features_fit, y_fit)
        y_pred = model.predict(features_val)

        for name, scorer in scorers.items():
            fold_scores[name].append(scorer(y_val, y_pred))

    return {name: sum(values) / len(values) for name, values in fold_scores.items()}

Run 5-fold cross-validation on several candidate models, using the training set only.

In [30]:
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Seed shared by every estimator that uses randomness internally, so that a
# fresh "Restart & Run All" reproduces the same comparison.
RANDOM_STATE = 42

# Candidate classifiers, all otherwise at their default hyper-parameters.
# The values are estimator instances (not classes).
model_dict = {
    "logistic-regression": LogisticRegression(),
    "k-nearest-neighbors": KNeighborsClassifier(),
    "random-forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "gradient-boosting-machine": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "support-vector-machine": LinearSVC(random_state=RANDOM_STATE),
}

# Cross-validate each candidate on the training data only; the test set is
# not touched during model selection.
performance = {}
for model_name, estimator in model_dict.items():
    performance[model_name] = fit_and_evaluate(estimator, X_train, y_train)
print(json.dumps(performance, indent=2))

{
  "logistic-regression": {
    "Accuracy": 0.7563828164717845,
    "Precision": 0.7727123573719143,
    "Recall": 0.6635658914728683,
    "F1": 0.7137579476645841
  },
  "k-nearest-neighbors": {
    "Accuracy": 0.6994407727503813,
    "Precision": 0.6914216134580616,
    "Recall": 0.6198522286821706,
    "F1": 0.6530055670151128
  },
  "random-forest": {
    "Accuracy": 0.7336019318759532,
    "Precision": 0.7391082341143106,
    "Recall": 0.6526768410852714,
    "F1": 0.6917812418647439
  },
  "gradient-boosting-machine": {
    "Accuracy": 0.7165124555160143,
    "Precision": 0.7093055915651197,
    "Recall": 0.6449006782945736,
    "F1": 0.6754061804363788
  },
  "support-vector-machine": {
    "Accuracy": 0.7314438230808337,
    "Precision": 0.7149764844988908,
    "Recall": 0.6901041666666666,
    "F1": 0.7015886067955033
  }
}


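Fit the best model (logistic regression, which has the highest cross-validated accuracy and F1 in the results above) on the full training set and evaluate it on the test set.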

In [33]:
from sklearn.pipeline import make_pipeline

# Bundle the vectorizer and the classifier into one pipeline so the fitted
# object accepts raw text directly, then train it on the whole training set.
model = make_pipeline(TfidfVectorizer(min_df=10), LogisticRegression()).fit(
    X_train, y_train
)
y_pred = model.predict(X_test)

# Score the test-set predictions with the same four metrics used during
# cross-validation.
performance = {
    metric_name: scorer(y_test, y_pred)
    for metric_name, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
}
print(json.dumps(performance, indent=2))

{
  "Accuracy": 0.7770700636942676,
  "Precision": 0.7846153846153846,
  "Recall": 0.7083333333333334,
  "F1": 0.7445255474452555
}


In [34]:
import pickle

from pathlib import Path

# Persist the fitted pipeline (vectorizer + classifier) for later use.
model_path = Path("../artifacts/model.pickle")
# The artifacts directory may not exist yet (e.g. on a fresh checkout);
# create it so the write below does not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as f:
    pickle.dump(model, f)