# 03 - Models

Models:
- Logistic regression
- kNN
- Random Forest
- XGBoost
- Categorical Naive Bayes
- SVM
- Neural Network

In [None]:
import numpy as np
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score


# Single seed reused below as random_state for the train/test split and the seeded estimators.
SEED = 42
# Also seed NumPy's legacy global RNG.
np.random.seed(SEED)

In [None]:
# Read the processed dataset and carve out a stratified 80/20 hold-out split
df = pd.read_csv("./data/processed/german.csv")

target_column = "credit_risk"

# "id" is dropped together with the target so it is not used as a feature
X = df.drop(columns=[target_column, "id"])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

## Shared code for all models

In [None]:
# Column groups handed to each ColumnTransformer below:
# numeric dtypes go to the "num" branch, everything else to the "cat" branch
numeric_features = X.select_dtypes(include=["number"]).columns
categorical_features = X.select_dtypes(exclude=["number"]).columns

In [None]:
# One shared set of scorers, passed as `scoring=` to every GridSearchCV below
metrics = ["accuracy", "f1", "precision", "recall", "roc_auc"]

# these three are wrapped with make_scorer so zero_division=0 is forwarded to the metric;
# every other metric is referenced by its built-in scorer name
zero_division_metrics = {
    "f1": f1_score,
    "precision": precision_score,
    "recall": recall_score,
}

scoring = {
    name: (
        make_scorer(zero_division_metrics[name], zero_division=0)
        if name in zero_division_metrics
        else name
    )
    for name in metrics
}

## Logistic regression

np.logspace(-4,4,20) = (array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
        4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
        2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
        1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
        5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),)

In [None]:
## preprocessing
# thanks to: https://www.youtube.com/watch?v=tIO8zPCdi58
# thanks to: https://www.geeksforgeeks.org/machine-learning/how-to-optimize-logistic-regression-performance/

# standardise numeric columns; one-hot encode categoricals, dropping the first level of each
lr_transformers = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features),
]
preprocess_lr = ColumnTransformer(transformers=lr_transformers)

pipeline_lr = Pipeline(
    steps=[
        ("preprocess", preprocess_lr),
        ("model", LogisticRegression(random_state=SEED)),
    ]
)

# "The main hyperparameters we may tune in logistic regression are: solver, penalty, and regularization strength (sklearn documentation)."
#   from: https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69
# "Logistic regression does not really have any critical hyperparameters to tune."
#   from: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

# values shared by all three penalty sub-grids
lr_C_values = np.logspace(-4, 4, 20)
lr_max_iter_values = [100, 1000, 2500, 5000]

# one sub-grid per penalty, each listing only the solvers paired with that penalty
param_grid_lr = [
    {
        "model__penalty": ["l1"],
        "model__solver": ["liblinear", "saga"],
        "model__C": lr_C_values,
        "model__max_iter": lr_max_iter_values,
    },
    {
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
        "model__C": lr_C_values,
        "model__max_iter": lr_max_iter_values,
    },
    {
        "model__penalty": ["elasticnet"],
        "model__solver": ["saga"],
        "model__C": lr_C_values,
        "model__l1_ratio": [0.25, 0.5, 0.75],
        "model__max_iter": lr_max_iter_values,
    },
]

# 5-fold CV over every candidate; the winner by accuracy is refit on the full training split
grid_lr = GridSearchCV(
    pipeline_lr,
    param_grid_lr,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=2,
)

grid_lr.fit(X_train, y_train)

# recorded run: "Fitting 5 folds for each of 480 candidates, totalling 2400 fits", took ~4m
# NOTE(review): the grid as written expands to 160 + 400 + 240 = 800 candidates,
# so the recorded count probably comes from an earlier version of the grid

## kNN

In [None]:
## preprocessing
# thanks to: https://medium.com/@agrawalsam1997/hyperparameter-tuning-of-knn-classifier-a32f31af25c7
#   for this: np.arange(2, 30, 1)

# every one-hot column is kept on purpose here (no drop="first")
knn_transformers = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
]
preprocess_knn = ColumnTransformer(transformers=knn_transformers)

pipeline_knn = Pipeline(
    steps=[
        ("preprocess", preprocess_knn),
        ("model", KNeighborsClassifier()),
    ]
)

# notice that n=1000 and sqrt(1000) ~= 31
# k runs over 2..29, each tried with uniform and distance weighting
param_grid_knn = dict(
    model__n_neighbors=np.arange(2, 30),
    model__weights=["uniform", "distance"],
)

grid_knn = GridSearchCV(
    pipeline_knn,
    param_grid_knn,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=1,
)

grid_knn.fit(X_train, y_train)

# recorded run time: 14.1s, very fast

## Random Forest

In [None]:
## preprocessing
# thanks to: https://medium.com/@kalpit.sharma/mastering-random-forest-hyperparameter-tuning-for-enhanced-machine-learning-models-2d1a8c6c426f

# numeric columns go through untouched (no scaling); categoricals are one-hot encoded
rf_transformers = [
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
]
preprocess_rf = ColumnTransformer(transformers=rf_transformers)

pipeline_rf = Pipeline(
    steps=[
        ("preprocess", preprocess_rf),
        ("model", RandomForestClassifier(random_state=SEED)),
    ]
)

# notice that n=1000 and sqrt(1000) ~= 31, there are 20 features.
param_grid_rf = dict(
    model__n_estimators=[50, 100, 150],
    model__criterion=["gini", "entropy"],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=["sqrt", "log2"],
)

grid_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=1,
)

grid_rf.fit(X_train, y_train)

# recorded run: "Fitting 5 folds for each of 432 candidates, totalling 2160 fits"
# took 11m 49.6s

## XGBoost

Docs:
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [None]:
## preprocessing
# thanks to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
#       https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html
#       https://xgboost.readthedocs.io/en/stable/parameter.html 
#       https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#h-learning-task-parameters

# numeric columns go through untouched (no scaling); categoricals are one-hot encoded
xgb_transformers = [
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
]
preprocess_xgb = ColumnTransformer(transformers=xgb_transformers)

pipeline_xgb = Pipeline(
    steps=[
        ("preprocess", preprocess_xgb),
        ("model", XGBClassifier(random_state=SEED)),
    ]
)

# six parameters with three values each -> 3**6 = 729 candidates
param_grid_xgb = dict(
    model__n_estimators=[50, 100, 150],
    model__gamma=[0, 0.1, 0.2],
    model__max_depth=[3, 5, 7],
    model__min_child_weight=[1, 3, 5],
    model__subsample=[0.7, 0.8, 1.0],
    model__colsample_bytree=[0.7, 0.8, 1.0],
)

grid_xgb = GridSearchCV(
    pipeline_xgb,
    param_grid_xgb,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=2,
)

grid_xgb.fit(X_train, y_train)

# recorded run: "Fitting 5 folds for each of 729 candidates, totalling 3645 fits"
# took 6m 6.5s

## Naive Bayes

Our data mixes categorical and numerical features, so neither GaussianNB nor CategoricalNB can be applied correctly out of the box. Because the dataset has more categorical covariates than numerical ones (13 > 7), we use CategoricalNB and encode the features: numerical columns are binned into discrete categories and categorical columns are ordinal-encoded.

- with help from https://scikit-learn.org/stable/modules/naive_bayes.html

In [None]:
# bin numeric features into discrete categories (5 quantile bins, encoded as 0..4)
# use OrdinalEncoder() for categorical bc CategoricalNB() doesn't consider order
preprocess_nb = ColumnTransformer(
    transformers=[
        ('num', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile'), numeric_features),
        ('cat', OrdinalEncoder(), categorical_features)
    ]
)

pipeline_nb = Pipeline([
    ('preprocess', preprocess_nb),
    ('model', CategoricalNB())
])

# alpha is the additive smoothing strength.
# min_categories: None means "no enforced minimum" (category counts are taken from
# the training data). An integer must be >= 1; the previous value 0 is rejected by
# recent scikit-learn parameter validation, so every candidate using it would fail
# and be scored NaN. None expresses the same intent and is valid.
param_grid_nb = {
    'model__alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
    'model__min_categories': [None, 1, 2, 3, 4, 5]
}

# 5-fold CV over all 36 candidates; the best by accuracy is refit on the full training split
grid_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=param_grid_nb,
    cv=5,
    scoring=scoring,
    refit="accuracy",
    verbose=1
)

grid_nb.fit(X_train, y_train)

## SVM

In [None]:
# SVMs are sensitive to feature scale, so numeric columns are standardised here
# (same treatment as the logistic-regression, kNN and neural-network pipelines)
# instead of being passed through raw.
preprocess_svm = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

pipeline_svm = Pipeline([
    ('preprocess', preprocess_svm),
    ('model', SVC(random_state=SEED))
])

svm_C_values = [0.01, 0.1, 1, 10]
svm_gamma_values = ['scale', 'auto']

# One sub-grid per kernel family so each kernel is only crossed with the
# parameters it actually uses: `degree` only matters for 'poly', and `gamma`
# is ignored by 'linear'. A single flat grid would refit identical models many times.
param_grid_svm = [
    {
        'model__kernel': ['linear'],
        'model__C': svm_C_values,
    },
    {
        'model__kernel': ['poly'],
        'model__C': svm_C_values,
        'model__degree': [2, 3, 4, 5],
        'model__gamma': svm_gamma_values,
    },
    {
        'model__kernel': ['rbf', 'sigmoid'],
        'model__C': svm_C_values,
        'model__gamma': svm_gamma_values,
    },
]

# 5-fold CV over every candidate; the best by accuracy is refit on the full training split
grid_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=param_grid_svm,
    cv=5,
    scoring=scoring,
    refit="accuracy",
    verbose=1
)

grid_svm.fit(X_train, y_train)

## Neural networks

In [None]:
# standardise numeric columns and one-hot encode categoricals before the MLP
preprocess_nn = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

pipeline_nn = Pipeline([
    ('preprocess', preprocess_nn),
    ('model', MLPClassifier(random_state=SEED))
])

# parameters that apply to both solvers
nn_common_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (50,50), (100,50), (100,100)],
    'model__activation': ['relu', 'tanh', 'logistic'],
    'model__alpha': [0.0001, 0.001, 0.01, 0.1],       # L2 regularization
    'model__learning_rate_init': [0.001, 0.01, 0.1]
}

# The `learning_rate` schedule is only used by solver='sgd'. Crossing it with
# 'adam' would fit every adam configuration twice with identical results,
# so it is searched in the sgd sub-grid only.
param_grid_nn = [
    {**nn_common_grid, 'model__solver': ['adam']},
    {
        **nn_common_grid,
        'model__solver': ['sgd'],
        'model__learning_rate': ['constant', 'adaptive'],
    },
]

# 5-fold CV over every candidate; the best by accuracy is refit on the full training split
grid_nn = GridSearchCV(
    estimator=pipeline_nn,
    param_grid=param_grid_nn,
    cv=5,
    scoring=scoring,
    refit="accuracy",
    verbose=1
)

grid_nn.fit(X_train, y_train)

## Save models

In [None]:
# save models
# Create the directory the pickles are actually written to. Previously "models"
# was created while the files went to "saved_models/", so joblib.dump failed
# whenever "saved_models" did not already exist.
model_dir = "saved_models"
os.makedirs(model_dir, exist_ok=True)

# fitted GridSearchCV objects keyed by the file stem used on disk
models = {
    "logistic_regression": grid_lr,
    "knn": grid_knn,
    "random_forest": grid_rf,
    "xgboost": grid_xgb,
    "naive_bayes": grid_nb,
    "svm": grid_svm,
    "neural_network": grid_nn
}

for name, model in models.items():
    path = f"{model_dir}/{name}.pkl"
    joblib.dump(model, path)
    print(f"Saved {name} to {path}")