# STAT 441: Final Project

Author: Jessica Lu

Dataset: https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data

This dataset comes courtesy of the UC Irvine Machine Learning Repository.

---

Models:
- Logistic regression
- kNN
- Random Forest
- XGBoost

---


## Preprocessing

We download the dataset from the internet. The dataset comes in both a numeric-only and a numeric+categorical format. The data file is in an unconventional format, so we convert both to .csv files. We add an id column to each .csv file.

Instead of getting the dataset like this:

```
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 
```

I have downloaded the data file from the website and written it to a CSV file. I chose meaningful feature names, as opposed to "Attribute#". I also added an id column. The dataset is assumed to be located at the relative path `./datasets/german.csv`.


In [None]:
import numpy as np
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Single seed for reproducibility; passed as random_state to the train/test
# split and to the logistic regression, random forest and XGBoost estimators.
SEED = 42
# Also seed NumPy's global random number generator.
np.random.seed(SEED)

## Pre-preprocessing (shared code for all models)

In [None]:
# Load the German credit data (expected at ./datasets/german.csv).
df = pd.read_csv("./datasets/german.csv")

# Features: every column except the target and the row identifier.
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
X = df.drop(["credit_risk", "id"], axis=1)

# Target: recode credit_risk from 1/2 to 0/1 (good/bad).
risk_recode = {1: 0, 2: 1}
y = df["credit_risk"].map(risk_recode)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

## Exploratory data analysis

In [None]:
# Check the class balance of the target in both splits.
print(y_train.value_counts())   # 560/240
print(y_test.value_counts())    # 140/60

# NOTE(review): 560/240 is a 70/30 split, i.e. the classes are moderately
# imbalanced. A model that always predicts the majority class already reaches
# 0.70 accuracy, so accuracy should be read together with recall, F1 and cost
# rather than on its own.

## Shared code for all models

In [None]:
# Number of rows in the full feature table (1000 instances).
n = len(X)

# Column groups that ColumnTransformer() routes to separate preprocessing steps.
categorical_features = X.select_dtypes(exclude="number").columns
numeric_features = X.select_dtypes(include="number").columns

In [None]:
## output best params found in hyperparameter tuning
# use cost matrix provided from https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data
def cost_score(y_true, y_pred):
    """
    Total misclassification cost under the German credit cost matrix.

    Labels must use this notebook's encoding: 0 = good, 1 = bad. The matrix
    is indexed as cost_matrix[actual, predicted]; per the dataset
    documentation, predicting "good" for a customer who is actually bad
    costs 5, predicting "bad" for a good customer costs 1, and correct
    predictions cost 0.

    Parameters:
    - y_true: array-like of actual labels (0/1)
    - y_pred: array-like of predicted labels (0/1)

    Returns:
    - int, the summed cost over all samples (0 for empty input)

    Raises:
    - ValueError if any label is outside {0, 1}
    """
    cost_matrix = np.array([[0, 1],
                            [5, 0]])
    actual = np.asarray(y_true).astype(int).ravel()
    predicted = np.asarray(y_pred).astype(int).ravel()

    # Reject other encodings explicitly: a negative or shifted index would
    # otherwise wrap around silently and pick the wrong matrix cell.
    for labels in (actual, predicted):
        if labels.size and (labels.min() < 0 or labels.max() > 1):
            raise ValueError(
                "cost_score expects labels encoded as 0 (good) / 1 (bad)"
            )

    # One fancy-indexing lookup replaces the per-sample Python loop.
    return int(cost_matrix[actual, predicted].sum())

# Metric names reported for all four models; each name is a key of `scoring`.
metrics = ["cost", "accuracy", "f1", "precision", "recall", "roc_auc"]

# Scorers handed to every GridSearchCV. "cost" is a loss, so it is registered
# with greater_is_better=False.
scoring = dict(
    accuracy="accuracy",
    f1=make_scorer(f1_score, zero_division=0),
    precision=make_scorer(precision_score, zero_division=0),
    recall=make_scorer(recall_score, zero_division=0),
    roc_auc="roc_auc",
    cost=make_scorer(cost_score, greater_is_better=False),
)

# wrapper function to print best parameter set for each metric.
#   called after training
def print_best_params_by_metric(cv_results, metrics, model_name="model"):
    """
    Print the best hyperparameter set and its mean CV score for each metric.

    Parameters:
    - cv_results: mapping laid out like GridSearchCV.cv_results_; it must
      hold a "mean_test_<metric>" array for every requested metric and one
      "param_model__<name>" array per tuned parameter
    - metrics: iterable of metric names to report
    - model_name: string for labeling output

    The 'cost' scorer is created with greater_is_better=False, so its stored
    scores are negated costs; the sign is flipped back before printing.
    Candidates whose score is NaN are ignored, and parameters that are masked
    for the winning candidate (they belong to a different sub-grid) are left
    out of the printed parameter set.
    """
    print(f"For {model_name}:")
    for metric in metrics:
        mean_scores = np.asarray(cv_results[f"mean_test_{metric}"], dtype=float)

        # Nothing to rank if there are no scores or all of them are NaN.
        if np.isnan(mean_scores).all():
            print(f"Best CV {metric}: no valid scores")
            print()
            continue

        # nanargmax, because plain argmax treats NaN as the maximum.
        best_index = int(np.nanargmax(mean_scores))

        best_params = {}
        for key, values in cv_results.items():
            if not key.startswith("param_model__"):
                continue
            value = values[best_index]
            # Masked entry: this parameter was not part of the candidate.
            if value is np.ma.masked:
                continue
            best_params[key.replace("param_model__", "")] = value

        # raw CV score (for cost, this is negative)
        best_score = mean_scores[best_index]

        print(f"Best params for {metric}: {best_params}")
        if metric == "cost":
            # flip sign for cost
            corrected = -best_score
            print(f"Best CV {metric}: {corrected}  (corrected from {best_score})")
        else:
            print(f"Best CV {metric}: {best_score}")
        print()
    print("------")

In [None]:
## output test metrics for best fit model
def print_test_metrics(model, model_name="model"):
    """
    Report held-out test metrics for a fitted search object, one
    "Test <metric>: <value>" line per metric.

    Parameters:
    - model: fitted object exposing best_params_, predict and predict_proba
    - model_name: string for labeling output

    Evaluates on the module-level X_test / y_test.
    """
    best_params = model.best_params_

    predictions = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    test_scores = {"accuracy": accuracy_score(y_test, predictions)}
    # These three share a call signature, including zero_division=0.
    for label, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        test_scores[label] = scorer(y_test, predictions, zero_division=0)
    test_scores["roc_auc"] = roc_auc_score(y_test, positive_scores)
    test_scores["cost"] = cost_score(y_test, predictions)

    print(f"Best parameters for {model_name}: {best_params}")
    for label, value in test_scores.items():
        print(f"Test {label}: {value}")
    print("------")

## Logistic regression

np.logspace(-4,4,20) = (array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
        4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
        2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
        1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
        5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),)

In [None]:
## preprocessing
# thanks to: https://www.youtube.com/watch?v=tIO8zPCdi58
# thanks to: https://www.geeksforgeeks.org/machine-learning/how-to-optimize-logistic-regression-performance/

# Standardize numeric columns; one-hot encode categorical columns with drop="first".
lr_numeric_step = ("num", StandardScaler(), numeric_features)
lr_categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_features)
preprocess_lr = ColumnTransformer(
    transformers=[lr_numeric_step, lr_categorical_step]
)

lr_estimator = LogisticRegression(class_weight="balanced", random_state=SEED)
pipeline_lr = Pipeline([
    ("preprocess", preprocess_lr),
    ("model", lr_estimator),
])

# "The main hyperparameters we may tune in logistic regression are: solver, penalty, and regularization strength (sklearn documentation)."
#   from: https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69
# "Logistic regression does not really have any critical hyperparameters to tune."
#   from: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

# Value ranges shared by all three penalty families.
lr_C_grid = np.logspace(-4, 4, 20)
lr_max_iter_grid = [100, 1000, 2500, 5000]

# One sub-grid per penalty, each listing only the solvers paired with it here.
param_grid_lr = [
    {
        "model__penalty": ["l1"],
        "model__solver": ["liblinear", "saga"],
        "model__C": lr_C_grid,
        "model__max_iter": lr_max_iter_grid,
    },
    {
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
        "model__C": lr_C_grid,
        "model__max_iter": lr_max_iter_grid,
    },
    {
        "model__penalty": ["elasticnet"],
        "model__solver": ["saga"],
        "model__C": lr_C_grid,
        "model__l1_ratio": [0.25, 0.5, 0.75],
        "model__max_iter": lr_max_iter_grid,
    },
]

# 5-fold grid search; the final refit uses the candidate that is best on "cost".
grid_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    scoring=scoring,
    refit="cost",
    cv=5,
    verbose=1,
)

grid_lr.fit(X_train, y_train)

# Fitting 5 folds for each of 480 candidates, totalling 2400 fits
# ran for like 4m...

## kNN

In [None]:
## preprocessing
# thanks to: https://medium.com/@agrawalsam1997/hyperparameter-tuning-of-knn-classifier-a32f31af25c7
#   for this: np.arange(2, 30, 1)

# Standardize numeric columns and one-hot encode categorical columns.
# drop="first" is intentionally not used here.
knn_numeric_step = ("num", StandardScaler(), numeric_features)
knn_categorical_step = ("cat", OneHotEncoder(), categorical_features)
preprocess_knn = ColumnTransformer(
    transformers=[knn_numeric_step, knn_categorical_step]
)

pipeline_knn = Pipeline([
    ("preprocess", preprocess_knn),
    ("model", KNeighborsClassifier()),
])

# notice that n=1000 and sqrt(1000) ~= 31; the grid tries k = 2..29
knn_neighbor_grid = np.arange(2, 30, 1)
param_grid_knn = {
    "model__n_neighbors": knn_neighbor_grid,
    "model__weights": ["uniform", "distance"],
}

# 5-fold grid search; the final refit uses the candidate that is best on "cost".
grid_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    scoring=scoring,
    refit="cost",
    cv=5,
    verbose=1,
)

grid_knn.fit(X_train, y_train)

# ran for 14.1s very fast

In [None]:
# Best params for accuracy: {'n_neighbors': np.int64(10), 'weights': 'distance'}
# Best CV accuracy: 0.7425
# ------
# Best params for f1: {'n_neighbors': np.int64(3), 'weights': 'uniform'}
# Best CV f1: 0.45962787039846775
# ------
# Best params for precision: {'n_neighbors': np.int64(25), 'weights': 'distance'}
# Best CV precision: 0.8350000000000002
# ------
# Best params for recall: {'n_neighbors': np.int64(2), 'weights': 'distance'}
# Best CV recall: 0.39583333333333337
# ------
# Best params for roc_auc: {'n_neighbors': np.int64(24), 'weights': 'distance'}
# Best CV roc_auc: 0.75859375
# ------

## Random Forest

In [None]:
## preprocessing
# thanks to: https://medium.com/@kalpit.sharma/mastering-random-forest-hyperparameter-tuning-for-enhanced-machine-learning-models-2d1a8c6c426f

# no need to scale numerical features: they are passed through unchanged,
# and only the categorical columns are one-hot encoded
rf_numeric_step = ("num", "passthrough", numeric_features)
rf_categorical_step = ("cat", OneHotEncoder(), categorical_features)
preprocess_rf = ColumnTransformer(
    transformers=[rf_numeric_step, rf_categorical_step]
)

rf_estimator = RandomForestClassifier(random_state=SEED)
pipeline_rf = Pipeline([
    ("preprocess", preprocess_rf),
    ("model", rf_estimator),
])

# notice that n=1000 and sqrt(1000) ~= 31, there are 20 features.
param_grid_rf = {
    "model__n_estimators": [50, 100, 150],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"],
}

# 5-fold grid search; the final refit uses the candidate that is best on "cost".
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring=scoring,
    refit="cost",
    cv=5,
    verbose=1,
)

grid_rf.fit(X_train, y_train)

# Fitting 5 folds for each of 432 candidates, totalling 2160 fits
# ran for 11m 49.6s

## XGBoost

Docs:
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [None]:
## preprocessing
# thanks to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
#       https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html
#       https://xgboost.readthedocs.io/en/stable/parameter.html 
#       https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#h-learning-task-parameters

# no need to scale numerical features: they are passed through unchanged,
# and only the categorical columns are one-hot encoded
xgb_numeric_step = ("num", "passthrough", numeric_features)
xgb_categorical_step = ("cat", OneHotEncoder(), categorical_features)
preprocess_xgb = ColumnTransformer(
    transformers=[xgb_numeric_step, xgb_categorical_step]
)

xgb_estimator = XGBClassifier(random_state=SEED)
pipeline_xgb = Pipeline([
    ("preprocess", preprocess_xgb),
    ("model", xgb_estimator),
])

# Six tuned parameters with three values each.
param_grid_xgb = {
    "model__n_estimators": [50, 100, 150],
    "model__gamma": [0, 0.1, 0.2],
    "model__max_depth": [3, 5, 7],
    "model__min_child_weight": [1, 3, 5],
    "model__subsample": [0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 1.0],
}

# 5-fold grid search; the final refit uses the candidate that is best on "cost".
grid_xgb = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid_xgb,
    scoring=scoring,
    refit="cost",
    cv=5,
    verbose=2,
)

grid_xgb.fit(X_train, y_train)

# Fitting 5 folds for each of 729 candidates, totalling 3645 fits
# ran for 6m 6.5s

In [None]:
# Best params for accuracy: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV accuracy: 0.7700000000000001
# ------
# Best params for f1: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV f1: 0.573550385273438
# ------
# Best params for precision: {'colsample_bytree': np.float64(0.7), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(1.0)}
# Best CV precision: 0.6433135949248482
# ------
# Best params for recall: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(5), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(100), 'subsample': np.float64(0.8)}
# Best CV recall: 0.5333333333333333
# ------
# Best params for roc_auc: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV roc_auc: 0.7905133928571428
# ------

## Save models

In [None]:
# Persist every fitted grid search as models/<name>.pkl.
os.makedirs("models", exist_ok=True)

# File stem -> fitted GridSearchCV object.
models = dict(
    logistic_regression=grid_lr,
    knn=grid_knn,
    random_forest=grid_rf,
    xgboost=grid_xgb,
)

for name, model in models.items():
    path = f"models/{name}.pkl"
    joblib.dump(model, path)
    print(f"Saved {name} to {path}")

## Evaluate models

In [None]:
## output best parameters for each model
for search, label in (
    (grid_lr, "Logistic Regression"),
    (grid_knn, "kNN"),
    (grid_rf, "Random Forest"),
    (grid_xgb, "XGBoost"),
):
    print_best_params_by_metric(search.cv_results_, metrics, label)

In [None]:
# For Logistic Regression:
# Best params for cost: {'C': np.float64(0.00026366508987303583), 'max_iter': np.int64(100), 'penalty': 'l2', 'solver': 'liblinear', 'l1_ratio': masked}
# Best CV cost: 47.8  (corrected from -47.8)

# Best params for accuracy: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'penalty': 'l2', 'solver': 'lbfgs', 'l1_ratio': masked}
# Best CV accuracy: 0.75125

# Best params for f1: {'C': np.float64(4.281332398719396), 'max_iter': np.int64(1000), 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': np.float64(0.75)}
# Best CV f1: 0.5278343535463698

# Best params for precision: {'C': np.float64(0.004832930238571752), 'max_iter': np.int64(100), 'penalty': 'l2', 'solver': 'liblinear', 'l1_ratio': masked}
# Best CV precision: 0.7171428571428571

# Best params for recall: {'C': np.float64(78.47599703514607), 'max_iter': np.int64(100), 'penalty': 'l2', 'solver': 'newton-cg', 'l1_ratio': masked}
# Best CV recall: 0.4791666666666667

# Best params for roc_auc: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'penalty': 'l2', 'solver': 'sag', 'l1_ratio': masked}
# Best CV roc_auc: 0.7853050595238096

# ------
# For kNN:
# Best params for cost: {'n_neighbors': np.int64(25), 'weights': 'distance'}
# Best CV cost: 47.8  (corrected from -47.8)

# Best params for accuracy: {'n_neighbors': np.int64(10), 'weights': 'distance'}
# Best CV accuracy: 0.7425

# Best params for f1: {'n_neighbors': np.int64(3), 'weights': 'uniform'}
# Best CV f1: 0.45962787039846775

# Best params for precision: {'n_neighbors': np.int64(25), 'weights': 'distance'}
# Best CV precision: 0.8350000000000002

# Best params for recall: {'n_neighbors': np.int64(2), 'weights': 'distance'}
# Best CV recall: 0.39583333333333337

# Best params for roc_auc: {'n_neighbors': np.int64(24), 'weights': 'distance'}
# Best CV roc_auc: 0.75859375

# ------
# For Random Forest:
# Best params for cost: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(100)}
# Best CV cost: 55.2  (corrected from -55.2)

# Best params for accuracy: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
# Best CV accuracy: 0.7775000000000001

# Best params for f1: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
# Best CV f1: 0.5223626341081272

# Best params for precision: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(100)}
# Best CV precision: 0.8001082251082252

# Best params for recall: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
# Best CV recall: 0.4083333333333334

# Best params for roc_auc: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': np.int64(2), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(150)}
# Best CV roc_auc: 0.7994419642857143

# ------
# For XGBoost:
# Best params for cost: {'colsample_bytree': np.float64(0.7), 'gamma': np.float64(0.0), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV cost: 90.4  (corrected from -90.4)

# Best params for accuracy: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV accuracy: 0.7700000000000001

# Best params for f1: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV f1: 0.573550385273438

# Best params for precision: {'colsample_bytree': np.float64(0.7), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(1.0)}
# Best CV precision: 0.6433135949248482

# Best params for recall: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(5), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(100), 'subsample': np.float64(0.8)}
# Best CV recall: 0.5333333333333333

# Best params for roc_auc: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV roc_auc: 0.7905133928571428

# ------

In [None]:
# Each grid search was refit on its best "cost" candidate (refit="cost");
# print the test-set metrics of that refit model, one model at a time.
for search, label in (
    (grid_lr, "Logistic Regression"),
    (grid_knn, "kNN"),
    (grid_rf, "Random Forest"),
    (grid_xgb, "XGBoost"),
):
    print_test_metrics(search, label)

In [None]:
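# fit with cost
# NOTE(review): the costs recorded below are consistent with a missed bad
# customer being charged 1 instead of 5 (Logistic Regression has recall 0.0
# over the 60 bad test cases and a cost of exactly 60). That is what
# cost_score's `t-1` / `p-1` indexing produces on 0/1 labels: the two
# penalties are swapped. Treat these figures, and the "best for cost"
# parameter choices, as stale until the search is re-run with the cost
# matrix indexed correctly.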

# Best parameters for Logistic Regression: {'model__C': np.float64(0.00026366508987303583), 'model__max_iter': 100, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
# Test accuracy: 0.7
# Test f1: 0.0
# Test precision: 0.0
# Test recall: 0.0
# Test roc_auc: 0.7448809523809523
# Test cost: 60
# ------
# Best parameters for kNN: {'model__n_neighbors': np.int64(25), 'model__weights': 'distance'}
# Test accuracy: 0.76
# Test f1: 0.4146341463414634
# Test precision: 0.7727272727272727
# Test recall: 0.2833333333333333
# Test roc_auc: 0.7644047619047618
# Test cost: 68
# ------
# Best parameters for Random Forest: {'model__criterion': 'gini', 'model__max_depth': 10, 'model__max_features': 'log2', 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 100}
# Test accuracy: 0.76
# Test f1: 0.4666666666666667
# Test precision: 0.7
# Test recall: 0.35
# Test roc_auc: 0.8003571428571428
# Test cost: 84
# ------
# Best parameters for XGBoost: {'model__colsample_bytree': 0.7, 'model__gamma': 0, 'model__max_depth': 3, 'model__min_child_weight': 1, 'model__n_estimators': 50, 'model__subsample': 0.7}
# Test accuracy: 0.765
# Test f1: 0.5765765765765766
# Test precision: 0.6274509803921569
# Test recall: 0.5333333333333333
# Test roc_auc: 0.7971428571428572
# Test cost: 123
# ------
