# STAT 441: Final Project

Author: Jessica Lu

Dataset: https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data

This dataset comes courtesy of the UC Irvine Machine Learning Repository.

---

Models:
- Logistic regression
- kNN
- Random Forest
- XGBoost

---


## Preprocessing

We download the dataset from the internet. The dataset comes in both a numeric-only and a numeric+categorical format. The data file is in an unconventional format, so we convert both to a .csv. We add an id column to the .csv file.

Instead of getting the dataset like this:

```
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 
```

I downloaded the data file from the website and wrote it to a CSV file. I chose meaningful feature names, as opposed to "Attribute#". I also added an id column. The dataset is assumed to be located at the relative path `./datasets/german.csv`.


In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

# Single seed reused for the train/test split and for every estimator that
# is constructed with random_state=SEED.
SEED = 42
# Also seed NumPy's global random number generator.
np.random.seed(SEED)

## Pre-preprocessing (shared code for all models)

In [27]:
# Read the prepared German credit CSV.
df = pd.read_csv("./datasets/german.csv")

# Predictors: every column except the label and the row identifier.
# ColumnTransformer references used for the preprocessing further down:
#   https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
#   https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
X = df.drop(columns=["credit_risk", "id"])

# Target: recode credit_risk from 1/2 to 0/1 (good/bad).
y = df["credit_risk"].map({1: 0, 2: 1})

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

In [None]:
## tmp: eda

# Print both counts explicitly: a notebook cell only echoes its last bare
# expression, so an un-printed first value_counts() would be silently dropped.
print(y_train.value_counts())  # 560/240
print(y_test.value_counts())  # 140/60

# Only mildly imbalanced (70/30), so accuracy is used as the refit metric.
# NOTE(review): with a 560/240 split, always predicting class 0 already scores
# 0.70 accuracy -- read accuracy alongside f1/precision/recall/roc_auc.

credit_risk
0    560
1    240
Name: count, dtype: int64

## Shared code for all models

In [22]:
# Row count of the full dataset (1000 instances).
n = X.shape[0]


# Column groups handed to ColumnTransformer(): numeric dtypes vs. everything else.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns


# One scoring dict shared by all four grid searches. The three threshold-based
# scorers are built with zero_division=0; accuracy and roc_auc use
# scikit-learn's built-in scorer names.
scoring = {
    "accuracy": "accuracy",
    **{
        name: make_scorer(score_fn, zero_division=0)
        for name, score_fn in (
            ("f1", f1_score),
            ("precision", precision_score),
            ("recall", recall_score),
        )
    },
    "roc_auc": "roc_auc",
}

# Metric names in reporting order; taken from the scoring dict so the two
# always agree: ["accuracy", "f1", "precision", "recall", "roc_auc"].
metrics = list(scoring)


# wrapper function to print best parameter set for each metric.
#   called after training
def print_best_params_by_metric(cv_results, metrics):
    """
    Print the best "model__" parameter set and its mean CV score per metric.

    Parameters
    ----------
    cv_results : mapping
        Mapping shaped like ``GridSearchCV.cv_results_``: it must contain a
        ``mean_test_<metric>`` entry for every requested metric, and one
        ``param_model__<name>`` entry per tuned model parameter. Each entry
        is indexable by candidate position.
    metrics : iterable of str
        Metric names to report, in print order.

    Notes
    -----
    Candidates whose mean score is NaN are skipped when choosing the best
    index. A plain ``argmax`` would return the first NaN position and report
    that candidate as the "best" one. If every score for a metric is NaN, the
    first candidate is reported, which is what a plain ``argmax`` would return.
    Only keys starting with ``param_model__`` are shown; parameters of other
    pipeline steps are not printed.
    """
    prefix = "param_model__"
    # Collect the tuned-parameter columns once; they do not depend on the metric.
    param_columns = {
        key[len(prefix):]: values
        for key, values in cv_results.items()
        if key.startswith(prefix)
    }

    for metric in metrics:
        mean_scores = cv_results[f"mean_test_{metric}"]
        # Work on a float array so list-like inputs are accepted too.
        score_array = np.asarray(mean_scores, dtype=float)
        if np.isnan(score_array).all():
            best_index = 0
        else:
            best_index = int(np.nanargmax(score_array))

        best_params = {
            name: values[best_index] for name, values in param_columns.items()
        }
        best_score = mean_scores[best_index]
        print(f"Best params for {metric}: {best_params}")
        print(f"Best CV {metric}: {best_score}")
        print("------")


## Logistic regression

np.logspace(-4,4,20) = (array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
        4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
        2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
        1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
        5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),)

In [23]:
## preprocessing
# thanks to: https://www.youtube.com/watch?v=tIO8zPCdi58
# thanks to: https://www.geeksforgeeks.org/machine-learning/how-to-optimize-logistic-regression-performance/

# Standardize the numeric columns; one-hot encode the categorical columns,
# dropping the first level of each.
preprocess_lr = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features),
])

# Preprocessing and classifier live in one pipeline, so the preprocessing is
# fitted as part of every grid-search fit.
pipeline_lr = Pipeline(steps=[
    ("preprocess", preprocess_lr),
    ("model", LogisticRegression(random_state=SEED)),
])

# "The main hyperparameters we may tune in logistic regression are: solver, penalty, and regularization strength (sklearn documentation)."
#   from: https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69
# "Logistic regression does not really have any critical hyperparameters to tune."
#   from: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
param_grid_lr = {
    # "model__penalty": ["l1", "l2", "elasticnet"],  # got thrown issue for this so
    "model__C": np.logspace(-4, 4, 20),
    "model__solver": [
        "lbfgs",
        "liblinear",
        "newton-cg",
        "newton-cholesky",
        "sag",
        "saga",
    ],
    "model__max_iter": [100, 1000, 2500, 5000],
}

# 5-fold grid search scored on every shared metric; the final refit uses the
# candidate with the best mean CV accuracy.
grid_lr = GridSearchCV(
    pipeline_lr,
    param_grid_lr,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=1,
)

grid_lr.fit(X_train, y_train)

# Fitting 5 folds for each of 480 candidates, totalling 2400 fits
# ran for like 4m...

Fitting 5 folds for each of 480 candidates, totalling 2400 fits




0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__C': array([1.0000...00000000e+04]), 'model__max_iter': [100, 1000, ...], 'model__solver': ['lbfgs', 'liblinear', ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1': make_scorer(f...ro_division=0), 'precision': make_scorer(p...ro_division=0), 'recall': make_scorer(r...ro_division=0), ...}"
,n_jobs,
,refit,'accuracy'
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(0.615848211066026)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [26]:
# Report, for each name in `metrics`, the logistic-regression grid point with
# the highest mean CV score.
cv_results_lr = grid_lr.cv_results_
print_best_params_by_metric(cv_results_lr, metrics)

# Recorded results (kept in sync with the printed output of this cell):
# Best params for accuracy: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'solver': 'lbfgs'}
# Best CV accuracy: 0.75125
# ------
# Best params for f1: {'C': np.float64(78.47599703514607), 'max_iter': np.int64(100), 'solver': 'newton-cg'}
# Best CV f1: 0.5249815201053011
#   (an earlier run recorded {'C': np.float64(10000.0), 'max_iter': np.int64(100), 'solver': 'sag'}
#    with CV f1 0.5252145534249706 here)
# ------
# Best params for precision: {'C': np.float64(0.004832930238571752), 'max_iter': np.int64(100), 'solver': 'liblinear'}
# Best CV precision: 0.7171428571428571
# ------
# Best params for recall: {'C': np.float64(78.47599703514607), 'max_iter': np.int64(100), 'solver': 'newton-cg'}
# Best CV recall: 0.4791666666666667
# ------
# Best params for roc_auc: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'solver': 'newton-cholesky'}
# Best CV roc_auc: 0.7853050595238096
# ------

Best params for accuracy: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'solver': 'lbfgs'}
Best CV accuracy: 0.75125
------
Best params for f1: {'C': np.float64(78.47599703514607), 'max_iter': np.int64(100), 'solver': 'newton-cg'}
Best CV f1: 0.5249815201053011
------
Best params for precision: {'C': np.float64(0.004832930238571752), 'max_iter': np.int64(100), 'solver': 'liblinear'}
Best CV precision: 0.7171428571428571
------
Best params for recall: {'C': np.float64(78.47599703514607), 'max_iter': np.int64(100), 'solver': 'newton-cg'}
Best CV recall: 0.4791666666666667
------
Best params for roc_auc: {'C': np.float64(0.615848211066026), 'max_iter': np.int64(100), 'solver': 'newton-cholesky'}
Best CV roc_auc: 0.7853050595238096
------


## kNN

In [28]:
## preprocessing
# thanks to: https://medium.com/@agrawalsam1997/hyperparameter-tuning-of-knn-classifier-a32f31af25c7
#   for this: np.arange(2, 30, 1)

# Standardize the numeric columns; one-hot encode the categorical columns.
# i purposely don't drop="first"
preprocess_knn = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
])

pipeline_knn = Pipeline(steps=[
    ("preprocess", preprocess_knn),
    ("model", KNeighborsClassifier()),
])

# Candidate k values are 2..29, each tried with both weighting schemes.
# notice that n=1000 and sqrt(1000) ~= 31
param_grid_knn = {
    "model__n_neighbors": np.arange(2, 30, 1),
    "model__weights": ["uniform", "distance"],
}

# 5-fold grid search scored on every shared metric; the final refit uses the
# candidate with the best mean CV accuracy.
grid_knn = GridSearchCV(
    pipeline_knn,
    param_grid_knn,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=1,
)

grid_knn.fit(X_train, y_train)

# ran for 14.1s very fast

Fitting 5 folds for each of 56 candidates, totalling 280 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'model__n_neighbors': array([ 2, 3..., 27, 28, 29]), 'model__weights': ['uniform', 'distance']}"
,scoring,"{'accuracy': 'accuracy', 'f1': make_scorer(f...ro_division=0), 'precision': make_scorer(p...ro_division=0), 'recall': make_scorer(r...ro_division=0), ...}"
,n_jobs,
,refit,'accuracy'
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,np.int64(10)
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [29]:
# Report, for each name in `metrics`, the kNN grid point with the highest
# mean CV score.
cv_results_knn = grid_knn.cv_results_
print_best_params_by_metric(cv_results_knn, metrics)

# Recorded results from the run:
# Best params for accuracy: {'n_neighbors': np.int64(10), 'weights': 'distance'}
# Best CV accuracy: 0.7425
# ------
# Best params for f1: {'n_neighbors': np.int64(3), 'weights': 'uniform'}
# Best CV f1: 0.45962787039846775
# ------
# Best params for precision: {'n_neighbors': np.int64(25), 'weights': 'distance'}
# Best CV precision: 0.8350000000000002
# ------
# Best params for recall: {'n_neighbors': np.int64(2), 'weights': 'distance'}
# Best CV recall: 0.39583333333333337
# ------
# Best params for roc_auc: {'n_neighbors': np.int64(24), 'weights': 'distance'}
# Best CV roc_auc: 0.75859375
# ------

Best params for accuracy: {'n_neighbors': np.int64(10), 'weights': 'distance'}
Best CV accuracy: 0.7425
------
Best params for f1: {'n_neighbors': np.int64(3), 'weights': 'uniform'}
Best CV f1: 0.45962787039846775
------
Best params for precision: {'n_neighbors': np.int64(25), 'weights': 'distance'}
Best CV precision: 0.8350000000000002
------
Best params for recall: {'n_neighbors': np.int64(2), 'weights': 'distance'}
Best CV recall: 0.39583333333333337
------
Best params for roc_auc: {'n_neighbors': np.int64(24), 'weights': 'distance'}
Best CV roc_auc: 0.75859375
------


In [39]:
# Hold-out check for kNN. best_estimator_ is the pipeline that the grid search
# refit with the accuracy-selected parameters (refit="accuracy").
best_model_knn = grid_knn.best_estimator_

# Predict on the held-out test split and score plain accuracy.
y_pred = best_model_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Bare expression so the notebook echoes the value (0.75 in the recorded run).
accuracy

0.75

## Random Forest

In [None]:
## preprocessing
# thanks to: https://medium.com/@kalpit.sharma/mastering-random-forest-hyperparameter-tuning-for-enhanced-machine-learning-models-2d1a8c6c426f

# Numeric columns pass through untouched; categorical columns are one-hot encoded.
# no need to scale numerical features
preprocess_rf = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
])

pipeline_rf = Pipeline(steps=[
    ("preprocess", preprocess_rf),
    ("model", RandomForestClassifier(random_state=SEED)),
])

# notice that n=1000 and sqrt(1000) ~= 31, there are 20 features.
param_grid_rf = {
    "model__n_estimators": [50, 100, 150],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"],
}

# 5-fold grid search scored on every shared metric; the final refit uses the
# candidate with the best mean CV accuracy.
grid_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=1,
)

grid_rf.fit(X_train, y_train)

# Fitting 5 folds for each of 432 candidates, totalling 2160 fits
# ran for 11m 49.6s

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'model__criterion': ['gini', 'entropy'], 'model__max_depth': [None, 10, ...], 'model__max_features': ['sqrt', 'log2'], 'model__min_samples_leaf': [1, 2, ...], ...}"
,scoring,"{'accuracy': 'accuracy', 'f1': make_scorer(f...ro_division=0), 'precision': make_scorer(p...ro_division=0), 'recall': make_scorer(r...ro_division=0), ...}"
,n_jobs,
,refit,'accuracy'
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'entropy'
,max_depth,30
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Report, for each name in `metrics`, the random-forest grid point with the
# highest mean CV score.
cv_results_rf = grid_rf.cv_results_
print_best_params_by_metric(cv_results_rf, metrics)

# Recorded results from the run:
# Best params for accuracy: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
# Best CV accuracy: 0.7775
# ------
# Best params for f1: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
# Best CV f1: 0.5242081346803799
# ------
# Best params for precision: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': np.int64(4), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
# Best CV precision: 0.8228588405058993
# ------
# Best params for recall: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
# Best CV recall: 0.4125
# ------
# Best params for roc_auc: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
# Best CV roc_auc: 0.8017113095238095
# ------

Best params for accuracy: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
Best CV accuracy: 0.7775
------
Best params for f1: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
Best CV f1: 0.5242081346803799
------
Best params for precision: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': np.int64(4), 'min_samples_split': np.int64(5), 'n_estimators': np.int64(50)}
Best CV precision: 0.8228588405058993
------
Best params for recall: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': np.int64(1), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(100)}
Best CV recall: 0.4125
------
Best params for roc_auc: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samp

## XGBoost

Docs:
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [None]:
## preprocessing
# thanks to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
#       https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html
#       https://xgboost.readthedocs.io/en/stable/parameter.html 
#       https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#h-learning-task-parameters

# Numeric columns pass through untouched; categorical columns are one-hot encoded.
# no need to scale numerical features
preprocess_xgb = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
])

pipeline_xgb = Pipeline(steps=[
    ("preprocess", preprocess_xgb),
    ("model", XGBClassifier(random_state=SEED)),
])

# Six parameters with three values each: 3**6 = 729 candidates.
param_grid_xgb = {
    "model__n_estimators": [50, 100, 150],
    "model__gamma": [0, 0.1, 0.2],
    "model__max_depth": [3, 5, 7],
    "model__min_child_weight": [1, 3, 5],
    "model__subsample": [0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 1.0],
}

# 5-fold grid search scored on every shared metric; the final refit uses the
# candidate with the best mean CV accuracy. verbose=2 here, unlike the other
# three searches, which use verbose=1.
grid_xgb = GridSearchCV(
    pipeline_xgb,
    param_grid_xgb,
    scoring=scoring,
    refit="accuracy",
    cv=5,
    verbose=2,
)

grid_xgb.fit(X_train, y_train)

# Fitting 5 folds for each of 729 candidates, totalling 3645 fits
# ran for 6m 6.5s

In [30]:
# Report, for each name in `metrics`, the XGBoost grid point with the highest
# mean CV score.
cv_results_xgb = grid_xgb.cv_results_
print_best_params_by_metric(cv_results_xgb, metrics)

# Recorded results from the run:
# Best params for accuracy: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV accuracy: 0.765
# ------
# Best params for f1: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV f1: 0.5708481307518998
# ------
# Best params for precision: {'colsample_bytree': np.float64(0.7), 'gamma': np.float64(0.0), 'max_depth': np.int64(7), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(150), 'subsample': np.float64(0.7)}
# Best CV precision: 0.6354906231094979
# ------
# Best params for recall: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.8)}
# Best CV recall: 0.5250000000000001
# ------
# Best params for roc_auc: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.2), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
# Best CV roc_auc: 0.79140625
# ------

Best params for accuracy: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
Best CV accuracy: 0.765
------
Best params for f1: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(3), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(50), 'subsample': np.float64(0.7)}
Best CV f1: 0.5708481307518998
------
Best params for precision: {'colsample_bytree': np.float64(0.7), 'gamma': np.float64(0.0), 'max_depth': np.int64(7), 'min_child_weight': np.int64(1), 'n_estimators': np.int64(150), 'subsample': np.float64(0.7)}
Best CV precision: 0.6354906231094979
------
Best params for recall: {'colsample_bytree': np.float64(0.8), 'gamma': np.float64(0.1), 'max_depth': np.int64(7), 'min_child_weight': np.int64(3), 'n_estimators': np.int64(50), 'subsample': np.float64(0.8)}
Best CV recall: 0.5250000000000001
------
Best params fo