# German Credit dataset

## Contents

4. Build a baseline model
5. Prepare the data to better expose the underlying patterns to machine learning algorithm (incl feature engineering)
6. Explore many modesl; Select a model and train it
7. Fine-tune the model
8. Present your solution
9. Deploy, monitor and maintain your system



##### TODO
- Ensemble model?
- Deploy


## The metric: f2

<br>

### Imports

In [68]:
# imports from Python Standard Library
import re, warnings

from collections import Counter

In [69]:
# Third party imports
import numpy as np
import pandas as pd


In [70]:
# sklearn imports
from sklearn.metrics import (accuracy_score, recall_score, precision_score, fbeta_score, roc_auc_score, classification_report)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector


In [71]:
# Custom utilities imports
from src.helper_utilities import load_data
from src.modeling_utilities import Baseline, classification_scores, f2

In [72]:
# Settings
#warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

### Load the data

In [78]:
# Get the (user-friendly) data for a baseline model
df = load_data(mode='analysis', format='dataframe')

# Save the "user friendly" dataframe for EDA as csv
df.to_csv("data/user_friendly_cats.csv", index=False)

# get the data from the saved csv due to the pd quirk with Ctegoricals
df = pd.read_csv("data/user_friendly_cats.csv")
df.head()


Unnamed: 0,tenure,amount,rate,residence,age,credits,maintenance,history,savings,employment,...,status,purpose,guarantor,installments,housing,telephone,foreign,sex,personal,label
0,6,1169,4,4,67,2,1,critical,no savings,"[7, inf)",...,overdrawn,television,none,none,ownership,yes,True,male,male single,0
1,48,5951,2,2,22,1,1,so far so good,"[0, 100)","[1, 4)",...,petty,television,none,none,ownership,none,True,female,female divorced/separated/married,1
2,12,2096,2,3,49,1,2,critical,"[0, 100)","[4, 7)",...,no account,education,none,none,ownership,none,True,male,male single,0
3,42,7882,2,4,45,1,2,so far so good,"[0, 100)","[4, 7)",...,overdrawn,furniture,guarantor,none,without payment,none,True,male,male single,0
4,24,4870,3,4,53,2,2,delay,"[0, 100)","[1, 4)",...,overdrawn,car,none,none,without payment,none,True,male,male single,1


# 4. Baseline model

This baseline model is based on a simple lookup table approach. You can view the code here:
[src/modeling_utilities.py](src/modeling_utilities.py)

In [80]:
# Train Test Split
features = df.copy()
labels = features.pop('label')
Xtrain, Xtest, ytrain, ytest = train_test_split(features, labels, stratify=labels, test_size=0.2, random_state=None)

In [81]:
# This baseline model is based on a simple lookup table approach
baseline = Baseline(best_features=['status', 'history', 'savings'], threshold=0.5)
baseline.fit(Xtrain, ytrain)

In [90]:
# Cross validation F2 score (on the whole dataset; with the default threshold of 0.5)
print("F2 =", cross_val_score(baseline, Xtrain, ytrain, scoring=f2, cv=5).mean().round(2))

F2 = 0.52


In [83]:
# The default threshold of 0.5 givs us the following results on the test set:
ypred = baseline.predict(Xtest)
classification_scores(ytest, ypred)

accuracy     0.72
precision    0.53
recall       0.45
f1           0.49
f2           0.46
dtype: float64

In [89]:
# AUC
ytrue = ytest
yscore = baseline.predict_proba(Xtest)
print("AUC =", roc_auc_score(ytrue, yscore).round(2))

AUC = 0.67


In [88]:
# Hyperparameter grid search: the best model's threshold is 0.125 and has the F2 = 0.71
gs = GridSearchCV(baseline, {'threshold': np.linspace(0.05, 0.2, num=7)}, cv=5, scoring=f2).fit(Xtrain, ytrain)
print("threshold =", gs.best_estimator_.threshold, "\tF2 =", gs.best_score_.round(2))

threshold = 0.2 F2 = 0.69


So, the goal is to beat the F2-score (and possibly the AUC)

<br>

# 5. Data Preprocessing

### Note how the features are ordered in the original dataset

In [12]:
# View the attribute names from the info document
path = 'data/german.doc'

with open(path, mode='r') as file:
    text = file.read()
    
pattern = r"Attr?ibute (?P<attr>\d{1,2}):.+?\n\s+(?P<name>.+?)\n"

print('\033[91m{}\033[0m'.format("Column index,"),  "original feature name and", '\033[92m{}\033[0m'.format("my short name"), end="\n\n")

# make a mapping from the "handy" name to the actual column index
column_index = dict()

for m in re.finditer(pattern, text):
    possible_names = [s for s in df.columns for pattern in (fr"\b{s}\b", fr"\b{s[:-1]}")
                      if re.search(pattern, m.groupdict()['name'], re.IGNORECASE)] or ['tenure']
    my_column_name = Counter(sorted(possible_names)).most_common(1)[0][0]
    print('\033[91m{}\033[0m'.format(int(m.groupdict()['attr'])-1), f"{m.groupdict()['name'].strip()}", '\033[92m({})\033[0m'.format(my_column_name))
    column_index[my_column_name] = int(m.groupdict()['attr'])-1


[91mColumn index,[0m original feature name and [92mmy short name[0m

[91m0[0m Status of existing checking account [92m(status)[0m
[91m1[0m Duration in month [92m(tenure)[0m
[91m2[0m Credit history [92m(history)[0m
[91m3[0m Purpose [92m(purpose)[0m
[91m4[0m Credit amount [92m(amount)[0m
[91m5[0m Savings account/bonds [92m(savings)[0m
[91m6[0m Present employment since [92m(employment)[0m
[91m7[0m Installment rate in percentage of disposable income [92m(rate)[0m
[91m8[0m Personal status and sex [92m(personal)[0m
[91m9[0m Other debtors / guarantors [92m(guarantor)[0m
[91m10[0m Present residence since [92m(residence)[0m
[91m11[0m Property [92m(property)[0m
[91m12[0m Age in years [92m(age)[0m
[91m13[0m Other installment plans [92m(installments)[0m
[91m14[0m Housing [92m(housing)[0m
[91m15[0m Number of existing credits at this bank [92m(credits)[0m
[91m16[0m Job [92m(job)[0m
[91m17[0m Number of people being liable to pr

### Load the original dataset

In [110]:
# load as ndarray
X, y = load_data(mode='modeling', format='ndarray')

# load as df
features, labels = load_data(mode='modeling', format='dataframe')
df_X, sr_y = load_data(mode='modeling', format='dataframe')


Which format to expect as input into our pipeline?

We'll opt for DataFrame because we will want to determine feature data types dynamically with pandas' functionality.

### Train Test Split (stratify=y)

In [111]:
Xtrain, Xtest, ytrain, ytest = train_test_split(features, labels, stratify=labels, test_size=0.2)
Xtest[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
373,A14,60,A34,A40,13756,A65,A75,2,A93,A101,4,A124,63,A141,A153,1,A174,1,A192,A201
150,A14,6,A32,A43,1346,A62,A75,2,A93,A101,4,A124,42,A141,A153,1,A173,2,A192,A201
433,A14,24,A34,A45,2058,A61,A73,4,A91,A101,2,A121,33,A143,A152,2,A173,1,A192,A201


### Determine the features which may be excluded from our model

According to our statistical tests earlier these features are useless: ['residence', 'job', 'credits', 'telephone', 'maintenance']

In [15]:
# sorted bad to worst
weak_features = ['residence', 'job', 'credits', 'telephone', 'maintenance']
features_to_keep = sorted(set(range(X.shape[1])) - set(column_index[k] for k in weak_features))
features_to_keep

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 19]

# Shortlist promissing models

metric: F2 (cross validation on the train set)

In [112]:
# Decision Tree / Random Forest

ct = ColumnTransformer([
        ("num", 'passthrough', make_column_selector(dtype_include=np.number)),
        ("cat", 
        OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'), 
        make_column_selector(dtype_include=['category', object]))
], remainder='passthrough')

cl = DecisionTreeClassifier()  # RF performs even worse here
md = make_pipeline(ct, cl)

cross_validation_scores = cross_val_score(md, features, labels, scoring=f2, cv=5)
print("F2:", cross_validation_scores.round(2), "\tmean:", cross_validation_scores.mean().round(2), "\tstd:", cross_validation_scores.std().round(4))


F2: [0.52 0.54 0.45 0.55 0.43] 	mean: 0.5 	std: 0.0501


In [114]:
# Gradient Boosting
...



### Prepare the data




3. Encode, dummify categorical features. Maybe "numerize" categorical features with too many categories.

4. Feature engineering, where appropriate:
    - Discretize continuous features.
    - Decompose features (e.g., categorical, date/time, etc.)  SEX
    - Add promising transformations of features (e.g., log(x), sqrt(x), x^2, etc.).
    - Aggregate features into promising new features.

5. Feature scaling: standardize or normalize features.


### Checklist:
https://github.com/ageron/handson-ml3/blob/main/ml-project-checklist.md




### ML algs

# SVM:
Support Vector Machine (SVM) classifiers can work well with a mix of numerical features (even if they are not normally distributed) and categorical features (encoded as one-hot features). However, there are some considerations to keep in mind:

Feature Scaling: SVMs are sensitive to the scale of features. It's generally a good idea to scale your numerical features, especially if they are on different scales or not normally distributed. Techniques like StandardScaler or MinMaxScaler can be used.

One-Hot Encoding: When you one-hot encode categorical variables, you increase the dimensionality of your feature space. This can be fine for SVMs, but it can lead to increased computational complexity, especially if you have a large number of one-hot encoded features.

Kernel Selection: SVMs use a kernel trick to transform the input space into a higher-dimensional space where it's easier to separate classes. The choice of kernel can impact the model's performance. The linear kernel is a good choice for high-dimensional data, but you might also consider other kernels like Radial Basis Function (RBF) kernel.

Outliers: SVMs can be sensitive to outliers, especially if you're using a linear kernel. Make sure to handle outliers appropriately, especially for features that are not normally distributed.


    



### TODO


    # Create LightGBM dataset for training
    train_data = lgb.Dataset(X_train, label=y_train)

    # Set parameters for LightGBM
    params = {
        'objective': 'binary',  # Assuming binary classification, change if needed
        'metric': 'binary_error',  # Evaluation metric
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'num_leaves': 31,  # Maximum number of leaves in a tree
        'learning_rate': 0.1,  # Learning rate
        'feature_fraction': 0.8,  # Randomly select a fraction of features
        'bagging_fraction': 0.8,  # Randomly select a fraction of data
        'bagging_freq': 5,  # Frequency for bagging
        'verbose': 0  # Suppress output
    }

    # Train the model
    num_round = 100  # Number of boosting rounds
    bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10, verbose_eval=10)


## Models to try to shortlist promissing ones:
- DONE: DecisionTree / RF: integer encoding, or rather one-hot
- LightGBM:
- try GaussianNB on the logged num feats: log afterwards standerdize
- try CategoricalNB on cat feats
- SVM: num: standerdize; cat: dummify
- ANN

- sex column trans (optional)
- ensemble




In [None]:
# TREEE

import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Assume X contains categorical features
# y contains the target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters
params = {
    'objective': 'binary',
    'metric': 'binary_error'
}

# Train the model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10)

# Make predictions
predictions = bst.predict(X_test, num_iteration=bst.best_iteration)

# Convert probabilities to binary predictions
threshold = 0.5
binary_predictions = [1 if pred > threshold else 0 for pred in predictions]

# Evaluate the model
accuracy = accuracy_score(y_test, binary_predictions)
