# Setup

## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

## Global Configurations

In [2]:
INPUT_DIR = '../data/input/'
INPUT_PATH = f'{INPUT_DIR}german_data.csv'
OUTPUT_DIR = INPUT_DIR
EDA_DIR = '../data/EDA/'
PIPELINE_DIR = '../data/pipeline/'
OUTPUT = False
SEED = 888

In [3]:
if not os.path.exists(PIPELINE_DIR):
    os.makedirs(PIPELINE_DIR)

In [4]:
pd.options.display.max_columns = None

## Global Function

In [5]:
def display_scores(title, scores, y_test, y_pred):
    print(f'Model: {title}')
    print('Cross Validation Performance')
    print(f'CV Mean ROC AUC: {np.mean(scores):.4f}')
    print(f'CV Variance ROC AUC: {np.var(scores):.4f}')
    print('\n')

    print('Test Performance')
    print(f'Test AUC: {roc_auc_score(y_test, y_pred ):.4f}')
    print('\n')
    print(classification_report(y_test, y_pred))


def display_model_scores(model, X_train, y_train, X_test, y_test, title):
    cv = RepeatedStratifiedKFold(n_splits=10,
                                 n_repeats=5,
                                 random_state=SEED
                                 )
    scores = cross_val_score(model, X_train, y_train,
                             scoring='roc_auc', cv=cv, n_jobs=-1
                             )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    display_scores(title, scores, y_test, y_pred)

# Data

In [6]:
df = pd.read_csv(INPUT_PATH)

# dropping foreign worker columns because too less variation
cols = ['label', 'foreign_worker']
X = df.drop(cols, axis=1)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=SEED,
                                                    )

print(f'Train Bad Payer Proportion: {y_train.sum()/ y_train.shape[0]} ({y_train.sum()})')
print(f'Test Bad Payer Proportion: {y_test.sum()/ y_test.shape[0]} ({y_test.sum()})')

Train Bad Payer Proportion: 0.30142857142857143 (211)
Test Bad Payer Proportion: 0.2966666666666667 (89)


# Baseline Model

In [7]:
X_train1 = pd.get_dummies(X_train, drop_first=True)
X_test1 = pd.get_dummies(X_test, drop_first=True)
assert (X_train1.columns == X_test1.columns).all()

## Logistic Regression

### Without Balanced

In [8]:
model = LogisticRegression(random_state=SEED,
                           solver='liblinear',
                           max_iter=2000, 
                          )

display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'Logistic Regression without Balanced')

Model: Logistic Regression without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.7845
CV Variance ROC AUC: 0.0027


Test Performance
Test AUC: 0.6400


              precision    recall  f1-score   support

           0       0.78      0.85      0.81       211
           1       0.55      0.43      0.48        89

    accuracy                           0.73       300
   macro avg       0.66      0.64      0.65       300
weighted avg       0.71      0.73      0.72       300



### Balanced

In [9]:
model = LogisticRegression(random_state=SEED,
                           solver='liblinear',
                           max_iter=2000, 
                           class_weight='balanced',
                          )

display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'Logistic Regression Balanced')

Model: Logistic Regression Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.7822
CV Variance ROC AUC: 0.0027


Test Performance
Test AUC: 0.6910


              precision    recall  f1-score   support

           0       0.84      0.70      0.76       211
           1       0.49      0.69      0.57        89

    accuracy                           0.69       300
   macro avg       0.66      0.69      0.67       300
weighted avg       0.74      0.69      0.70       300



In [10]:
model.fit(X_train1, y_train)
coef_series = pd.Series(model.coef_.reshape(-1), name='coef')
feature_names_series = pd.Series(model.feature_names_in_, name='feature')

df_feature_importance = pd.concat([feature_names_series, coef_series], axis=1)
df_feature_importance.sort_values(by='coef', ascending=False)

Unnamed: 0,feature,coef
38,property_A123,0.514532
11,credit_history_A31,0.459054
21,purpose_A46,0.435852
35,other_debtors_or_guarantors_A102,0.429551
39,property_A124,0.429359
5,no_of_existing_credits,0.410524
46,job_A174,0.349022
44,job_A172,0.340294
37,property_A122,0.319934
2,installment_rate_percent,0.287844


## Decision Tree

### Without Balanced

In [11]:
model = DecisionTreeClassifier(random_state=SEED,
                               criterion='entropy',
                               min_samples_split=100,
                               )

display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'Decision Tree without Balanced')

Model: Decision Tree without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.7166
CV Variance ROC AUC: 0.0034


Test Performance
Test AUC: 0.6687


              precision    recall  f1-score   support

           0       0.79      0.88      0.83       211
           1       0.61      0.46      0.53        89

    accuracy                           0.75       300
   macro avg       0.70      0.67      0.68       300
weighted avg       0.74      0.75      0.74       300



### Balanced

In [12]:
model = DecisionTreeClassifier(random_state=SEED,
                               criterion='entropy',
                               min_samples_split=100,
                               class_weight='balanced'
                               )

display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'Decision Tree Balanced')

Model: Decision Tree Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.7168
CV Variance ROC AUC: 0.0036


Test Performance
Test AUC: 0.6363


              precision    recall  f1-score   support

           0       0.81      0.62      0.70       211
           1       0.42      0.65      0.51        89

    accuracy                           0.63       300
   macro avg       0.61      0.64      0.61       300
weighted avg       0.69      0.63      0.65       300



In [13]:
model.fit(X_train1, y_train)
feature_importance_series = pd.Series(model.feature_importances_, name='feature_importance')
feature_series = pd.Series(model.feature_names_in_, name='features')

df_feature_importance = pd.concat([feature_series, feature_importance_series], axis=1)
df_feature_importance = df_feature_importance.sort_values(by='feature_importance', ascending=False)

mask = df_feature_importance.feature_importance > 0
df_feature_importance.loc[mask, :]

Unnamed: 0,features,feature_importance
10,existing_checking_account_status_A14,0.376498
0,duration_mth,0.188147
41,other_installment_plans_A143,0.088733
14,credit_history_A34,0.085541
7,monthly_affordability_amount,0.076667
27,saving_accounts_or_bonds_A65,0.076289
15,purpose_A41,0.043982
26,saving_accounts_or_bonds_A64,0.038232
39,property_A124,0.025913


## SVM

### Without Balanced

In [14]:
model = SVC(random_state=SEED,
            kernel='rbf', 
            C=2,
            probability=True,
           )
display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'SVM without Balanced')

Model: SVM without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.5751
CV Variance ROC AUC: 0.0067


Test Performance
Test AUC: 0.5234


              precision    recall  f1-score   support

           0       0.71      0.99      0.83       211
           1       0.71      0.06      0.10        89

    accuracy                           0.71       300
   macro avg       0.71      0.52      0.47       300
weighted avg       0.71      0.71      0.61       300



### Balanced

In [15]:
model = SVC(random_state=SEED,
            kernel='rbf', 
            C=2,
            probability=True,
            class_weight='balanced'
           )
display_model_scores(model, X_train1, y_train, X_test1, y_test,
                     'SVM Balanced')

Model: SVM Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.5904
CV Variance ROC AUC: 0.0064


Test Performance
Test AUC: 0.5418


              precision    recall  f1-score   support

           0       0.73      0.79      0.76       211
           1       0.37      0.29      0.33        89

    accuracy                           0.64       300
   macro avg       0.55      0.54      0.54       300
weighted avg       0.62      0.64      0.63       300

