# Setup

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

import os

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

## Global Configuration

In [2]:
INPUT_DIR = '../data/input/'
INPUT_PATH = f'{INPUT_DIR}german_data.csv'
OUTPUT_DIR = INPUT_DIR
EDA_DIR = '../data/EDA/'
PIPELINE_DIR = '../data/pipeline/'
OUTPUT = False
SEED = 888

In [3]:
pd.options.display.max_columns = None

## Global Function

In [4]:
def display_scores(title, scores, y_test, y_pred):
    print(f'Model: {title}')
    print('Cross Validation Performance')
    print(f'CV Mean ROC AUC: {np.mean(scores):.4f}')
    print(f'CV Variance ROC AUC: {np.var(scores):.4f}')
    print('\n')

    print('Test Performance')
    print(f'Test AUC: {roc_auc_score(y_test, y_pred ):.4f}')
    print('\n')
    print(classification_report(y_test, y_pred))


def display_model_scores(model, X_train, y_train, X_test, y_test, title):
    cv = RepeatedStratifiedKFold(n_splits=10,
                                 n_repeats=5,
                                 random_state=SEED
                                 )
    scores = cross_val_score(model, X_train, y_train,
                             scoring='roc_auc', cv=cv, n_jobs=-1
                             )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    display_scores(title, scores, y_test, y_pred)

# Data

In [5]:
df = pd.read_csv(INPUT_PATH)

cols = ['label', 'foreign_worker']
X = df.drop(cols, axis=1)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=SEED
                                                   )

In [6]:
categorical_cols = X_train.select_dtypes(object).columns.tolist()
numerical_cols = [col for col in X_train if col not in categorical_cols]
categorical_cols_index = [col in categorical_cols for col in X_train.columns]
print(f'Categorical Columns: {categorical_cols}')
print('\n')
print(f'Numerical Columns: {numerical_cols}')

Categorical Columns: ['existing_checking_account_status', 'credit_history', 'purpose', 'saving_accounts_or_bonds', 'present_employment_since', 'personal_status_sex', 'other_debtors_or_guarantors', 'property', 'other_installment_plans', 'housing', 'job', 'telephone']


Numerical Columns: ['duration_mth', 'credit_amount', 'installment_rate_percent', 'present_residence_since', 'age_years', 'no_of_existing_credits', 'no_of_ppl_liable', 'monthly_affordability_amount']


# Oversampling

In [7]:
smote = SMOTENC(sampling_strategy='auto',  # not majority
                random_state=SEED,
                categorical_features=categorical_cols_index,
                n_jobs=-1,
                k_neighbors=100
                )

In [8]:
X_resample, y_resample = smote.fit_resample(X_train, y_train)

In [9]:
y_resample.value_counts()

0    489
1    489
Name: label, dtype: int64

In [10]:
X_resample1 = pd.get_dummies(X_resample)
X_test1 = pd.get_dummies(X_test)

# Baseline Model with Oversampling

## Logistic Regression

In [11]:
model = LogisticRegression(random_state=SEED,
                           solver='liblinear',
                           max_iter=2000, 
                          )

display_model_scores(model, X_resample1, y_resample, X_test1, y_test,
                     'Logistic Regression without Balanced')

Model: Logistic Regression without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.8990
CV Variance ROC AUC: 0.0016


Test Performance
Test AUC: 0.6630


              precision    recall  f1-score   support

           0       0.80      0.79      0.79       211
           1       0.52      0.54      0.53        89

    accuracy                           0.71       300
   macro avg       0.66      0.66      0.66       300
weighted avg       0.72      0.71      0.72       300



In [12]:
model.fit(X_resample1, y_resample)
coef_series = pd.Series(model.coef_.reshape(-1), name='coef')
feature_names_series = pd.Series(model.feature_names_in_, name='feature')

df_feature_importance = pd.concat([feature_names_series, coef_series], axis=1)
df_feature_importance.sort_values(by='coef', ascending=False)

Unnamed: 0,feature,coef
17,purpose_A40,1.059837
8,existing_checking_account_status_A11,0.988241
27,saving_accounts_or_bonds_A61,0.800173
24,purpose_A46,0.751693
13,credit_history_A31,0.50792
46,property_A123,0.459888
34,present_employment_since_A73,0.432799
20,purpose_A42,0.376524
12,credit_history_A30,0.277263
5,no_of_existing_credits,0.251242


## Decision Tree

In [13]:
model = DecisionTreeClassifier(random_state=SEED,
                               criterion='entropy',
                               min_samples_split=100,
                               )

display_model_scores(model, X_resample1, y_resample, X_test1, y_test,
                     'Decision Tree without Balanced')

Model: Decision Tree without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.8703
CV Variance ROC AUC: 0.0016


Test Performance
Test AUC: 0.6799


              precision    recall  f1-score   support

           0       0.81      0.79      0.80       211
           1       0.53      0.57      0.55        89

    accuracy                           0.72       300
   macro avg       0.67      0.68      0.68       300
weighted avg       0.73      0.72      0.73       300



In [14]:
model.fit(X_resample1, y_resample)
feature_importance_series = pd.Series(model.feature_importances_, name='feature_importance')
feature_series = pd.Series(model.feature_names_in_, name='features')

df_feature_importance = pd.concat([feature_series, feature_importance_series], axis=1)
df_feature_importance = df_feature_importance.sort_values(by='feature_importance', ascending=False)

mask = df_feature_importance.feature_importance > 0
df_feature_importance.loc[mask, :]

Unnamed: 0,features,feature_importance
8,existing_checking_account_status_A11,0.470836
46,property_A123,0.154438
9,existing_checking_account_status_A12,0.061129
34,present_employment_since_A73,0.058424
50,other_installment_plans_A143,0.051781
0,duration_mth,0.050646
21,purpose_A43,0.032814
1,credit_amount,0.026977
27,saving_accounts_or_bonds_A61,0.023723
31,saving_accounts_or_bonds_A65,0.013034


## SVM

In [15]:
model = SVC(random_state=SEED,
            kernel='rbf', 
            C=2,
            probability=True,
           )
display_model_scores(model, X_resample1, y_resample, X_test1, y_test,
                     'SVM without Balanced')

Model: SVM without Balanced
Cross Validation Performance
CV Mean ROC AUC: 0.5787
CV Variance ROC AUC: 0.0029


Test Performance
Test AUC: 0.5184


              precision    recall  f1-score   support

           0       0.71      0.71      0.71       211
           1       0.32      0.33      0.32        89

    accuracy                           0.60       300
   macro avg       0.52      0.52      0.52       300
weighted avg       0.60      0.60      0.60       300

