In [1]:
import pandas as pd

import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


from utils import remove_top_quantile


import pickle

### Data preparation and feature engineering

In [2]:
# load data
df = pd.read_csv('../data/GiveMeSomeCredit-training.csv')
df = df.drop(["Unnamed: 0"], axis=1)

# impute missing values
imputer = SimpleImputer(strategy='median')

columns = df.columns
index = df.index    
df = pd.DataFrame(imputer.fit_transform(df))
df.columns = columns
df.index = index

# outlier removal
df = remove_top_quantile(df, "RevolvingUtilizationOfUnsecuredLines", 0.99)
df = remove_top_quantile(df, "DebtRatio", 0.99)
df = remove_top_quantile(df, "MonthlyIncome", 0.99)

# feature engineering
def f(a):
    # never late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 0
    # 30-59 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 1
    # 60-89 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 2
    # 90+ late
    return 3


df["PastDueSevereness"] = df.apply(f, axis=1)

# drop
df = df.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scaling
scaler = StandardScaler()
df[['age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.fit_transform(
    df[['age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])


### Load and process test data

In [3]:
# load data
df_test = pd.read_csv('../data/GiveMeSomeCredit-eval.csv')
df_test = df_test.drop(["Unnamed: 0"], axis=1)

# impute
columns = df_test.columns
index = df_test.index    
df_test = pd.DataFrame(imputer.transform(df_test))
df_test.columns = columns
df_test.index = index

# apply new feature
df_test["PastDueSevereness"] = df_test.apply(f, axis=1)

# drop
df_test = df_test.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scale
df_test[['age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.transform(
    df_test[['age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])

### Split data into input and target

In [4]:
X_train = df[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_train = df[["SeriousDlqin2yrs"]]

X_test = df_test[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_test = df_test[["SeriousDlqin2yrs"]]

### Init models

In [None]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(class_weight="balanced"),
    'Random Forest': RandomForestClassifier(class_weight="balanced"),
    'Decision Tree' : DecisionTreeClassifier(class_weight="balanced")
}

param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1]},
    'Random Forest': {'n_estimators': [10, 15], 'max_depth': [5, 10]},
    'Decision Tree':{'max_depth': [5, 10],'criterion':['gini']}
}

### Grid search

In [6]:
results = {}
for model_name, model in models.items():
        
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=cv)

    # Fit the grid search on the training data
    print(f"... Fitting {model_name} with params\n... \t{param_grids[model_name]}")
    grid_search.fit(X_train, y_train["SeriousDlqin2yrs"])

    # Perform cross-validation using the best estimator found by grid search
    y_pred_test = cross_val_predict(grid_search.best_estimator_, X_test, y_test["SeriousDlqin2yrs"], cv=cv)

    # Calculate accuracy, precision, recall and F1 score
    accuracy = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test)
    recall = recall_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)

    # save results
    results[model_name] = {
        'best_params': grid_search.best_params_, 
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

... Fitting Logistic Regression with params
... 	{'C': [0.001, 0.01, 0.1, 1]}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


... Fitting Random Forest with params
... 	{'n_estimators': [10, 15], 'max_depth': [5, 10]}
... Fitting Decision Tree with params
... 	{'max_depth': [5, 10], 'criterion': ['gini']}


### Print best model
Find the best model by F1 score on valid dataset

In [7]:
best_model_name = max(results, key=lambda key: results[key]['f1'])
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Best parameter: {results[best_model_name]['best_params']}")
print(f"Accuracy: {results[best_model_name]['accuracy']}")
print(f"Precision: {results[best_model_name]['precision']}")
print(f"Recall: {results[best_model_name]['recall']}")
print(f"F1: {results[best_model_name]['f1']}")

Best Model: Random Forest
Best parameter: {'max_depth': 10, 'n_estimators': 10}
Accuracy: 0.8904109589041096
Precision: 0.3274907749077491
Recall: 0.4916897506925208
F1: 0.39313399778516056
