In [7]:
import pandas as pd

import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


from utils import remove_top_quantile


import pickle

### Data preparation and feature engineering

In [8]:
# load data
df = pd.read_csv('../data/GiveMeSomeCredit-training.csv')
df = df.drop(["Unnamed: 0"], axis=1)

# impute missing values
imputer = SimpleImputer(strategy='median')

columns = df.columns
index = df.index    
df = pd.DataFrame(imputer.fit_transform(df))
df.columns = columns
df.index = index

# outlier removal
df = remove_top_quantile(df, "RevolvingUtilizationOfUnsecuredLines", 0.99)
df = remove_top_quantile(df, "DebtRatio", 0.99)
df = remove_top_quantile(df, "MonthlyIncome", 0.99)

# feature engineering
def f(a):
    # never late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 0
    # 30-59 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 1
    # 60-89 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 2
    # 90+ late
    return 3


df["PastDueSevereness"] = df.apply(f, axis=1)

# drop
df = df.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scaling
scaler = StandardScaler()
df[['age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.fit_transform(
    df[['age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])


### Load and process test data

In [9]:
# load data
df_test = pd.read_csv('../data/GiveMeSomeCredit-testing.csv')
df_test = df_test.drop(["Unnamed: 0"], axis=1)

# impute
columns = df_test.columns
index = df_test.index    
df_test = pd.DataFrame(imputer.transform(df_test))
df_test.columns = columns
df_test.index = index

# apply new feature
df_test["PastDueSevereness"] = df_test.apply(f, axis=1)

# drop
df_test = df_test.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scale
df_test[['age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.transform(
    df_test[['age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])

### Split data into input and target

In [10]:
X_train = df[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_train = df[["SeriousDlqin2yrs"]]

X_test = df_test[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_test = df_test[["SeriousDlqin2yrs"]]

### Init models

In [None]:
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree' : DecisionTreeClassifier()
}

"""param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'n_estimators': [50,100,150,200]
                        , 'max_depth': [None, 10, 20, 30]
                        ,'n_jobs':[-1]
                        ,'max_leaf_nodes':[89]
                        ,'max_features':[5]
                        ,'min_samples_split':[11]
                        ,'min_samples_leaf':[1]
                        ,'min_impurity_decrease':[0.0]},
    'Decision Tree':{'max_depth': [None, 10, 20, 30, 40, 50],'criterion':['entropy','gini']}
}"""
param_grids = {
    'Logistic Regression': {'C': [1]},
    'Random Forest': {'n_estimators': [10], 'max_depth': [5]},
    'Decision Tree':{'max_depth': [5],'criterion':['gini']}
}

### Grid search

In [12]:
results = {}
for model_name, model in models.items():
        
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=cv, scoring='accuracy')

    # Fit the grid search on the training data
    print(f"... Fitting {model_name} with params\n... \t{param_grids[model_name]}")
    grid_search.fit(X_train, y_train)

    # Perform cross-validation using the best estimator found by grid search
    y_pred_cv = cross_val_predict(grid_search.best_estimator_, X_test, y_test, cv=cv)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_cv)

    results[model_name] = {'best_params': grid_search.best_params_, 'accuracy': accuracy}

# Select the best model based on accuracy
best_model_name = max(results, key=lambda key: results[key]['accuracy'])
best_model = models[best_model_name]

print(f"\nBest Model: {best_model_name}")
print(f"\nBest parameter: {results[best_model_name]['best_params']}")
print(f"\nAccuracy: {results[best_model_name]['accuracy']}")

... Fitting Logistic Regression with params
... 	{'C': [1]}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.float64(0.0)

## Fit best model on all available data

In [None]:
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

0

In [None]:
best_model.fit(X, y)

In [None]:
with open("../models/model.pkl", "wb") as f:
    pickle.dump(best_model, f)