In [373]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import math
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import xgboost
import pickle

import statsmodels.api as sm

In [365]:
data = pd.read_csv ('../data/data.csv')

In [366]:
y = data ['default'].values

In [367]:
X = np.asarray (data.set_index ('uid').drop ('default', axis = 1))

In [369]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=0)

# Logistic Regression (with statsmodels) for score

In [370]:
# This is to interpret the effectiveness of the credit score
# I use the full data set—there is no performance to be evaluated, it's about relationships

In [368]:
f_1 = np.asarray (data['f_1'])

In [371]:
lr = sm.Logit (endog = y, exog = f_1).fit ()

Optimization terminated successfully.
         Current function value: 0.467339
         Iterations 6


In [372]:
print (lr.summary ())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                11956
Model:                          Logit   Df Residuals:                    11955
Method:                           MLE   Df Model:                            0
Date:                Sun, 29 May 2022   Pseudo R-squ.:                  0.2803
Time:                        18:50:52   Log-Likelihood:                -5587.5
converged:                       True   LL-Null:                       -7763.2
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -1.7333      0.030    -57.626      0.000      -1.792      -1.674


# Gradient Boosting

In [337]:
gb_pipe = Pipeline([('gb', GradientBoostingClassifier())])

params = [{'gb__n_estimators': [1000, 2000],
           'gb__learning_rate': [0.01, 0.1, 0.2],
         'gb__max_depth': [1, 2, 3]}]

gs_gb = GridSearchCV(gb_pipe,
                      param_grid=params,
                      scoring='f1',
                      cv=5)
gs_gb.fit(X_train, y_train)
gs_gb.best_params_

{'gb__learning_rate': 0.01, 'gb__max_depth': 3, 'gb__n_estimators': 1000}

In [361]:
gb = (GradientBoostingClassifier(n_estimators = 1000, max_depth=3, learning_rate = 0.01,
                                 random_state=0)
      .fit(X_train, y_train))

gb.score(X_test, y_test)

0.9360367892976589

In [362]:
predictions = gb.predict(X_test)

metrics.confusion_matrix(y_test, predictions)

array([[1439,  107],
       [  46,  800]])

In [363]:
metrics.f1_score (y_test, predictions)

0.9127210496292071

# Random Forest

In [329]:
rf_pipe = Pipeline([('rf', RandomForestClassifier())])

params = [{'rf__n_estimators': [500, 1000],
          'rf__min_samples_leaf' : [2, 0.01, 0.05]}]

gs_rf = GridSearchCV(rf_pipe,
                     param_grid=params,
                      scoring='f1',
                      cv=5)

gs_rf.fit(X_train, y_train)
gs_rf.best_params_

{'rf__min_samples_leaf': 2, 'rf__n_estimators': 500}

In [339]:
rf = (RandomForestClassifier(n_estimators = 1000,, random_state=0)
      .fit(X_train, y_train))

rf.score(X_test, y_test)

0.927675585284281

In [354]:
predictions = rf.predict(X_test)

metrics.confusion_matrix(y_test, predictions)

array([[1438,  108],
       [  65,  781]])

In [355]:
metrics.f1_score (y_test, predictions)

0.9002881844380403

# XGBoost

In [338]:
xg_pipe = Pipeline([('xg', xgboost.XGBClassifier())])

params = [{
    'xg__max_depth': [2, 5],
    'xg__n_estimators': [200, 500, 1000],
    'xg__learning_rate': [0.1, 0.05]
}]

gs_xg = GridSearchCV(xg_pipe,
                     param_grid=params,
                      scoring='f1',
                      cv=5)

gs_xg.fit(X_train, y_train)
gs_xg.best_params_

{'xg__learning_rate': 0.05, 'xg__max_depth': 5, 'xg__n_estimators': 200}

In [344]:
xg = (xgboost.XGBClassifier(n_estimators = 200, learning_rate = 0.05, 
                            max_depth = 5,
                            random_state=0)
      .fit(X_train, y_train))

xg.score(X_test, y_test, )

0.9364548494983278

In [351]:
predictions = xg.predict(X_test)

metrics.confusion_matrix(y_test, predictions)

array([[1438,  108],
       [  44,  802]])

In [353]:
metrics.f1_score (y_test, predictions)

0.9134396355353075

## Final model trained using the full data set

In [374]:
model = (xgboost.XGBClassifier(n_estimators = 200, learning_rate = 0.05, 
                            max_depth = 5,
                            random_state=0)
      .fit(X, y))

In [375]:
pickle.dump(model, open('../artifacts/model.sav', 'wb'))