In [331]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import math
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import xgboost

import statsmodels.api as sm

In [216]:
data = pd.read_csv ('../data/data.csv')

In [189]:
objects = [x for x in data.columns.values if '_o_' in x]

In [190]:
y = data ['default'].values

In [191]:
X = np.asarray (data.set_index ('uid').drop ('default', axis = 1))

In [291]:
X_no_object = np.asarray (data.set_index ('uid').drop (['default'] +
                                                       objects, axis = 1))

In [292]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=0)

X_no_train, X_no_test, y_no_train, y_no_test = train_test_split(X_no_object, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=0)

# Logistic Regression (with statsmodels) for score

In [323]:
lr = sm.Logit (endog = y, exog = X_no_object [:, 0]).fit ()

Optimization terminated successfully.
         Current function value: 0.467339
         Iterations 6


In [324]:
print (lr.summary ())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                11956
Model:                          Logit   Df Residuals:                    11955
Method:                           MLE   Df Model:                            0
Date:                Sun, 29 May 2022   Pseudo R-squ.:                  0.2803
Time:                        15:17:03   Log-Likelihood:                -5587.5
converged:                       True   LL-Null:                       -7763.2
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -1.7333      0.030    -57.626      0.000      -1.792      -1.674


# Gradient Boosting

In [None]:
params = [{'knn__n_neighbors': [3, 5, 7, 9],
         'knn__weights': ['uniform', 'distance'],
         'knn__leaf_size': [15, 20]}]

In [320]:
gb_pipe = Pipeline([('gb', GradientBoostingClassifier())])

params = [{'gb__learning_rate': [0.05, 0.1, 0.2],
         'gb__max_depth': [1, 2]}]

gs_gb = GridSearchCV(gb_pipe,
                      param_grid=params,
                      scoring='f1',
                      cv=5)
gs_gb.fit(X_train, y_train)
gs_gb.best_params_

{'gb__learning_rate': 0.1, 'gb__max_depth': 2}

In [None]:
predictions = gr.predict(X_test)

cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

# Random Forest

In [329]:
rf_pipe = Pipeline([('rf', RandomForestClassifier())])

params = [{'rf__n_estimators': [500, 1000],
          'rf__min_samples_leaf' : [2, 0.01, 0.05]}]

gs_rf = GridSearchCV(rf_pipe,
                     param_grid=params,
                      scoring='f1',
                      cv=5)

gs_rf.fit(X_train, y_train)
gs_rf.best_params_

{'rf__min_samples_leaf': 2, 'rf__n_estimators': 500}

In [67]:
rf = (RandomForestClassifier(n_estimators = 1000, max_depth=30, random_state=0)
      .fit(X_train, y_train))

rf.score(X_test, y_test)

0.9285117056856187

In [68]:
predictions = rf.predict(X_test)

cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

[[1439  107]
 [  64  782]]


# XGBoost

In [None]:
xg_pipe = Pipeline([('xg', xgboost.XGBClassifier())])

params = [{
    'xg__max_depth': [2, 10],
    'xg__n_estimators': [50, 100, 200],
    'xg__learning_rate': [0.1, 0.01]
}]

gs_xg = GridSearchCV(xg_pipe,
                     param_grid=params,
                      scoring='f1',
                      cv=5)

gs_rf.fit(X_train, y_train)
gs_rf.best_params_