# Modeling

In [2]:
#instantiate Logistic Regression and get up and running by the end of the day
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Import sklearn elements
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, r2_score
from sklearn.ensemble import RandomForestClassifier

#import xgboost
from xgboost import XGBClassifier

In [3]:
#read in file from csv to dataframe
data_read = pd.read_csv('./cleaned_FEATURES.csv')


In [4]:
data = data_read

In [5]:
num_cols = data.select_dtypes(include=('int64', 'float64')).columns

In [6]:
list(num_cols)

['loan_amnt',
 'int_rate',
 'installment',
 'grade',
 'emp_length',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mort_acc',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'pct_tl_nvr_dlq',
 'percent_bc_gt_75',
 'pub_rec_bankruptcies',
 'tax_liens',
 'tot_hi_cred_lim',
 'total_bal_ex_mort',
 'total_il_high_credit_limit',
 'classes',
 

In [7]:
data.dtypes.value_counts()

int64      85
float64    71
object      6
dtype: int64

In [8]:
X = data[num_cols].drop(columns='classes')
y = data['classes']

In [9]:
X.dtypes.value_counts()

int64      84
float64    71
dtype: int64

In [10]:
# Create training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=420,
                                                    stratify = y)

In [11]:
X_train.shape

(847634, 155)

In [12]:
y_train.shape

(847634,)

In [29]:
def metrics(model):
    preds          = model.predict(X_test) #generate predictions
    
    test_conf      = confusion_matrix(y_test,# True values.
                                  preds) # Predicted values.
    
    tn, fp, fn, tp = test_conf.ravel() #unravel values to use in metrics
    
    accuracy       = accuracy_score(y_test, preds)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    recall         = recall_score(y_test, preds)
    print("Recall: %.2f%%" % (recall * 100.0))

    precision      = precision_score(y_test, preds)
    print("Precision: %.2f%%" % (precision * 100.0))

    specificity    = tn / (tn+fp)
    specificity    = specificity * 100.0
    print(f"Specificity: {round((specificity),2)}%")
    
    sensitivity    = tp / (tp+fn)
    sensitivity    = sensitivity * 100.0
    print(f"Sensitivity: {round((sensitivity),2)}%")
    
    df_conf= pd.DataFrame(test_conf, index =  ['Actual Failure', 'Actual Success'],
                    columns = ['Predicted Failure', 'Predicted Success'])
    return(df_conf)

In [16]:
def cv_score(model):
    cv_train = cross_val_score(model, X_train, y_train, cv=3).mean()
    cv_test = cross_val_score(model, X_test, y_test, cv=3).mean()
    print(f'CV Score for Training: {cv_train}')
    print(f'CV Score for Testing: {cv_test}')

## Logistic Regression

**Assumptions of Logistic Regression:**
- Linearity: The independent variables X1, . . . , Xm are linearly related to the logit of the probability that Y = 1 or, equivalently, the log-odds that Y = 1.
- Independence of Errors: The observations y1, . . . , yn are independent of one another.
- Distribution of Errors: Each observation yi follows a Bernoulli distribution with probability of success pi.
- Independence of Independent Variables: The independent variables X1, . . . , Xm are independent of one another.

In [13]:
# Step 1: Instantiate our model.
logreg = LogisticRegression(solver='liblinear')

# Step 2: Fit our model.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
#return the intercept and coefficients
print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [6.32961328e-14]
Logistic Regression Coefficient: [[ 4.07904076e-10  5.96308983e-13  1.38685475e-11  3.97652170e-13
   3.65134606e-13  3.58958541e-09  9.55160985e-13  1.44740001e-14
   4.42965257e-11  4.45497139e-11  2.93043667e-14  6.55296536e-13
   1.43212357e-14  5.74296441e-10  2.94267183e-12  1.40966286e-12
  -8.16589498e-11 -8.16419450e-11 -8.55622428e-11 -1.43699681e-11
   7.19649478e-10  5.33124581e-11  5.63098828e-11  8.04785828e-16
   1.81214189e-16  1.84079718e-11  3.52829790e-09  2.31774505e-13
   3.75436627e-10  5.98756624e-10  3.48069342e-12  4.89812832e-16
  -1.17568426e-13  7.59389274e-14  3.01246166e-14  2.01801095e-13
   3.05684566e-13  2.69587830e-13  4.85434240e-13  4.47955437e-13
   4.80342780e-13  8.71571369e-13  3.05066804e-13  6.52654291e-13
   9.89937753e-17  4.57196640e-15  1.06423871e-13  5.96500407e-12
   2.45818434e-12  9.89397629e-15  2.54569938e-15  4.86727510e-09
   1.63626464e-09  1.40523224e-09  3.53115575e-10  1.21189313

In [15]:
## how to better represent the coefficients?

In [17]:
#how to visualize these results?
print(f'Logistic Regression train score: {logreg.score(X_train, y_train)}')
print(f'Logistic Regression test score: {logreg.score(X_test, y_test)}')

Logistic Regression train score: 0.7890657996257818
Logistic Regression test score: 0.7891634918188024


In [36]:
cv_train = cross_val_score(logreg, X_train, y_train, cv=3).mean()
cv_test = cross_val_score(logreg, X_test, y_test, cv=3).mean()
print(f'CV Score for Training: {cv_train}')
print(f'CV Score for Testing: {cv_test}')

CV Score for Training: 0.7894504068921031
CV Score for Testing: 0.7955884107197421


In [30]:
metrics(logreg)

Accuracy: 78.92%
Recall: 99.91%
Precision: 78.91%
Specificity: 1.36%
Sensitivity: 99.91%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,1055,76335
Actual Success,256,285626


In [21]:
data['classes'].value_counts(normalize=True)

1    0.786963
0    0.213037
Name: classes, dtype: float64

## Random Forest

In [22]:
rf = RandomForestClassifier(max_depth= 5, max_features= 5, n_estimators= 50)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
print(f'Random Forest train score: {rf.score(X_train, y_train)}')
print(f'Random Forest test score: {rf.score(X_test, y_test)}')

Random Forest train score: 0.9180058846152939
Random Forest test score: 0.9171942786672246


In [31]:
metrics(rf)

Accuracy: 91.72%
Recall: 100.00%
Precision: 90.48%
Specificity: 61.13%
Sensitivity: 100.0%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,47309,30081
Actual Success,0,285882


best params result: 
{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}

## XGBoost

In [26]:
xgb = XGBClassifier()

In [27]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [32]:
metrics(xgb)

Accuracy: 97.75%
Recall: 98.68%
Precision: 98.46%
Specificity: 94.32%
Sensitivity: 98.68%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,72992,4398
Actual Success,3779,282103


In [35]:
print(f'XGBoost train score: {xgb.score(X_train, y_train)}')
print(f'XGBoost test score: {xgb.score(X_test, y_test)}')

XGBoost train score: 0.9779362319114145
XGBoost test score: 0.9774906956770685


### VISUALIZATION FOR RESULTS OF THE FOLLOWING TESTS:
1. Logistic Regression
2. Random Forest
3. XGBoost

Conclusions: As expected, XGBoost is the undisputed champion in the classification engine world! This is the first time using XGBoost, but after hearing about all of the success attributed to it for Kaggle competitions I had to give it a shot. Honestly, this surpasses even my wildest expectations in all metrics. As an important takeaway, it also minimized the potential costs to investors as determined by False Positives- when the algorithm predicted Successful loans but they ended up being Failures. 