# Scoring function

In [1]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, accuracy_score

def score(y_true, y_pred):
    """Print F1, precision, accuracy and the confusion matrix of binary predictions.

    The three scores are printed as percentages rounded to two decimals,
    followed by the four cells of the 2x2 confusion matrix (rows follow
    ``y_true``, columns follow ``y_pred``). Nothing is returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be coded 0/1.
    y_pred : array-like
        Predicted labels, expected to be coded 0/1.
    """
    # Pin the label set so the matrix is always 2x2. Without `labels`, inputs
    # in which only a single class occurs produce a 1x1 matrix and the
    # A[0,1] / A[1,*] lookups below raise IndexError.
    A = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(
    f''' 
    F1 score              | {round(f1_score(y_true, y_pred)*100, 2)}        \n
    -----------------------------
    Precision score       | {round(precision_score(y_true, y_pred)*100, 2)} \n
    -----------------------------
    Accuracy              | {round(accuracy_score(y_true, y_pred)*100, 2)}  \n
    -----------------------------
    Confusion matrix | {A[0,0]} {A[0,1]}
                       {A[1,0]} {A[1,1]}
    '''
    )

# Preprocessing

In [7]:
import pandas as pd
import numpy as np
from joblib import load

# Load the persisted ordinal encoder, one-hot encoder and standard scaler
# (in that order); preprocessing() only calls .transform() on them.
ord_encoder, one_hot_encoder, std_sc = map(load, (
    './models/encoders/ordinal_encoder.joblib',
    './models/encoders/one_hot_encoder.joblib',
    './models/encoders/standard_scaler.joblib',
))

def preprocessing(df):
    """Filter a loans frame and build both encoded feature matrices and the target.

    Steps, in order:
      1. drop every row whose ``loan_status`` is ``'Current'``;
      2. keep only rows whose ``sub_grade`` does not start with A, B or C;
      3. encode ``loan_status`` as 0 (``'Defaulted'``) / 1 (``'FullyPaid'``);
      4. transform the feature columns with ``ord_encoder``;
      5. transform them with ``one_hot_encoder``, drop the columns listed in
         ``drop_first`` and scale the result with ``std_sc``.

    Returns
    -------
    tuple
        ``(X_ord, X_oh, y)``: ordinal-encoded features, scaled one-hot
        features, and the 0/1 target.
    """
    current_idx = df.index[df['loan_status'] == 'Current']
    finished = df.drop(index=current_idx)

    # True for sub-grades outside the A/B/C grades (first character decides).
    outside_abc = finished['sub_grade'].apply(lambda grade: grade[0] not in ['A', 'B', 'C'])
    selected = finished.loc[outside_abc, :]

    y = selected['loan_status'].replace({'Defaulted':0, 'FullyPaid':1})
    # NOTE(review): features are "every column but the last"; this presumably
    # relies on loan_status being the last column of the CSV — confirm.
    features = selected.iloc[:, :-1]

    X_ord = ord_encoder.transform(features)

    # Presumably the first dummy column of each one-hot group — TODO confirm
    # against how the scaler was fitted.
    drop_first = ['term_1', 'home_ownership_1', 'purpose_1','addr_state_1',
                  'initial_list_status_1','sub_grade_1','emp_length_1']
    one_hot = one_hot_encoder.transform(features).drop(columns = drop_first)
    X_oh = std_sc.transform(one_hot)

    return X_ord, X_oh, y

# Load models

In [3]:
from tensorflow.keras.models import load_model
from joblib import load

# joblib-persisted models, loaded in the same order as before; each name lines
# up with the path on the matching row of the right-hand tuple.
(model_rf,
 model_xgb,
 model_rf2,
 model_xgb2,
 model_lgbm,
 model_logit) = (
    load('./models/RandomForest_1.joblib'),
    load('./models/XGBoost_1.joblib'),
    load('./models/RandomForest_2.joblib'),
    load('./models/XGBoost_2.joblib'),
    load('./models/LGBM_1.joblib'),
    load('./models/logit.joblib'),
)
# The neural network is stored in Keras format and needs load_model instead.
model_nn = load_model('./models/shallowNN_1')

# ensemble() feeds the first group the ordinal-encoded matrix and the second
# group the scaled one-hot matrix.
models_ord = [model_rf, model_xgb, model_rf2, model_xgb2, model_lgbm]
models_oh = [model_logit, model_nn]

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Ensemble models

In [4]:
from functools import reduce

def ensemble(X_ord, X_oh, models_ord, models_oh, treshold = None):
    """Combine the 0/1 votes of several models into one 0/1 prediction.

    Parameters
    ----------
    X_ord
        Feature matrix passed to ``predict`` of every model in ``models_ord``.
    X_oh
        Feature matrix passed to ``predict`` of every model in ``models_oh``.
    models_ord : sequence
        Models whose ``predict`` output is used as the vote as-is.
    models_oh : sequence
        Models whose ``predict`` output is flattened and turned into a vote
        with a ``> 0.5`` cut-off.
    treshold : int, optional
        Minimum number of votes for class 1 needed to predict 1. Defaults to
        a strict majority of all models. (The parameter name keeps its
        original spelling so existing keyword calls continue to work.)

    Returns
    -------
    pandas.Series
        One 0/1 value per row: 1 where at least ``treshold`` models voted 1.
    """
    if treshold is None:
        # Strict majority: more than half of all models must vote 1.
        treshold = (len(models_ord) + len(models_oh)) // 2 + 1

    predictions = [pd.Series(model.predict(X_ord)) for model in models_ord]
    for model in models_oh:
        raw = pd.Series(model.predict(X_oh).reshape(-1))
        # Vectorized equivalent of mapping each value to 1 if > .5 else 0.
        predictions.append((raw > .5).astype('int64'))

    # Every Series carries a fresh RangeIndex, so the sum is row-by-row.
    y_sum = reduce(lambda x, y: x + y, predictions)
    return (y_sum >= treshold).astype('int64')

# Testing

In [6]:
import os

# Folder holding the yearly `accepted_<year>_ml.csv` extracts. It defaults to
# the location this notebook was originally run from; set the
# LENDINGCLUB_DATA_DIR environment variable to point it somewhere else.
DATA_DIR = os.environ.get(
    'LENDINGCLUB_DATA_DIR',
    '/Users/ivanpassoni/Google Drive/LendingClubData/ml datasets',
)


def read_test_year(year):
    """Read ``accepted_<year>_ml.csv`` from DATA_DIR into a DataFrame."""
    csv_path = os.path.join(DATA_DIR, f'accepted_{year}_ml.csv')
    return pd.read_csv(csv_path, low_memory = False)


test2015 = read_test_year(2015)
test2016 = read_test_year(2016)
test2017 = read_test_year(2017)
test2018 = read_test_year(2018)

In [8]:
# Encode each test year: ordinal features, scaled one-hot features, 0/1 target.
X_2015_ord, X_2015_oh, y_2015 = preprocessing(test2015)
X_2016_ord, X_2016_oh, y_2016 = preprocessing(test2016)
X_2017_ord, X_2017_oh, y_2017 = preprocessing(test2017)
X_2018_ord, X_2018_oh, y_2018 = preprocessing(test2018)

# Votes required to predict class 1. With 5 models in models_ord and 2 in
# models_oh, a value of 7 means every model has to agree; revisit this number
# whenever a model is added to or removed from those lists.
VOTE_THRESHOLD = 7

y_2015p = ensemble(X_2015_ord, X_2015_oh, models_ord, models_oh, VOTE_THRESHOLD)
y_2016p = ensemble(X_2016_ord, X_2016_oh, models_ord, models_oh, VOTE_THRESHOLD)
y_2017p = ensemble(X_2017_ord, X_2017_oh, models_ord, models_oh, VOTE_THRESHOLD)
y_2018p = ensemble(X_2018_ord, X_2018_oh, models_ord, models_oh, VOTE_THRESHOLD)

In [14]:
# (banner, true labels, ensemble prediction) for every test year, in print order.
yearly_results = [
    ('   -------------2015-------------', y_2015, y_2015p),
    ('   -------------2016-------------', y_2016, y_2016p),
    ('   -------------2017-------------', y_2017, y_2017p),
    ('   -------------2018-------------', y_2018, y_2018p),
]

for banner, y_actual, y_voted in yearly_results:
    print (banner)
    score(y_actual, y_voted)
    # Base: predict class 1 for every row, as a reference point
    print('Base case:')
    score(y_actual, pd.Series(np.ones(len(y_actual))))

   -------------2015-------------
 
    F1 score              | 50.56        

    -----------------------------
    Precision score       | 76.47 

    -----------------------------
    Accuracy              | 54.41  

    -----------------------------
    Confusion matrix | 28458 6565
                       35157 21331
    
Base case:
 
    F1 score              | 76.34        

    -----------------------------
    Precision score       | 61.73 

    -----------------------------
    Accuracy              | 61.73  

    -----------------------------
    Confusion matrix | 0 35023
                       0 56488
    
   -------------2016-------------
 
    F1 score              | 51.61        

    -----------------------------
    Precision score       | 68.61 

    -----------------------------
    Accuracy              | 55.54  

    -----------------------------
    Confusion matrix | 22524 7673
                       23779 16771
    
Base case:
 
    F1 score              | 72.87

Feature importance for 2017 (top nine features per model; column names are taken from the 2017 ordinal-encoded frame)

In [None]:
# Nine largest feature importances of model_rf, paired with column names.
sorted(
    zip(X_2017_ord.columns, model_rf.feature_importances_),
    key = lambda pair: pair[1],
    reverse = True,
)[:9]

In [None]:
# Nine largest feature importances of model_rf2, paired with column names.
sorted(
    zip(X_2017_ord.columns, model_rf2.feature_importances_),
    key = lambda pair: pair[1],
    reverse = True,
)[:9]

In [None]:
# Nine largest feature importances of model_xgb, paired with column names.
sorted(
    zip(X_2017_ord.columns, model_xgb.feature_importances_),
    key = lambda pair: pair[1],
    reverse = True,
)[:9]

In [None]:
# Nine largest feature importances of model_xgb2, paired with column names.
sorted(
    zip(X_2017_ord.columns, model_xgb2.feature_importances_),
    key = lambda pair: pair[1],
    reverse = True,
)[:9]

In [50]:
# Nine largest feature importances of model_lgbm, paired with column names.
sorted(
    zip(X_2017_ord.columns, model_lgbm.feature_importances_),
    key = lambda pair: pair[1],
    reverse = True,
)[:9]

[('dti', 0.03867791068096083),
 ('installment', 0.03776187875117656),
 ('annual_inc', 0.036839043741227775),
 ('term', 0.03467896746104897),
 ('loan_amnt', 0.034568461077529776),
 ('funded_amnt', 0.03379422303286396),
 ('avg_cur_bal', 0.03080102477877579),
 ('int_rate', 0.030793023067356276),
 ('tot_hi_cred_lim', 0.02896460619232284)]