In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)

# Read
We read the cleaned dataset of loans accepted strictly before 2015 and train our models on it. We focus only on 'high'-risk loans, that is, loans with grade 'D' or below.

In [None]:
# Training data: the cleaned accepted-loans file.
# NOTE(review): the path is specific to the author's machine.
accepted_csv = '/Users/ivanpassoni/Google Drive/LendingClubData/ml datasets/accepted_b_2015_ml.csv'
accepted = pd.read_csv(accepted_csv, low_memory=False)

In [None]:
# Loans that are still 'Current' have no final outcome yet, so they are removed.
current_index = accepted.index[accepted['loan_status'] == 'Current']
accepted = accepted.drop(index=current_index)

# Preprocessing for training

In [None]:
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

In [None]:
# Selecting high risk loans (loans whose sub-grade letter is not in ['A', 'B', 'C'])
LOW_RISK_GRADES = ['A', 'B', 'C']
STATUS_TO_LABEL = {'Defaulted': 0, 'FullyPaid': 1}

# The first character of the sub-grade is the grade letter; vectorised instead of row-wise apply.
is_high_risk = ~accepted['sub_grade'].str[0].isin(LOW_RISK_GRADES)
high_risk = accepted.loc[is_high_risk, :]

# Target: 0 = Defaulted, 1 = FullyPaid. `map` avoids the deprecated silent downcasting of
# `replace`; any status outside the mapping is reported instead of ending up in the labels.
y = high_risk['loan_status'].map(STATUS_TO_LABEL)
unknown_status = high_risk.loc[y.isna(), 'loan_status'].unique()
if len(unknown_status):
    raise ValueError(f'Unexpected loan_status values: {list(unknown_status)}')
y = y.astype('int64')

# Features: remove the target by name rather than by position, so the label can never leak
# into X if 'loan_status' is not the last column of the file.
X = high_risk.drop(columns='loan_status')

random_state = 42
test_size    = .2
# Stratify so both splits keep the same Defaulted/FullyPaid proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                    random_state=random_state,
                                                    stratify=y)

In [None]:
# Encoding the dataset
## Ordinal Encoder
cat_columns = ['term', 'home_ownership', 'purpose', 'addr_state', 'initial_list_status']
ordinal_columns = ['sub_grade', 'emp_length']


def _emp_length_to_years(label):
    """Convert an ``emp_length`` label to a whole number of years.

    A label starting with '<' (less than one year) maps to 0; otherwise the leading
    number is used, ignoring a trailing '+' (e.g. '10+ years' -> 10).
    NOTE(review): the label format ('< 1 year', '3 years', '10+ years') is inferred from
    the ordering the previous hard-coded code list relied on -- confirm against the data.
    Raises ValueError for a label that does not start with a number.
    """
    if label.startswith('<'):
        return 0
    return int(label.split()[0].rstrip('+'))


# Sub-grades sort alphabetically in risk order, so their rank is their code.
sub_grades = sorted(X['sub_grade'].unique())
sub_grade_map = {grade: rank for rank, grade in enumerate(sub_grades)}

# Derive each code from the label itself instead of zipping against a fixed list, which
# silently misaligned every code as soon as a label was missing or an extra one appeared.
emp_lengths = sorted(X['emp_length'].unique())
emp_length_map = {label: _emp_length_to_years(label) for label in emp_lengths}


mapping=[{'col':'sub_grade' , 'mapping':sub_grade_map},
         {'col':'emp_length', 'mapping':emp_length_map}]

# Nominal columns have no natural order: code them by alphabetical rank.
for col in cat_columns:
    col_map = {e:i for i, e in enumerate(sorted(X[col].unique()))}
    mapping.append({'col':col, 'mapping': col_map})

ord_encoder = ce.OrdinalEncoder(cols= cat_columns + ordinal_columns,
                                mapping=mapping)

# The encoder is fit on the training split only.
ord_encoder.fit(X_train, y_train)

In [None]:
## One Hot Encoder
# The same column groups as the ordinal encoder; all of them are one-hot encoded here.
cat_columns = ['term', 'home_ownership', 'purpose', 'addr_state', 'initial_list_status']
ordinal_columns = ['sub_grade', 'emp_length']

one_hot_columns = cat_columns + ordinal_columns
one_hot_encoder = ce.OneHotEncoder(cols=one_hot_columns)

# Fit on the training split only.
one_hot_encoder.fit(X_train, y_train)

In [None]:
# Apply both fitted encoders to the train and test splits (train first, then test).
X_ordtrain, X_ordtest = (ord_encoder.transform(split) for split in (X_train, X_test))

X_ohtrain, X_ohtest = (one_hot_encoder.transform(split) for split in (X_train, X_test))

In [None]:
# Dropping the first indicator column of every one-hot encoded feature
drop_first = ['term_1', 'home_ownership_1', 'purpose_1','addr_state_1',
              'initial_list_status_1','sub_grade_1','emp_length_1']
X_ohtrain, X_ohtest = (frame.drop(columns=drop_first) for frame in (X_ohtrain, X_ohtest))

In [None]:
# Rescaling the columns of the one-hot encoding; the scaler is fit on the training split only
std_sc = StandardScaler().fit(X_ohtrain, y_train)
X_ohtrain, X_ohtest = std_sc.transform(X_ohtrain), std_sc.transform(X_ohtest)

In [None]:
# Saving the fitted encoders and the scaler for later preprocessing (uncomment the dump calls to write them to disk)
from joblib import dump

# dump(ord_encoder,     './models/encoders/ordinal_encoder.joblib')
# dump(one_hot_encoder, './models/encoders/one_hot_encoder.joblib')
# dump(std_sc,          './models/encoders/standard_scaler.joblib')

### Creating a preprocessing function

In [None]:
from joblib import load

# Reload the fitted ordinal encoder, one-hot encoder and standard scaler from disk.
# NOTE(review): the only code that writes these files is the commented-out `dump` calls in
# the previous cell, so the files must already exist for this cell to run on a fresh kernel.
ord_encoder     = load('./models/encoders/ordinal_encoder.joblib')
one_hot_encoder = load('./models/encoders/one_hot_encoder.joblib')
std_sc          = load('./models/encoders/standard_scaler.joblib')

def preprocessing(df):
    """Turn a raw loans frame into model-ready inputs, mirroring the training pipeline.

    Steps: drop loans whose status is 'Current', keep only high-risk loans (sub-grade
    letter not in A, B, C), build the binary target, then apply the module-level
    ``ord_encoder``, ``one_hot_encoder`` and ``std_sc``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'loan_status' and 'sub_grade' columns plus the feature columns
        the encoders were fitted on. The frame itself is not modified.

    Returns
    -------
    X_ord
        Output of ``ord_encoder.transform`` on the features.
    X_oh
        One-hot encoded features with the first indicator of each encoded column
        dropped, then transformed by ``std_sc``.
    y : pandas.Series
        Target, 0 for 'Defaulted' and 1 for 'FullyPaid'.

    Raises
    ------
    ValueError
        If a remaining loan has a status other than 'Defaulted' or 'FullyPaid'.
    """
    status_to_label = {'Defaulted': 0, 'FullyPaid': 1}

    df = df.drop(index=df.index[df['loan_status'] == 'Current'])

    # First character of the sub-grade is the grade letter (vectorised check).
    high_risk = df.loc[~df['sub_grade'].str[0].isin(['A', 'B', 'C']), :]

    # `map` avoids the deprecated downcasting of `replace`; unmapped statuses are
    # reported instead of silently ending up in the labels.
    y = high_risk['loan_status'].map(status_to_label)
    unknown_status = high_risk.loc[y.isna(), 'loan_status'].unique()
    if len(unknown_status):
        raise ValueError(f'Unexpected loan_status values: {list(unknown_status)}')
    y = y.astype('int64')

    # Remove the target by name, not by position, so it cannot leak into the features
    # when 'loan_status' is not the last column.
    X = high_risk.drop(columns='loan_status')

    X_ord  = ord_encoder.transform(X)

    X_oh   = one_hot_encoder.transform(X)

    # Same reference-level columns that were dropped before fitting the scaler.
    drop_first = ['term_1', 'home_ownership_1', 'purpose_1','addr_state_1',
                  'initial_list_status_1','sub_grade_1','emp_length_1']
    X_oh  = X_oh.drop(columns = drop_first)

    X_oh   = std_sc.transform(X_oh)

    return X_ord, X_oh, y

# Scoring Function

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, accuracy_score

def score(y_true, y_pred):
    """Print a small report comparing predicted labels with the true labels.

    The report shows the F1 score, precision and accuracy (each as a percentage
    rounded to two decimals) followed by the four cells of the confusion matrix.
    Nothing is returned.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    """
    cm = confusion_matrix(y_true, y_pred)
    f1_pct        = round(f1_score(y_true, y_pred)*100, 2)
    precision_pct = round(precision_score(y_true, y_pred)*100, 2)
    accuracy_pct  = round(accuracy_score(y_true, y_pred)*100, 2)
    report = f''' 
    F1 score              | {f1_pct}        \n
    -----------------------------
    Precision score       | {precision_pct} \n
    -----------------------------
    Accuracy              | {accuracy_pct}  \n
    -----------------------------
    Confusion matrix | {cm[0,0]} {cm[0,1]}
                       {cm[1,0]} {cm[1,1]}
    '''
    print(report)

## Building neural network

In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Shallow network: one 12-unit tanh hidden layer (L2-regularised) and a sigmoid output unit.
model_1 = keras.Sequential()
# NOTE(review): my_init is never passed to any layer below; it is kept only so the name
# stays defined. Remove it or pass it as `kernel_initializer` deliberately.
my_init = keras.initializers.RandomNormal(mean=0.0, stddev=4, seed=None)
# The input width follows the training matrix instead of a hard-coded 146, so the model
# keeps matching the data if the number of one-hot columns changes.
n_features = X_ohtrain.shape[1]
model_1.add(layers.Input(shape=(n_features,)))

model_1.add(layers.Dense(12, activation = 'tanh', name='dense_1', kernel_regularizer=keras.regularizers.l2(0.001)))

model_1.add(layers.Dense(1, activation='sigmoid', name='predictions'))

In [None]:
# Plain SGD on binary cross-entropy; accuracy and precision are tracked during training.
sgd = keras.optimizers.SGD(learning_rate=1e-3)
tracked_metrics = ['accuracy', keras.metrics.Precision()]
model_1.compile(optimizer=sgd, loss='binary_crossentropy', metrics=tracked_metrics)

In [None]:
# Show the layers and parameter counts of the network built above.
model_1.summary()

In [None]:
# Ratio of class 1 to class 0 in the training labels (used as the weight of class 0 below).
class_counts = y_train.value_counts()
class_counts[1] / class_counts[0]

In [None]:
# Up-weight class 0 by the class-1 / class-0 ratio of the training labels. The ratio is
# computed here (it was previously hard-coded as 2.46) so it stays correct if the data changes.
class_counts = y_train.value_counts()
weights = {0: float(class_counts[1] / class_counts[0]), 1: 1}

# The test split is used only as validation data to monitor the metrics per epoch.
model_1.fit(np.array(X_ohtrain), np.array(y_train), epochs=500, batch_size=200,  class_weight=weights,
            validation_data=(np.array(X_ohtest), np.array(y_test)))

In [None]:
# model_1.save('./models/shallowNN_1')

# Model training

In [None]:
## Hyper-parameters for every model below. All of them were obtained from many grid
## searches; the grid-search code is not part of this notebook.

# Random forest, first configuration
rf_params = dict(
    max_depth=15,
    max_features=5,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=3,
    min_samples_split=6,
    n_estimators=2000,
)

# XGBoost, first configuration
xgb_params = dict(
    alpha=0.01,
    gamma=0,
    max_depth=11,
    min_child_weight=8,
    n_estimators=2200,
)

# Random forest, second configuration
rf_params2 = dict(
    max_depth=8,
    max_features=11,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=3,
    min_samples_split=6,
    n_estimators=2500,
)

# XGBoost, second configuration
xgb_params2 = dict(
    colsample_bylevel=0.7,
    max_depth=7,
    n_estimators=2500,
    reg_lambda=1.0,
    scale_pos_weight=0.6228464256117242,
    subsample=0.8,
)

# LightGBM
lgbm_params = dict(
    max_depth=8,
    min_child_samples=6,
    n_estimators=2500,
    reg_lambda=0.67,
)

# Logistic regression
logit_params = dict(
    C=2.891111111111111,
    penalty='l1',
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import  XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# First model family: random forest, XGBoost and L1-penalised logistic regression.
model_rf  = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                   criterion='gini', oob_score=False, **rf_params)

# XGBoost handles imbalance through scale_pos_weight = (#class 0) / (#class 1).
class_counts = y_train.value_counts()
neg_pos_ratio = class_counts[0]/class_counts[1]

# `learning_rate` was previously misspelled `lerning_rate`, so the intended 0.1 was never
# applied and the library default was used instead.
model_xgb = XGBClassifier(booster = "gbtree", objective = "binary:logistic",
                               learning_rate = 0.1, n_jobs = -1, scale_pos_weight = neg_pos_ratio,
                               **xgb_params)

# max_iter must be an integer (1e6 is a float). The 'l1' penalty in logit_params is not
# supported by the default 'lbfgs' solver, so 'liblinear' (which supports l1) is set
# explicitly. NOTE(review): warm_start has no effect with liblinear; kept as in the original.
model_logit = LogisticRegression(max_iter=1_000_000, warm_start=True, class_weight = 'balanced',
                                 solver='liblinear', **logit_params)

In [None]:
# Second model family: random forest, XGBoost (class weighting comes from
# xgb_params2['scale_pos_weight']) and LightGBM.
model_rf2  = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                   criterion='gini', oob_score=False, **rf_params2)


# `learning_rate` was previously misspelled `lerning_rate`, so the intended 0.1 was never
# applied and the library default was used instead.
model_xgb2 = XGBClassifier(booster = "gbtree", objective = "binary:logistic",
                               learning_rate = 0.1, n_jobs = -1,
                               **xgb_params2)

model_lgbm = LGBMClassifier(learning_rate = 1e-2, class_weight = 'balanced',
                            importance_type = 'gain', **lgbm_params)

In [None]:
# Takes a long time
# The tree-based models are trained on the ordinal-encoded training features.
model_rf.fit(X_ordtrain, y_train)
model_xgb.fit(X_ordtrain, y_train)

In [None]:
# Second family of tree-based models, also trained on the ordinal-encoded features.
for tree_model in (model_rf2, model_xgb2):
    tree_model.fit(X_ordtrain, y_train)
model_lgbm.fit(X_ordtrain, y_train)

In [None]:
# Unlike the tree-based models, the logistic regression is trained on the scaled
# one-hot encoded features.
model_logit.fit(X_ohtrain, y_train)

In [None]:
from joblib import dump

In [None]:
# dump(model_rf, './models/RandomForest_1.joblib')
# dump(model_xgb, './models/XGBoost_1.joblib')
# dump(model_rf2, './models/RandomForest_2.joblib')
# dump(model_xgb2, './models/XGBoost_2.joblib')
# dump(model_lgbm, './models/LGBM_1.joblib')
# dump(model_logit, './models/logit.joblib')

# Ensemble

In [None]:
from tensorflow.keras.models import load_model
from joblib import load

In [None]:
# Load the previously saved models from ./models.
# NOTE(review): the `dump` / `model_1.save` calls that would write these files are
# commented out earlier in the notebook, so the files must already exist on disk.
m_rf    = load('./models/RandomForest_1.joblib')
m_xgb   = load('./models/XGBoost_1.joblib')
m_rf2   = load('./models/RandomForest_2.joblib')
m_xgb2  = load('./models/XGBoost_2.joblib')
m_lgbm  = load('./models/LGBM_1.joblib')
m_logit = load('./models/logit.joblib')
# The neural network is restored with Keras' load_model rather than joblib.
m_nn    = load_model('./models/shallowNN_1')

In [None]:
# Models grouped by the feature encoding they receive in ensemble():
# models_ord are given the ordinal-encoded features, models_oh the scaled one-hot features.
models_ord = [m_rf, m_xgb, m_rf2, m_xgb2, m_lgbm] 
models_oh  = [m_logit, m_nn]

In [None]:
from functools import reduce

def ensemble(X_ord, X_oh, models_ord, models_oh, treshold = None):
    """Combine the binary predictions of several models by counting votes.

    Parameters
    ----------
    X_ord
        Features passed to ``predict`` of every model in ``models_ord``.
    X_oh
        Features passed to ``predict`` of every model in ``models_oh``.
    models_ord : sequence
        Models whose ``predict`` output is used directly as a 0/1 vote.
    models_oh : sequence
        Models whose ``predict`` output is flattened and counted as a vote of 1
        when it is greater than 0.5 (so probabilities and hard labels both work).
    treshold : int, optional
        Minimum number of positive votes required to output 1. Defaults to a
        strict majority of all models.

    Returns
    -------
    pandas.Series
        0/1 decisions, one per sample, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no model is given, or if the models return different numbers of predictions.
    """
    n_models = len(models_ord) + len(models_oh)
    if n_models == 0:
        raise ValueError('ensemble() needs at least one model')
    if treshold is None:
        treshold = n_models // 2 + 1

    votes = [np.asarray(model.predict(X_ord)).reshape(-1) for model in models_ord]
    votes += [(np.asarray(model.predict(X_oh)).reshape(-1) > .5).astype('int64')
              for model in models_oh]

    # Mismatched lengths used to be aligned silently (missing votes became NaN);
    # fail loudly instead.
    lengths = [len(vote) for vote in votes]
    if len(set(lengths)) != 1:
        raise ValueError(f'Models returned different numbers of predictions: {lengths}')

    positive_votes = np.sum(votes, axis=0)
    return pd.Series((positive_votes >= treshold).astype('int64'))

In [None]:
# Ensemble prediction on the held-out test split. With 5 + 2 = 7 models, a threshold
# of 7 means every model must vote 1 for the ensemble to output 1.
y_pred = ensemble(X_ordtest, X_ohtest, models_ord, models_oh, 7)

In [None]:
# Report the ensemble's metrics on the held-out test split.
score(y_test, y_pred)

# Testing on other datasets

In [None]:
# 2015 loans, used purely as an out-of-time evaluation set.
# NOTE(review): the path is specific to the author's machine.
test2015_csv = '/Users/ivanpassoni/Google Drive/LendingClubData/ml datasets/accepted_2015_ml.csv'
test2015 = pd.read_csv(test2015_csv, low_memory=False)

In [None]:
# Run the 2015 data through the same preprocessing as the training data.
# NOTE: this rebinds the global `y`, which until now held the training-set labels.
X_2015_ord, X_2015_oh, y = preprocessing(test2015)

In [None]:
# Ensemble prediction on the 2015 data; threshold 7 requires all 7 models to vote 1.
# NOTE: this rebinds `y_pred`, replacing the test-split predictions.
y_pred = ensemble(X_2015_ord, X_2015_oh, models_ord, models_oh, 7)

In [None]:
# Report the ensemble's metrics on the 2015 data (`y` and `y_pred` come from the two cells above).
score(y, y_pred)

In [None]:
# Base: the trivial classifier that predicts 1 for every loan, for comparison
baseline_2015 = pd.Series(np.ones(len(y)))
score(y, baseline_2015)

In [None]:
# 2016 loans: load, preprocess, predict with the ensemble and score.
# NOTE(review): the path is specific to the author's machine.
test2016_csv = '/Users/ivanpassoni/Google Drive/LendingClubData/ml datasets/accepted_2016_ml.csv'
test2016 = pd.read_csv(test2016_csv, low_memory=False)

X_2016_ord, X_2016_oh, y_2016 = preprocessing(test2016)

# Threshold 7: all 7 models must vote 1.
y_pred_2016 = ensemble(X_2016_ord, X_2016_oh, models_ord, models_oh, 7)

score(y_2016, y_pred_2016)

# Base: the trivial classifier that predicts 1 for every loan
baseline_2016 = pd.Series(np.ones(len(y_2016)))
score(y_2016, baseline_2016)