# Scoring function

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, accuracy_score

def score(y_true, y_pred):
    """Print a small classification report for binary predictions.

    The report shows the F1 score, precision and accuracy, each expressed
    as a percentage rounded to two decimals, followed by the four cells of
    the confusion matrix (first matrix row on the first line, second row
    below it).

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Metrics are fractions in [0, 1]; scale to percentages for display.
    f1_pct = round(f1_score(y_true, y_pred) * 100, 2)
    precision_pct = round(precision_score(y_true, y_pred) * 100, 2)
    accuracy_pct = round(accuracy_score(y_true, y_pred) * 100, 2)

    # The explicit "\n" escapes add a blank line after each metric row.
    report = f''' 
    F1 score              | {f1_pct}        \n
    -----------------------------
    Precision score       | {precision_pct} \n
    -----------------------------
    Accuracy              | {accuracy_pct}  \n
    -----------------------------
    Confusion matrix | {cm[0, 0]} {cm[0, 1]}
                       {cm[1, 0]} {cm[1, 1]}
    '''
    print(report)

# Preprocessing

In [None]:
import pandas as pd
from joblib import load

# Preprocessing artifacts used by `preprocessing` below, loaded from paths
# relative to the current working directory (presumably fitted at training
# time -- confirm against the training notebook).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
ord_encoder     = load('./models/encoders/ordinal_encoder.joblib')
one_hot_encoder = load('./models/encoders/one_hot_encoder.joblib')
std_sc          = load('./models/encoders/standard_scaler.joblib')

def preprocessing(df):
    """Filter a loans DataFrame and build the two encoded feature sets.

    Steps:
      1. Discard rows whose ``loan_status`` is ``'Current'``.
      2. Keep only rows whose ``sub_grade`` does not start with 'A', 'B'
         or 'C'.
      3. Build the target by mapping ``'Defaulted'`` to 0 and
         ``'FullyPaid'`` to 1.
      4. Drop the last column positionally (assumes ``loan_status`` is the
         last column -- TODO confirm against the dataset layout).
      5. Encode the features once with the ordinal encoder and once with
         the one-hot encoder; the one-hot output additionally loses its
         ``*_1`` reference columns and is passed through the scaler.

    Args:
        df: DataFrame containing at least ``loan_status`` and ``sub_grade``.

    Returns:
        Tuple ``(X_ord, X_oh, y)``: ordinal-encoded features, scaled one-hot
        features and the target Series (which keeps the filtered rows'
        original index).
    """
    def _not_top_grade(sub_grade):
        # Only the leading letter of the sub-grade is inspected.
        return sub_grade[0] not in ['A', 'B', 'C']

    current_idx = df.index[df['loan_status'] == 'Current']
    df = df.drop(index=current_idx)

    selected = df.loc[df['sub_grade'].apply(_not_top_grade), :]
    y = selected['loan_status'].replace({'Defaulted':0, 'FullyPaid':1})
    features = selected.iloc[:, :-1]

    X_ord = ord_encoder.transform(features)

    # The one-hot encoder's output must be a DataFrame, since columns are
    # dropped by name before scaling.
    reference_columns = [
        'term_1',
        'home_ownership_1',
        'purpose_1',
        'addr_state_1',
        'initial_list_status_1',
        'sub_grade_1',
        'emp_length_1',
    ]
    one_hot = one_hot_encoder.transform(features)
    one_hot = one_hot.drop(columns=reference_columns)
    X_oh = std_sc.transform(one_hot)

    return X_ord, X_oh, y

# Load models

In [None]:
from tensorflow.keras.models import load_model
from joblib import load

# Trained models, loaded from paths relative to the current working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
model_rf    = load('./models/RandomForest_1.joblib')
model_xgb   = load('./models/XGBoost_1.joblib')
model_rf2   = load('./models/RandomForest_2.joblib')
model_xgb2  = load('./models/XGBoost_2.joblib')
model_lgbm  = load('./models/LGBM_1.joblib')
model_logit = load('./models/logit.joblib')
# The neural network is restored with Keras' own loader rather than joblib.
model_nn    = load_model('./models/shallowNN_1')

# Models grouped by the feature matrix `ensemble` feeds them:
# models_ord predict on the ordinal-encoded features (X_ord) ...
models_ord = [model_rf, model_xgb, model_rf2, model_xgb2, model_lgbm] 
# ... and models_oh predict on the scaled one-hot features (X_oh); their
# outputs are cut at 0.5 inside `ensemble`.
models_oh  = [model_logit, model_nn]

# Ensemble models

In [None]:
from functools import reduce

def ensemble(X_ord, X_oh, models_ord, models_oh, treshold = None):
    """Combine several binary classifiers by vote counting.

    Every model casts one vote (0 or 1) per sample; a sample is labelled 1
    when the number of 1-votes reaches ``treshold``.

    Args:
        X_ord: feature matrix passed to every model in ``models_ord``.
        X_oh: feature matrix passed to every model in ``models_oh``.
        models_ord: models whose ``predict(X_ord)`` output is used as the
            vote as-is.
        models_oh: models whose ``predict(X_oh)`` output is flattened and
            turned into a vote by cutting at 0.5 (strictly greater -> 1).
        treshold: minimum number of 1-votes required to output 1. Defaults
            to a strict majority of all models. (The misspelled parameter
            name is kept for backward compatibility.)

    Returns:
        pandas Series of 0/1 integers, one entry per sample, with a default
        integer index.

    Raises:
        TypeError: if both model lists are empty (nothing to reduce).
    """
    # Identity check: `== None` can be overridden by array-like objects.
    if treshold is None:
        treshold = (len(models_ord) + len(models_oh)) // 2 + 1

    votes = [pd.Series(model.predict(X_ord)) for model in models_ord]
    # Vectorized cut at 0.5 instead of a per-element Python lambda.
    votes.extend(
        (pd.Series(model.predict(X_oh).reshape(-1)) > .5).astype('int64')
        for model in models_oh
    )

    vote_count = reduce(lambda x, y: x + y, votes)
    return (vote_count >= treshold).astype('int64')

# Testing

In [None]:
# Evaluate the ensemble on the 2017 dataset.
# NOTE(review): the CSV path is absolute and machine-specific; adjust it
# before running elsewhere.
test2017 =  pd.read_csv('/Users/ivanpassoni/Google Drive/LendingClubData/ml datasets/accepted_2017_ml.csv',
                        low_memory = False)

X_2017_ord, X_2017_oh, y = preprocessing(test2017)

# A threshold of 7 equals the total number of models (5 in models_ord plus
# 2 in models_oh), so a sample is predicted 1 only when every model votes 1.
y_pred = ensemble(X_2017_ord, X_2017_oh, models_ord, models_oh, 7)

score(y, y_pred)

# Base case: predict 1 for every sample (1 is the code `preprocessing`
# assigns to 'FullyPaid') to get a reference point for the scores above.
import numpy as np
print('Base case:')
score(y, pd.Series(np.ones(len(y))))