In [50]:
import pandas as pd
import numpy as np
import seaborn as sb
import math
import scikitplot as skplt
import datetime
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score,auc,roc_curve,roc_auc_score, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from datetime import timedelta


In [52]:
default_rate_by_industry = {92: 0.33, 53: 0.33, 52: 0.34, 61: 0.38, 49: 0.38, 48: 0.39, 51: 0.4, 
                            56: 0.4, 23: 0.42, 44: 0.43, 45: 0.43, 72: 0.44, 71: 0.45, 81: 0.46, 
                            31: 0.47, 42: 0.47, 54: 0.48, 22: 0.52, 32: 0.53, 33: 0.57, 62: 0.65, 
                            21: 0.68, 11: 0.7, 0: 0.7, 55: 0.76}

default_rate_by_state = {'FL': 0.36, 'TN': 0.41, 'GA': 0.41, 'SC': 0.42, 'AZ': 0.42, 'MI': 0.42, 
                         'NV': 0.43, 'IL': 0.43, 'KY': 0.45, 'MD': 0.45, 'NY': 0.46, 'DC': 0.46, 
                         'NJ': 0.47, 'UT': 0.48, 'CO': 0.48, 'TX': 0.48, 'CA': 0.49, 'VA': 0.49, 
                         'IN': 0.5, 'LA': 0.5, 'NC': 0.5, 'OH': 0.51, 'AR': 0.51, 'OK': 0.52, 
                         'DE': 0.52, 'HI': 0.53, 'MO': 0.54, 'MS': 0.54, 'AL': 0.55, 'OR': 0.55, 
                         'PA': 0.56, 'WA': 0.56, 'WV': 0.56, 'KS': 0.58, 'ID': 0.58, 'AK': 0.59, 
                         'MA': 0.59, 'CT': 0.59, 'WI': 0.6, 'MN': 0.61, 'NE': 0.62, 'RI': 0.63, 
                         'NM': 0.63, 'IA': 0.65, 'NH': 0.66, 'MT': 0.72, 'ME': 0.72, 'VT': 0.73, 
                         'SD': 0.75, 'WY': 0.76, 'ND': 0.77}

def get_state_code(state):
    #https://www.stateabbreviations.us/
    state_map = {'AK': 1, 'AL': 2, 'AR': 3, 'AZ': 4,'CA': 5, 'CO': 6, 'CT': 7, 'DC': 8,
                 'DE': 9, 'FL': 10, 'GA': 11, 'HI': 12, 'IA': 13, 'ID': 14, 'IL': 15, 'IN': 16, 
                 'KS': 17, 'KY': 18, 'LA': 19, 'MA': 20, 'MD': 21, 'ME': 22, 'MI': 23, 'MN': 24,
                 'MO': 25, 'MS': 26, 'MT': 27, 'NC': 28, 'ND': 29, 'NE': 30, 'NH': 31, 'NJ': 32,
                 'NM': 33, 'NV': 34, 'NY': 35, 'OH': 36, 'OK': 37, 'OR': 38, 'PA': 39, 'RI': 40,
                 'SC': 41, 'SD': 42, 'TN': 43, 'TX': 44, 'UT': 45, 'VA': 46, 'VT': 47, 'WA': 48,
                 'WI': 49, 'WV': 50, 'WY': 51}
    if state in state_map:
        return state_map[state]
    else:
        return 0


def predict(model, df):
    predict_submission = model.predict(df)
    submission = pd.DataFrame(predict_submission)
    submission.index.name = "Id"
    submission.to_csv("predict.csv", header=["ChargeOff"])
    

def train_and_predict(model, df_train, df_predict):
    # train
    y = df_train["ChargeOff"]
    x = df_train.drop(['ChargeOff'], axis= 1)
    model.fit(x, y)
    
    # test
    predict = model.predict(df_predict)
    submission = pd.DataFrame(predict)
    submission.index.name = "Id"
    submission.to_csv("predict.csv", header=["ChargeOff"])


def train(model, df):
    y = df["ChargeOff"]
    x = df.drop(['ChargeOff'], axis= 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    print(classification_report(y_test, predict, digits=3))
    print("AUC: {}".format(roc_auc_score(y_test, predict)))

    
def pipeline(model, df):
    y = df["ChargeOff"]
    x = df.drop(['ChargeOff'], axis= 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=2)
    pipe = Pipeline(steps=[('feature_selection', SelectKBest(chi2, k="all")), ('model', model)])
    pipe.fit(x_train, y_train)
    predict = pipe.predict(x_test)

    print(classification_report(y_test, predict, digits=3))
    
    for name, importance in sorted(zip(x.columns, model.feature_importances_)):
        print(name, "=", importance)


def preprocessing(df):
    df = df.drop(['Id', 'Name', 'City', 'Zip', 'BalanceGross'], axis= 1)
    
    # money columns
    currency_cols = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
    df[currency_cols] = df[currency_cols].replace('[$,]', '', regex=True).astype(float)

    # ApprovalFY column
    df['ApprovalFY'] = df['ApprovalFY'].replace('1976A', 1976)
    df['ApprovalFY'] = df['ApprovalFY'].astype(int)
    
    # NAICS
    df['NAICS'] = df['NAICS'].replace(default_rate_by_industry)
    df.rename(columns={"NAICS": "Def_Rate_Industry"}, inplace=True)
    
    # State, Bank and BankState
    df['Bank_Same_State'] = np.where(df['State'] == df['BankState'], 1, 0)
    df=pd.concat([df, pd.get_dummies(df['State'], prefix='State')], axis=1)
    df= df.drop(['State'], axis =1 )
    # df['State'] = df['State'].replace(default_rate_by_state)
    # df['State'] = np.where(df['State'].isnull(), 0.54, df['State'])
    # df.rename(columns={"State": "Def_Rate_State"}, inplace=True)
    
    banks = {v: k for k, v in enumerate(list(df['Bank'].unique()))}
    df['Bank'].replace(banks, inplace=True)
    bank_states = {v: k for k, v in enumerate(list(df['BankState'].unique()))}
    df['BankState'].replace(bank_states, inplace=True)
    
    # FranchiseCode
    df["FranchiseCode"] = df["FranchiseCode"].apply(lambda x: x != 0 and x != 1)
    df.rename(columns={"FranchiseCode": "Franchised"}, inplace=True)
    
    # LowDoc column
    df['LowDoc'] = np.where((df['LowDoc'] == "N") | (df['LowDoc'] == "Y"), df['LowDoc'], np.nan)
    df['LowDoc'] = df['LowDoc'].replace({'N': 0, 'Y': 1})
    df['LowDoc'] = np.where((df['LowDoc'].isnull()) & (df['DisbursementGross'] < 150000), 1, df['LowDoc'])
    df['LowDoc'] = np.where((df['LowDoc'].isnull()) & (df['DisbursementGross'] >= 150000), 0, df['LowDoc'])
    
    # New Exist column
    df['NewExist'] = np.where((df['NewExist'] == 2) | (df['NewExist'] == 1), df['NewExist'], np.nan)
    df['NewExist'] = df['NewExist'].replace({2: 1, 1: 0})
    
    
    # RevLineCr column
    df['RevLineCr'] = np.where((df['RevLineCr'] == "N") | (df['RevLineCr'] == "Y"), df['RevLineCr'], np.nan)
    df['RevLineCr'] = df['RevLineCr'].replace({'N': 0, 'Y': 1})
    
    df['RevLineCr'] = np.where(df['RevLineCr'].isnull(), df['RevLineCr'].mode(), df['RevLineCr'])
    df['NewExist'] = np.where(df['NewExist'].isnull(), df['NewExist'].mode(), df['NewExist'])
    


    # date columns
    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
    df['DisbursementDate'] = np.where(df['DisbursementDate'].isnull(), df['ApprovalDate'] + timedelta(days=94), df['DisbursementDate'])

    # two recession calculation
    df['DisbursementFY'] = df['DisbursementDate'].map(lambda x: x.year)
    df['GreatRecession'] = np.where(((2007 <= df['DisbursementFY']) & (df['DisbursementFY'] <= 2009)) | 
                                    ((df['DisbursementFY'] < 2007) & 
                                     (df['DisbursementFY'] + (df['Term']/12) >= 2007)), 1, 0)
    
    # SVA vs Gross
    df['SBA_vs_Gross'] = df['SBA_Appv']/df['GrAppv']
    
    # AppvDisbursed
    df['Appv_Disbur_Same'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)

    # RealEstate
    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)
    
    df["CreateJob"] = df["CreateJob"].apply(lambda x: x != 0)
    df["RetainedJob"] = df["RetainedJob"].apply(lambda x: x != 0)
        
    df = df.drop(['ApprovalDate', 'DisbursementDate', 'DisbursementFY'], axis= 1)
    
    return df

In [53]:
df_train = pd.read_csv('Xtrain.csv', dtype={"ApprovalFY": object})
df_y = pd.read_csv('Ytrain.csv')
df_train = pd.concat([df_train, df_y['ChargeOff']], axis=1, sort=False)
df_train = preprocessing(df_train)

In [54]:
df_new = df_train
# df_new = df_train[["RevLineCr", "Term","SBA_vs_Gross", "GrAppv", "Def_Rate_State", "DisbursementGross", "RetainedJob"
#                 ,"Def_Rate_Industry", "GreatRecession", "CreateJob", "ChargeOff"]]
model = XGBClassifier()
train(model, df_new)

              precision    recall  f1-score   support

           0      0.915     0.903     0.909      5101
           1      0.900     0.913     0.906      4899

    accuracy                          0.908     10000
   macro avg      0.908     0.908     0.908     10000
weighted avg      0.908     0.908     0.908     10000

AUC: 0.9076996977846841


In [43]:

def optimal_model(clf, params,x_train,y_train, x_test, y_test):
    
    search = GridSearchCV(estimator=clf,
                          param_grid=params,
                          scoring = 'f1',
                          n_jobs = -1,
                          cv = 3,
                          verbose=True)

    
    search.fit(x_train, y_train)
    
    best = search.best_estimator_
    best_model = best.fit(x_train, y_train)
    print('Best parameters: \n',search.best_params_)
    print('='*70)
    
    y_test_ypred = best_model.predict(x_test)
    print('Classification Report: \n', classification_report(y_test, y_test_ypred, digits=3))
    print('='*100)
    return best_model

    
    

In [44]:
#For finding best parameters
# params ={"max_depth"        : [ 6, 7, 8, 9],
#          "min_child_weight" : [ 1, 3, 5, 7 ],
#          "gamma"            : [ 0.0, 0.1, 0.2 ],
#          "colsample_bytree" : [ 0.3, 0.4, 0.5],
#          "n_estimators"     : [ 150, 200, 250, 300]}
# 
# 
# 
# xgb = XGBClassifier(base_score = 0.5, booster = 'gbtree', 
#                     colsample_bylevel = 1, colsample_bynode = 1, 
#                     colsample_bytree = 0.5, gamma = 0.0,
#                     learning_rate = 0.15, max_delta_step = 0, 
#                     max_depth = 6, min_child_weight = 1, 
#                     missing = None, n_estimators = 150, 
#                     n_jobs = 1, objective ='binary:logistic',
#                     reg_alpha = 0, reg_lambda = 1, scale_pos_weight = 1, 
#                     subsample = 1, verbosity = 1)
# 
# y = df_new["ChargeOff"]
# x = df_new.drop(['ChargeOff'], axis= 1)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=2)
# best_model = optimal_model(xgb, params, x_train, y_train, x_test, y_test)

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Best parameters: 
 {'colsample_bytree': 0.5, 'gamma': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 250}
Classification Report: 
               precision    recall  f1-score   support

           0      0.971     0.956     0.964      2574
           1      0.955     0.969     0.962      2426

    accuracy                          0.963      5000
   macro avg      0.963     0.963     0.963      5000
weighted avg      0.963     0.963     0.963      5000



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 57.8min finished


In [56]:
#Best parameters: 
 #{'colsample_bytree': 0.5, 'gamma': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 250}
xgb = XGBClassifier(base_score = 0.5, booster = 'gbtree', 
                    colsample_bylevel = 1, colsample_bynode = 1, 
                    colsample_bytree = 0.5, gamma = 0.1,
                    learning_rate = 0.15, max_delta_step = 0, 
                    max_depth = 7, min_child_weight = 5, 
                    missing = None, n_estimators = 250, 
                    n_jobs = 1, objective ='binary:logistic',
                    reg_alpha = 0, reg_lambda = 1, scale_pos_weight = 1, 
                    subsample = 1, verbosity = 1)

train(xgb, df_new)

              precision    recall  f1-score   support

           0      0.944     0.934     0.939      5101
           1      0.932     0.942     0.937      4899

    accuracy                          0.938     10000
   macro avg      0.938     0.938     0.938     10000
weighted avg      0.938     0.938     0.938     10000

AUC: 0.9379857957240872


In [45]:
# df_test = pd.read_csv('Xtest.csv', dtype={"ApprovalFY": object})
# df_test = preprocessing(df_test)

In [46]:
# predict = best_model.predict(df_test)
# submission = pd.DataFrame(predict)
# submission.index.name = "Id"
# submission.to_csv("predict.csv", header=["ChargeOff"])