In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import math
import scikitplot as skplt
import datetime
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix, f1_score,auc,roc_curve,roc_auc_score, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def categorize_year(year):
    if year <= 1988:
        return 1
    elif year >= 1989 and year <= 2004:
        return 2
    elif year >= 2005 and year <= 2009:
        return 3
    else:
        return 4


def categorize_job(job):
    if job <= 0:
        return 0
    elif job >= 1 and job <= 5:
        return 1
    else:
        return 0


def state_def_rate(i):
    def_state = {'AK': 0.1,'AL': 0.2, 'AR': 0.2, 'AZ': 0.2,'CA': 0.2, 'CO': 0.2, 'CT': 0.1, 'DC': 0.2,
                 'DE': 0.2, 'FL': 0.3, 'GA': 0.2, 'HI': 0.2, 'IA': 0.1, 'ID': 0.1, 'IL': 0.2, 'IN': 0.2, 
                 'KS': 0.1, 'KY': 0.2, 'LA': 0.2, 'MA': 0.1, 'MD': 0.2, 'ME': 0.1, 'MI': 0.2, 'MN': 0.1,
                 'MO': 0.2, 'MS': 0.2, 'MT': 0.1, 'NC': 0.2, 'ND': 0.1, 'NE': 0.1, 'NH': 0.1, 'NJ': 0.2,
                 'NM': 0.1, 'NV': 0.2, 'NY': 0.2, 'OH': 0.2, 'OK': 0.2, 'OR': 0.2, 'PA': 0.1, 'RI': 0.1,
                 'SC': 0.2, 'SD': 0.1, 'TN': 0.2, 'TX': 0.2, 'UT': 0.2, 'VA': 0.2, 'VT': 0.1, 'WA': 0.1,
                 'WI': 0.1, 'WV': 0.2, 'WY': 0.1}
    if i in def_state:
        return def_state[i]

    
def def_rate(i):
    
    sector_default = {21:0.08, 11:0.09, 55:0.10, 
                      62: 0.10, 22:0.14, 
                      92:0.15,54:0.19, 
                      42:0.19,31:0.19,
                      32:0.16,33:0.14,
                      81:0.20,71:0.21,
                      72:0.22,44:0.22,
                      45:0.23,23:0.23,
                      56:0.24,61:0.24,
                      51:0.25,48:0.27,
                      49:0.23,52:0.28,53:0.29}
    
    if i in sector_default:
        return sector_default[i]


def pre_processing(df, train=True):
    df = df.drop(["Id", "Name", "City", "Zip", "ApprovalDate", "BalanceGross"], axis=1)
    if train:
        df = df.dropna()

    # convert to numeric
    df['State_rate'] = df.State.apply(state_def_rate)
    states = {v: k for k, v in enumerate(list(df['State'].unique()))}
    df['State'].replace(states, inplace=True)
    banks = {v: k for k, v in enumerate(list(df['Bank'].unique()))}
    df['Bank'].replace(banks, inplace=True)
    bank_states = {v: k for k, v in enumerate(list(df['BankState'].unique()))}
    df['BankState'].replace(bank_states, inplace=True)
    df["NAICS"] = df["NAICS"].apply(lambda x: x // 10000)
    df['Sector_Rate'] = df.NAICS.apply(def_rate)

    # cleaning
    df['ApprovalFY'] = df['ApprovalFY'].replace('1976A', 1976)
    df['ApprovalFY'] = df['ApprovalFY'].astype(int)

    money_cols = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
    df[money_cols] = df[money_cols].replace('[\$,]', '', regex=True).astype(float) 

    df['LowDoc'] = np.where((df['LowDoc'] == "N") | (df['LowDoc'] == "Y"), df['LowDoc'], np.nan)
    df['LowDoc'] = df['LowDoc'].replace({'N': 0, 'Y': 1})
    df['LowDoc'] = np.where((df['LowDoc'].isnull()) & (df['DisbursementGross'] < 150000), 1, df['LowDoc'])
    df['LowDoc'] = np.where((df['LowDoc'].isnull()) & (df['DisbursementGross'] >= 150000), 0, df['LowDoc'])

    df['RevLineCr'] = np.where((df['RevLineCr'] == "N") | (df['RevLineCr'] == "Y"), df['RevLineCr'], np.nan)
    df['RevLineCr'] = df['RevLineCr'].replace({'N': 0, 'Y': 1})

    df['NoEmp'] = np.where((df['NoEmp'] == 0), np.nan, df['NoEmp'])
    df['DisbursementGross'] = np.where((df['DisbursementGross'] == 0), np.nan, df['DisbursementGross'])
    
    if train:
        df = df.dropna()
        df['DisbursementGross'] = np.log(df['DisbursementGross'])
        outliers1_drop = df[(df['DisbursementGross'] > 14.9)].index
        df.drop(outliers1_drop, inplace=True)
    
    # transformation, creation
    df["SBA_vs_Gross"] = df["SBA_Appv"] / df["GrAppv"]
    df['RealEstate'] = df['Term'].apply(lambda x: 1 if x >= 240 else 0)

    # convert to categorized data
    df["ApprovalFY_Cat"] = df["ApprovalFY"].apply(categorize_year)
    df["CreateJob_Cat"] = df["CreateJob"].apply(categorize_job)
    df["RetainedJob_Cat"] = df["RetainedJob"].apply(lambda x: x != 0)
    df["FranchiseCode_Cat"] = df["FranchiseCode"].apply(lambda x: x != 0)
    


    if train:
        return df[['Bank', 'BankState', 'Term', 'NewExist', 'RevLineCr', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 
                   'ChargeOff', 'State_rate', 'Sector_Rate', 'ApprovalFY_Cat', 'CreateJob_Cat', 'RetainedJob_Cat', 
                   'FranchiseCode_Cat', 'SBA_vs_Gross']]
    else:
        return df[['Bank', 'BankState', 'Term', 'NewExist', 'RevLineCr', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 
                   'State_rate', 'Sector_Rate', 'ApprovalFY_Cat', 'CreateJob_Cat', 'RetainedJob_Cat', 
                   'FranchiseCode_Cat', 'SBA_vs_Gross']]

def predict(model, df):
    predict_submission = model.predict(df)
    submission = pd.DataFrame(predict_submission)
    submission.index.name = "Id"
    submission.to_csv("predict.csv", header=["ChargeOff"])


def train(model, df):
    y = df["ChargeOff"]
    x = df.drop(['ChargeOff'], axis= 1)
    model.fit(x,y)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    print(classification_report(y_test, predict, digits=3))
    print("AUC: {}".format(roc_auc_score(y_test, predict)))

In [3]:
df = pd.read_csv('Xtrain.csv')
df_y = pd.read_csv('Ytrain.csv')
df = pd.concat([df, df_y['ChargeOff']], axis=1, sort=False)
df = pre_processing(df)

xgb = XGBClassifier(n_estimators=77, learning_rate=0.15, colsample_bytree=1, gamma=0.0,
                    min_child_weight=1, max_depth=10, reg_lambda=1)
# xgb = XGBClassifier()

train(xgb, df)

# predict
x_predict = pd.read_csv('Xtest.csv', dtype={"ApprovalFY": object})
x_predict = pre_processing(x_predict, train=False)
predict(xgb, x_predict)

              precision    recall  f1-score   support

           0      0.932     0.927     0.930      1212
           1      0.943     0.947     0.945      1541

    accuracy                          0.938      2753
   macro avg      0.938     0.937     0.937      2753
weighted avg      0.938     0.938     0.938      2753

AUC: 0.9370902697018566
