In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [6]:
def categorize_no_emp(number):
    """Bucket an employee count into one of three size classes.

    Returns:
        0 when ``number`` is at most 10,
        1 when it is above 10 and at most 100,
        2 otherwise (this includes values that compare false to everything,
        such as NaN).
    """
    if number <= 10:
        return 0
    if number <= 100:
        return 1
    return 2


def categorize_year(year):
    """Map an approval year onto one of four era codes.

    Returns:
        0 for years up to and including 1988,
        1 for 1989 through 2003,
        2 for 2004 through 2009,
        3 for anything else.

    The ranges are inclusive on both ends and are checked exactly as listed,
    so a value lying strictly between two ranges (e.g. 2003.5) ends up in 3.
    """
    if year <= 1988:
        return 0

    # (first year, last year) of the bounded eras, in code order starting at 1.
    eras = ((1989, 2003), (2004, 2009))
    for code, (first, last) in enumerate(eras, start=1):
        if first <= year <= last:
            return code
    return 3


def categorize_disbursement(number):
    """Bucket a disbursed amount into one of three size classes.

    Returns:
        0 when ``number`` is at most 100000,
        1 when it is above 100000 and at most 500000,
        2 otherwise.
    """
    if number <= 100000:
        return 0
    return 1 if number <= 500000 else 2


def categorize_sba_bank(number):
    """Snap a ratio to the closest of a fixed set of reference values.

    The candidates are scanned in their listed order and a candidate only
    replaces the current best when it is strictly closer, so on a tie the
    earlier entry in the list is returned.
    """
    candidates = [0.5, 0.75, 0.85, 0.9, 0.8, 1.0, 0.7]

    nearest = candidates[0]
    smallest_gap = abs(nearest - number)
    for candidate in candidates[1:]:
        gap = abs(candidate - number)
        if gap < smallest_gap:
            nearest = candidate
            smallest_gap = gap
    return nearest


def pre_processing(df, categorized=True, encoders=None):
    """Turn a raw loan frame into the numeric feature frame fed to the model.

    The caller's frame is not modified; all work happens on the copy produced
    by the initial ``drop``.

    Args:
        df: Raw frame containing at least the columns referenced below
            (identifier/date columns that are dropped, ``State``, ``Bank``,
            ``BankState``, ``NAICS``, ``ApprovalFY``, the money columns,
            ``LowDoc``, ``RevLineCr``, job counts, ``FranchiseCode`` and the
            pass-through columns listed in the final selection).
        categorized: When truthy, return the ``*_Cat`` variants of
            ``ApprovalFY``, ``CreateJob``, ``RetainedJob`` and
            ``FranchiseCode``; otherwise return the raw columns. ``NoEmp`` and
            ``DisbursementGross`` are returned raw in both modes.
        encoders: Optional dict used to share the label -> integer mappings of
            ``State``, ``Bank`` and ``BankState`` between calls. A mapping
            missing from the dict is built from ``df`` and stored in it; a
            mapping already present is reused, and labels it does not contain
            are encoded as -1. With the default ``None`` every call builds its
            own mappings from ``df`` (codes follow order of first appearance),
            which means two different frames generally get different codes
            for the same label.

    Returns:
        A new DataFrame with the 16 feature columns, in a fixed order.
    """
    df = df.drop(["Id", "Name", "City", "Zip", "ApprovalDate", "DisbursementDate", "BalanceGross"], axis=1)

    # Compare the raw labels. State and BankState receive independent integer
    # codes below, so comparing the encoded columns would compare unrelated numbers.
    df["Same_Bank_States"] = df["State"] == df["BankState"]

    # convert to numeric
    for column in ("State", "Bank", "BankState"):
        mapping = encoders.get(column) if encoders is not None else None
        if mapping is None:
            mapping = {label: code for code, label in enumerate(df[column].unique())}
            if encoders is not None:
                encoders[column] = mapping
        # Assign the result back instead of calling replace(..., inplace=True) on
        # a column selection: that form mutates a temporary under pandas
        # Copy-on-Write and leaves the frame untouched. fillna(-1) only matters
        # for a reused mapping that lacks some label.
        df[column] = df[column].map(mapping).fillna(-1).astype("int64")
    df["NAICS"] = df["NAICS"] // 10000

    # cleaning
    df["ApprovalFY"] = df["ApprovalFY"].replace("1976A", 1976)
    df["ApprovalFY"] = df["ApprovalFY"].astype(int)
    money_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
    # Raw string: "\$" is not a valid escape sequence in a normal string literal.
    df[money_cols] = df[money_cols].replace(r"[\$,]", "", regex=True).astype(float)
    # Only "N"/"Y" are meaningful; every other value becomes NaN.
    yes_no = {"N": 0, "Y": 1}
    for column in ("LowDoc", "RevLineCr"):
        df[column] = df[column].map(yes_no)

    # convert to categorized data
    df["ApprovalFY_Cat"] = df["ApprovalFY"].apply(categorize_year)
    df["NoEmp_Cat"] = df["NoEmp"].apply(categorize_no_emp)
    df["CreateJob_Cat"] = df["CreateJob"] != 0
    df["RetainedJob_Cat"] = df["RetainedJob"] != 0
    df["FranchiseCode_Cat"] = df["FranchiseCode"] != 0
    df["DisbursementGross_Cat"] = df["DisbursementGross"].apply(categorize_disbursement)
    df["SBA_vs_Gross"] = df["SBA_Appv"] / df["GrAppv"]

    # Both modes share the same column order; only four names switch variant.
    suffix = "_Cat" if categorized else ""
    feature_columns = [
        "State", "Bank", "BankState", "NAICS", "ApprovalFY" + suffix, "Term", "NoEmp", "NewExist",
        "CreateJob" + suffix, "RetainedJob" + suffix, "FranchiseCode" + suffix, "UrbanRural",
        "RevLineCr", "LowDoc", "DisbursementGross", "SBA_vs_Gross",
    ]
    return df[feature_columns]


def predict(model, categorized=True, input_path='Xtest.csv', output_path="predict.csv"):
    """Score the loans in a CSV file and write the predictions to another CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        categorized: Forwarded to ``pre_processing``; must match the setting
            used to build the training features so the columns line up.
        input_path: CSV with the raw loans to score. Defaults to the file the
            notebook has always used.
        output_path: Destination CSV. It gets an ``Id`` index column (the row
            position, starting at 0) and a ``ChargeOff`` column.

    Returns:
        None. The result is only written to ``output_path``.
    """
    # ApprovalFY is read as text so pre_processing() can clean the "1976A"
    # entry before converting the column to int.
    x_predict = pd.read_csv(input_path, dtype={"ApprovalFY": object})
    # NOTE(review): the State/Bank/BankState integer codes are derived from
    # whichever frame pre_processing() receives, so they are not guaranteed to
    # line up with the codes the model saw during training — confirm.
    x_predict = pre_processing(x_predict, categorized=categorized)
    predict_submission = model.predict(x_predict)
    submission = pd.DataFrame(predict_submission)
    # NOTE(review): assumes the expected Id is the 0-based row position of the
    # input file; the original "Id" column is dropped during pre-processing.
    submission.index.name = "Id"
    submission.to_csv(output_path, header=["ChargeOff"])


def test_performance_cv(model, x, y):
    """Print the mean ROC AUC of ``model`` over a 10-fold cross-validation.

    The folds come from ``KFold(n_splits=10)`` with its default settings and
    the mean score is rounded to three decimals before printing.
    """
    splitter = KFold(n_splits=10)
    fold_scores = cross_val_score(model, x, y, scoring="roc_auc", cv=splitter)
    mean_score = round(fold_scores.mean(), 3)
    print("CV Score: {}".format(mean_score))


def test_performance(model, x, y):
    """Print a classification report for ``model`` on a 25% hold-out split.

    The data is split with a fixed ``random_state`` of 0. ``model`` itself is
    fitted on the 75% training part, so after this call the estimator passed
    in holds that fit rather than whatever it was fitted on before.
    """
    splits = train_test_split(x, y, test_size=0.25, random_state=0)
    x_train, x_test, y_train, y_test = splits

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    report = classification_report(y_test, y_pred, digits=3)
    print(report)

In [8]:
# Load the training features and labels. ApprovalFY is read as text, the same
# way predict() reads the test file, so the column is parsed consistently.
df = pd.read_csv("Xtrain.csv", dtype={"ApprovalFY": object})
df = pre_processing(df, categorized=True)
df_y = pd.read_csv("Ytrain.csv")

x = df
y = df_y["ChargeOff"]

model = XGBClassifier(n_estimators=77, learning_rate=0.15, colsample_bytree=1, gamma=0.0,
                      min_child_weight=1, max_depth=10)

# Evaluate first: test_performance() fits `model` on its 75% training split,
# so a full-data fit done before this point would be overwritten.
test_performance(model, x, y)
test_performance_cv(model, x, y)

# Fit on all training rows last, so the submission comes from the model
# trained on the complete data set.
model.fit(x, y)

predict(model, categorized=True)

              precision    recall  f1-score   support

           0      0.932     0.932     0.932      6342
           1      0.930     0.930     0.930      6158

    accuracy                          0.931     12500
   macro avg      0.931     0.931     0.931     12500
weighted avg      0.931     0.931     0.931     12500

CV Score: 0.979
