In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


# Candidate hyper-parameter values handed to GridSearchCV in cross_validation().
# Every combination of these lists is evaluated, so the grid grows multiplicatively.
GRID_SEARCH_PARAMS = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    "min_child_weight": [1, 3, 5, 7, 9],
    "eta": [0.01, 0.05, 0.1, 0.3, 0.5],
    "subsample": [0.1, 0.5, 0.9, 1],
    "colsample_bytree": [0.1, 0.5, 0.9, 1],
    "lambda": [0.1, 0.5, 1, 2, 5],
}

# Value passed as `n_estimators` when the final classifier is built.
N_ESTIMATOR = 77

In [2]:
def clean_and_reformat(df):
    """Return a cleaned, numeric-only copy of a raw loan frame.

    The input frame is not modified. The returned frame:

    * drops the identifying / date columns listed in ``dropped_cols``;
    * casts ``ApprovalFY`` to int after normalising the value ``'1976A'``;
    * encodes ``LowDoc`` and ``RevLineCr`` as 1 for ``'Y'`` and 0 for
      everything else (``'N'``, any other code, or missing);
    * fills missing ``NewExist`` with 0 and casts it to int;
    * turns ``FranchiseCode`` into ``Is_Franchised`` (0 for codes 0 and 1,
      otherwise 1);
    * turns ``CreateJob`` / ``RetainedJob`` into ``Is_CreatedJob`` /
      ``Is_RetainedJob`` (any positive count becomes 1, other values are kept);
    * strips ``$`` and ``,`` from the money columns and casts them to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns referenced above. Any other
        column (e.g. ``ChargeOff``) is passed through untouched.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with columns in their original relative order.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    dropped_cols = ['Name', 'City', 'Zip', 'BankState', 'Bank', 'State', 'ApprovalDate', 'DisbursementDate']
    # drop() returns a new frame, so the assignments below never touch the caller's object.
    df = df.drop(dropped_cols, axis=1)

    # '1976A' is the one non-numeric year handled here; normalise it before the integer cast.
    df['ApprovalFY'] = df['ApprovalFY'].replace('1976A', '1976').astype(int)

    # Y/N flags: map() sends every value other than 'N'/'Y' to NaN, which then becomes 0.
    # Assigning the result back (rather than calling fillna(inplace=True) on a column
    # selection) keeps this working when pandas Copy-on-Write is enabled.
    for flag_col in ('LowDoc', 'RevLineCr'):
        df[flag_col] = df[flag_col].map({'N': 0, 'Y': 1}).fillna(0).astype(int)

    # Missing NewExist is encoded as 0; existing zeros stay 0.
    df['NewExist'] = df['NewExist'].fillna(0).astype(int)

    # Codes 0 and 1 both count as "not franchised"; anything else counts as franchised.
    df['FranchiseCode'] = np.where(df['FranchiseCode'].isin([0, 1]), 0, 1)

    # Collapse positive job counts to a 0/1 indicator; non-positive values are left as they are.
    df['CreateJob'] = np.where(df['CreateJob'] > 0, 1, df['CreateJob'])
    df['RetainedJob'] = np.where(df['RetainedJob'] > 0, 1, df['RetainedJob'])

    df = df.rename(columns={
        "FranchiseCode": "Is_Franchised",
        "CreateJob": "Is_CreatedJob",
        "RetainedJob": "Is_RetainedJob",
    })

    money_cols = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']
    df[money_cols] = df[money_cols].replace('[$,]', '', regex=True).astype(float)

    return df


# must use the trained model
def predict(model, test_path='Xtest.csv', output_path='predict.csv'):
    """Score the held-out feature file with a fitted model and write a submission CSV.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; it is applied to the cleaned test features.
    test_path : str, optional
        CSV of raw test features. Defaults to ``'Xtest.csv'``.
    output_path : str, optional
        Destination of the submission file. Defaults to ``'predict.csv'``.

    Returns
    -------
    None
        The result is written to ``output_path`` with an ``Id`` index column and
        a single ``ChargeOff`` column.
    """
    # ApprovalFY is read as text so that non-numeric entries survive until
    # clean_and_reformat() normalises them and casts the column to int.
    features = pd.read_csv(test_path, dtype={"ApprovalFY": object})
    features = clean_and_reformat(features)

    submission = pd.DataFrame(model.predict(features))
    submission.index.name = "Id"
    submission.to_csv(output_path, header=["ChargeOff"])


# must use the trained model
def feature_selection(model, x):
    """Show a horizontal bar chart of the fitted model's feature importances.

    ``model`` must already be fitted (it is read through ``feature_importances_``);
    ``x`` supplies the feature names via its columns. Bars are sorted ascending
    by importance.
    """
    ranked_importance = pd.Series(model.feature_importances_, index=x.columns).sort_values()

    _, ax = plt.subplots(figsize=(8, 4))
    ranked_importance.plot(kind='barh', ax=ax)
    ax.set_title('Feature Importance', fontsize=20)
    ax.set_ylabel('Features', fontsize=15)
    ax.set_xlabel('Score', fontsize=15)
    plt.show()


def tuning_best_rounds(x, y, params=None):
    """Estimate a good number of boosting rounds using early stopping.

    The data is split 75/25 (fixed ``random_state=0``); a booster is trained on
    the larger part and monitored on the smaller one, stopping once the
    evaluation metric has not improved for 10 consecutive rounds.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target labels.
    params : dict, optional
        Booster parameters forwarded to ``xgb.train``. Defaults to a binary
        logistic objective evaluated with AUC, which matches the classifier and
        the ``roc_auc`` scoring used elsewhere in this notebook.

    Returns
    -------
    int
        Number of boosting rounds at the best evaluation score
        (``best_iteration + 1``, since ``best_iteration`` is zero-based).
    """
    if params is None:
        params = {"objective": "binary:logistic", "eval_metric": "auc"}

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    d_train = xgb.DMatrix(x_train, label=y_train)
    d_test = xgb.DMatrix(x_test, label=y_test)

    # xgb.train takes the parameter dict first and the training DMatrix second.
    booster = xgb.train(
        params,
        d_train,
        num_boost_round=999,  # upper bound only; early stopping ends training sooner
        evals=[(d_test, "Test")],
        early_stopping_rounds=10,
    )
    return booster.best_iteration + 1


def cross_validation(x, y):
    """Grid-search XGBClassifier hyper-parameters and return the best combination.

    Every combination in ``GRID_SEARCH_PARAMS`` is scored with 10-fold
    cross-validated ROC AUC on ``(x, y)``. Returns the ``best_params_`` dict of
    the fitted search.
    """
    search = GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=GRID_SEARCH_PARAMS,
        cv=10,
        scoring="roc_auc",
    )
    fitted_search = search.fit(x, y)
    return fitted_search.best_params_


def test_performance_cv(model, x, y):
    """Print the mean 10-fold cross-validated ROC AUC of ``model`` on ``(x, y)``.

    Folds come from an unshuffled ``KFold``; the mean score is rounded to three
    decimals before printing.
    """
    fold_scores = cross_val_score(model, x, y, cv=KFold(n_splits=10), scoring="roc_auc")
    mean_auc = round(fold_scores.mean(), 3)
    print("AUC: {}".format(mean_auc))


def test_performance(model, x, y):
    """Print a classification report for ``model`` on a 25% hold-out of ``(x, y)``.

    Parameters
    ----------
    model : estimator
        Scikit-learn compatible classifier. It is cloned before fitting, so the
        object passed in keeps whatever fitted state it already had.
    x : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    None
        The report (three decimal digits) is printed.
    """
    # Split the arguments that were passed in, not a module-level frame.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    # Fit an unfitted copy: refitting `model` itself on the 75% split would silently
    # replace a model the caller may already have trained on all rows.
    holdout_model = clone(model)
    holdout_model.fit(x_train, y_train)

    y_pred = holdout_model.predict(x_test)
    print(classification_report(y_test, y_pred, digits=3))

In [3]:
# Load features and labels, then clean them together so rows stay aligned.
# ApprovalFY is read as text so non-numeric entries reach clean_and_reformat() unchanged.
df = pd.read_csv('Xtrain.csv', dtype={"ApprovalFY": object})
df_y = pd.read_csv('Ytrain.csv')
df = pd.concat([df, df_y['ChargeOff']], axis=1, sort=False)
df = clean_and_reformat(df)

y = df["ChargeOff"]
x = df.drop(['ChargeOff'], axis=1)

model = XGBClassifier(n_estimators=N_ESTIMATOR, learning_rate=0.15, colsample_bytree=0.5, gamma=0.0,
                      min_child_weight=1, max_depth=6)

# Evaluate first: cross-validated AUC, then a classification report on a 25% hold-out.
test_performance_cv(model, x, y)
test_performance(model, x, y)

# Fit on all training rows immediately before predicting, so the submission never
# comes from a model that an evaluation helper refitted on a subset of the data.
model.fit(x, y)
predict(model)

AUC: 0.974
              precision    recall  f1-score   support

           0      0.921     0.919     0.920      6342
           1      0.917     0.918     0.918      6158

    accuracy                          0.919     12500
   macro avg      0.919     0.919     0.919     12500
weighted avg      0.919     0.919     0.919     12500

