# Default Credit Cards Clients

## DATA ENGINEERING

In [19]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, KFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from skopt import BayesSearchCV
import plotly.graph_objs as go
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import SMOTE

In [2]:
data = pd.read_csv("train_series.csv")
data.MONTH.unique()
data

Unnamed: 0,ID,MONTH,PAY,BILL_AMT,PAY_AMT
0,9910,JUNE,0.0,76885.0,3500.0
1,9910,MAY,0.0,79106.0,4000.0
2,9910,APRIL,0.0,81231.0,3000.0
3,9910,MARCH,0.0,81983.0,3146.0
4,9910,FEBRUARY,0.0,83773.0,3260.0
...,...,...,...,...,...
119995,19966,MAY,2.0,22083.0,5.0
119996,19966,APRIL,2.0,15444.0,1000.0
119997,19966,MARCH,0.0,13695.0,5000.0
119998,19966,FEBRUARY,0.0,17598.0,1000.0


In [3]:
data.head()

Unnamed: 0,ID,MONTH,PAY,BILL_AMT,PAY_AMT
0,9910,JUNE,0.0,76885.0,3500.0
1,9910,MAY,0.0,79106.0,4000.0
2,9910,APRIL,0.0,81231.0,3000.0
3,9910,MARCH,0.0,81983.0,3146.0
4,9910,FEBRUARY,0.0,83773.0,3260.0


In [4]:
pay_date = pd.pivot_table(data, values=["PAY", "PAY_AMT", "BILL_AMT"], index=["ID"], columns=["MONTH"])
pay_date.columns = [f'{col[0]}_{col[1]}' for col in pay_date.columns]
pay_date = pay_date.reset_index()

pay_date

Unnamed: 0,ID,BILL_AMT_APRIL,BILL_AMT_FEBRUARY,BILL_AMT_JANUARY,BILL_AMT_JUNE,BILL_AMT_MARCH,BILL_AMT_MAY,PAY_APRIL,PAY_FEBRUARY,PAY_JANUARY,PAY_JUNE,PAY_MARCH,PAY_MAY,PAY_AMT_APRIL,PAY_AMT_FEBRUARY,PAY_AMT_JANUARY,PAY_AMT_JUNE,PAY_AMT_MARCH,PAY_AMT_MAY
0,0,13600.0,44024.0,18697.0,28991.0,0.0,24391.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,680.0,10000.0,1300.0,22373.0,1000.0
1,1,53169.0,50372.0,49470.0,75662.0,50875.0,70073.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,1903.0,2006.0,3212.0,1603.0,2106.0
2,2,161487.0,168094.0,170922.0,155910.0,157577.0,158819.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,5500.0,1000.0,6800.0,13000.0,6500.0
3,3,0.0,0.0,0.0,23570.0,0.0,735.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,742.0,0.0,0.0
4,4,9044.0,9417.0,9617.0,8214.0,9225.0,8034.0,0.0,0.0,0.0,0.0,0.0,0.0,331.0,356.0,330.0,1140.0,341.0,1150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,29993,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,29996,0.0,2580.0,6941.0,0.0,2580.0,0.0,-2.0,0.0,-1.0,1.0,-1.0,-2.0,2580.0,6941.0,0.0,0.0,0.0,0.0
19997,29997,46777.0,8824.0,9009.0,47194.0,39420.0,48381.0,0.0,0.0,0.0,0.0,0.0,0.0,1530.0,327.0,329.0,2002.0,1000.0,2000.0
19998,29998,29435.0,29434.0,16565.0,28409.0,30841.0,28530.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,331.0,0.0,2000.0,1765.0,2000.0


In [5]:
customers = pd.read_csv("train_customers.csv")
customers

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE
0,9910,130000.0,2,2,1,27
1,15561,60000.0,2,3,1,48
2,23607,330000.0,2,1,2,44
3,6314,60000.0,2,2,2,24
4,27534,180000.0,2,1,2,33
...,...,...,...,...,...,...
19995,28636,330000.0,2,1,2,33
19996,17730,50000.0,1,2,2,49
19997,28030,410000.0,1,1,2,32
19998,15725,200000.0,1,1,1,40


In [6]:
merged = pd.merge(customers, pay_date, on="ID")
merged.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,BILL_AMT_APRIL,BILL_AMT_FEBRUARY,BILL_AMT_JANUARY,BILL_AMT_JUNE,...,PAY_JANUARY,PAY_JUNE,PAY_MARCH,PAY_MAY,PAY_AMT_APRIL,PAY_AMT_FEBRUARY,PAY_AMT_JANUARY,PAY_AMT_JUNE,PAY_AMT_MARCH,PAY_AMT_MAY
0,9910,130000.0,2,2,1,27,81231.0,83773.0,85532.0,76885.0,...,0.0,0.0,0.0,0.0,3000.0,3260.0,3200.0,3500.0,3146.0,4000.0
1,15561,60000.0,2,3,1,48,8422.0,3910.0,2431.0,4823.0,...,-1.0,-1.0,-1.0,-1.0,4377.0,2431.0,2120.0,5491.0,3918.0,9683.0
2,23607,330000.0,2,1,2,44,253863.0,262753.0,268145.0,243621.0,...,0.0,0.0,0.0,0.0,9400.0,9766.0,9786.0,21400.0,9542.0,0.0
3,6314,60000.0,2,2,2,24,15069.0,5879.0,-2879.0,29832.0,...,0.0,0.0,0.0,0.0,1000.0,1329.0,41378.0,1538.0,118.0,1308.0
4,27534,180000.0,2,1,2,33,178.0,0.0,1118.0,4500.0,...,-1.0,-2.0,-1.0,-2.0,1500.0,1118.0,1331.0,2580.0,0.0,178.0


In [7]:
train_target = pd.read_csv("train_target.csv")

In [8]:
merged_train = pd.merge(merged, train_target, on="ID")
merged_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,BILL_AMT_APRIL,BILL_AMT_FEBRUARY,BILL_AMT_JANUARY,BILL_AMT_JUNE,...,PAY_JUNE,PAY_MARCH,PAY_MAY,PAY_AMT_APRIL,PAY_AMT_FEBRUARY,PAY_AMT_JANUARY,PAY_AMT_JUNE,PAY_AMT_MARCH,PAY_AMT_MAY,DEFAULT_JULY
0,9910,130000.0,2,2,1,27,81231.0,83773.0,85532.0,76885.0,...,0.0,0.0,0.0,3000.0,3260.0,3200.0,3500.0,3146.0,4000.0,0
1,15561,60000.0,2,3,1,48,8422.0,3910.0,2431.0,4823.0,...,-1.0,-1.0,-1.0,4377.0,2431.0,2120.0,5491.0,3918.0,9683.0,1
2,23607,330000.0,2,1,2,44,253863.0,262753.0,268145.0,243621.0,...,0.0,0.0,0.0,9400.0,9766.0,9786.0,21400.0,9542.0,0.0,1
3,6314,60000.0,2,2,2,24,15069.0,5879.0,-2879.0,29832.0,...,0.0,0.0,0.0,1000.0,1329.0,41378.0,1538.0,118.0,1308.0,0
4,27534,180000.0,2,1,2,33,178.0,0.0,1118.0,4500.0,...,-2.0,-1.0,-2.0,1500.0,1118.0,1331.0,2580.0,0.0,178.0,0


In [9]:
merged_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 20000 non-null  int64  
 1   LIMIT_BAL          20000 non-null  float64
 2   SEX                20000 non-null  int64  
 3   EDUCATION          20000 non-null  int64  
 4   MARRIAGE           20000 non-null  int64  
 5   AGE                20000 non-null  int64  
 6   BILL_AMT_APRIL     20000 non-null  float64
 7   BILL_AMT_FEBRUARY  20000 non-null  float64
 8   BILL_AMT_JANUARY   20000 non-null  float64
 9   BILL_AMT_JUNE      20000 non-null  float64
 10  BILL_AMT_MARCH     20000 non-null  float64
 11  BILL_AMT_MAY       20000 non-null  float64
 12  PAY_APRIL          20000 non-null  float64
 13  PAY_FEBRUARY       20000 non-null  float64
 14  PAY_JANUARY        20000 non-null  float64
 15  PAY_JUNE           20000 non-null  float64
 16  PAY_MARCH          200

In [10]:
merged_train.to_csv("Johnny_Naime_A_train.csv", index=False)

## EXPLORATORY DATA ANALYSIS

In [11]:
filtered = merged_train[merged_train["DEFAULT_JULY"] == 1][["EDUCATION", "DEFAULT_JULY"]]
filtered["EDUCATION"] = filtered["EDUCATION"].apply(lambda x: "Graduate" if x == 1 else "University" if x == 2 else "High School" if x == 3 else "Others" if x == 4 else "Unknown")
filtered["DEFAULT_COUNT"] = merged_train[merged_train["DEFAULT_JULY"] == 1].shape[1]
filtered = filtered.groupby("EDUCATION").size().reset_index(name="DEFAULT_COUNT").sort_values(by="DEFAULT_COUNT", ascending=False)

fig = px.bar(filtered, x="EDUCATION", y="DEFAULT_COUNT", title="Education vs Default July", color="DEFAULT_COUNT", text="DEFAULT_COUNT", color_continuous_scale= px.colors.sequential.GnBu)
fig.update_layout(xaxis_title="Education Level", yaxis_title="Defaults in July", coloraxis_showscale=False)

As shown in the graph above, the most group that defaulted is university graduates, next comes graduate school graduates, followed by High School graduates. This trend can be explained by a variety of variables, including the sorts of occupations available to people with various degrees of education, the amount of student loan debt accumulated by people with various levels of education, or other economic challenges.

In [12]:
filtered = merged_train[["DEFAULT_JULY", "LIMIT_BAL"]].sort_values(by = "LIMIT_BAL", ascending=False)
filtered["DEFAULT_JULY"] = filtered["DEFAULT_JULY"].apply(lambda x: "Default" if x == 1 else "No Default")
fig = px.box(filtered, x="DEFAULT_JULY", y="LIMIT_BAL", title="Default July vs Limit Balance", color="DEFAULT_JULY")
fig.update_layout(xaxis_title="Default July", yaxis_title="Limit Balance", showlegend=False)

Above, we have the limit balance and the defaults in July. The median balance limit for not defaulting is 150K, whereas for defaulting the median is 90K. This suggests that individuals with higher credit limit balances are less likely to default on their payments compared to those with lower credit limit balances.

In [13]:
filtered = merged_train[["DEFAULT_JULY", "AGE"]].sort_values(by = "AGE", ascending=False)
filtered["DEFAULT_JULY"] = filtered["DEFAULT_JULY"].apply(lambda x: "Default" if x == 1 else "No Default")
fig = px.box(filtered, x="DEFAULT_JULY", y="AGE", title="Default July vs Age", color="DEFAULT_JULY")
fig.update_layout(xaxis_title="Default July", yaxis_title="Age", showlegend=False)

In [14]:
filtered = merged_train.loc[:, ["DEFAULT_JULY", "MARRIAGE"]]
filtered["MARRIAGE"] = filtered["MARRIAGE"].apply(lambda x: "Married" if x == 1 else "Single" if x == 2 else "Others" if x == 0 or 3 else "Unknown")
filtered["DEFAULT_JULY"] = filtered["DEFAULT_JULY"].apply(lambda x: "Default" if x == 1 else "No Default")
fig = px.histogram(filtered, x="MARRIAGE", color="DEFAULT_JULY", title="Default July vs Marriage", barmode="group")
fig.update_layout(xaxis_title="Marital Status", yaxis_title="Default in July", showlegend=True)

In [15]:
filtered = merged_train.loc[:, ["SEX", "DEFAULT_JULY"]]
filtered["DEFAULT_JULY"] = filtered["DEFAULT_JULY"].apply(lambda x: "Default" if x == 1 else "No Default")
filtered["SEX"] = filtered["SEX"].apply(lambda x: "Male" if x == 1 else "Female")

fig = px.histogram(filtered, x="SEX", color = "DEFAULT_JULY", title="Default July vs Sex", barmode = "group")
fig.update_layout(xaxis_title = "Sex", yaxis_title = "Default in July", showlegend = True)


I plotted 3 extra graphs to provide insights on the data:

 - Default July Vs. Age: The median age for defaulting and not defaulting are both 34 years old.

 - Default July Vs. Marital Status: Married couples have a default rate of 23.54% whereas Single people had a default rate of 20.77%.
 
 - Default July Vs. Sex: Females have a default rate of 20.88% whereas Males have a default rate of 23.97%


## MACHINE LEARNING 

In [16]:
test_data = pd.read_csv("test_data.csv")
test_data = test_data[merged_train.columns[:]]

full_train = pd.concat([merged_train, test_data])


In [17]:
full_train.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
       'BILL_AMT_APRIL', 'BILL_AMT_FEBRUARY', 'BILL_AMT_JANUARY',
       'BILL_AMT_JUNE', 'BILL_AMT_MARCH', 'BILL_AMT_MAY', 'PAY_APRIL',
       'PAY_FEBRUARY', 'PAY_JANUARY', 'PAY_JUNE', 'PAY_MARCH', 'PAY_MAY',
       'PAY_AMT_APRIL', 'PAY_AMT_FEBRUARY', 'PAY_AMT_JANUARY', 'PAY_AMT_JUNE',
       'PAY_AMT_MARCH', 'PAY_AMT_MAY', 'DEFAULT_JULY'],
      dtype='object')

In [18]:
X = full_train.drop(columns=["ID", "DEFAULT_JULY"])
y = full_train["DEFAULT_JULY"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((20300, 23), (8700, 23), (20300,), (8700,))

**Logistic Regression Dummy model**

In [57]:
lr = LogisticRegression(max_iter=500)

In [58]:
sc = StandardScaler()
num_cols = ['BILL_AMT_APRIL', 'BILL_AMT_FEBRUARY', 'BILL_AMT_JANUARY',
       'BILL_AMT_JUNE', 'BILL_AMT_MARCH', 'BILL_AMT_MAY', 'PAY_APRIL',
       'PAY_FEBRUARY', 'PAY_JANUARY', 'PAY_JUNE', 'PAY_MARCH', 'PAY_MAY',
       'PAY_AMT_APRIL', 'PAY_AMT_FEBRUARY', 'PAY_AMT_JANUARY', 'PAY_AMT_JUNE',
       'PAY_AMT_MARCH', 'PAY_AMT_MAY', 'LIMIT_BAL']
X_train_sc = X_train
X_val_sc = X_val
X_train_sc[num_cols]= sc.fit_transform(X_train_sc[num_cols])
X_val_sc[num_cols] = sc.transform(X_val_sc[num_cols])

In [59]:
lr.fit(X_train_sc, y_train)

LogisticRegression(max_iter=500)

In [60]:
y_pred = lr.predict_proba(X_val_sc)[:,1]

In [61]:
roc_auc_score(y_val, y_pred)

0.7298640351617119

**XGBoost Dummy model**

In [24]:
xgb = XGBClassifier()
xgb_pred = xgb.fit(X_train, y_train)
y_pred = xgb.predict_proba(X_val)[:,1]
roc_auc_score(y_val, y_pred)

0.762115273736938

**Random Forest regressor Dummy Model**

In [63]:
rf = RandomForestRegressor()
rf_pred = rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
roc_auc_score(y_val, y_pred)

0.7608059156952812

**Defining Stratified kFold Strategy**

In [21]:
skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
y_stratified = pd.cut(y_train.rank(method='first'), bins=10, labels=False)
cv  = list(skf.split(X_train, y_stratified))

**Oversampling data for imbalanced classes**

In [30]:
params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.039],
    'subsample': [0.9],
    'colsample_bytree': [0.9],
    #'min_child_weight': [1, 2, 3],
    'objective': ['binary:logistic'],
    #'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    #'reg_lambda': [0, 0.25, 0.5, 0.75, 1],
    #'gamma': [0, 0.1, 0.25],
    #'scale_pos_weight': [1, 2],
}


bayes = BayesSearchCV(xgb, params, scoring='roc_auc', cv=cv, n_iter=10, n_jobs=-1, verbose=1, random_state=42)

In [31]:
bayes.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_val, y_val)])

Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
Fitting 50 folds for each of 1 candidates, totalling 50 fits
[0]	validation_0-auc:0.76036
[1]	validation_0-auc:0.76885
[2]	validation_0-auc:0.76935
[3]	validation_0-auc:0.76958
[4]	validation_0-auc:0.77025
[5]	validation_0-auc:0.77178
[6]	validation_0-auc:0.77175
[7]	validation_0-auc:0.77205



`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.


`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[8]	validation_0-auc:0.77198
[9]	validation_0-auc:0.77256
[10]	validation_0-auc:0.77279
[11]	validation_0-auc:0.77299
[12]	validation_0-auc:0.77334
[13]	validation_0-auc:0.77319
[14]	validation_0-auc:0.77409
[15]	validation_0-auc:0.77506
[16]	validation_0-auc:0.77512
[17]	validation_0-auc:0.77542
[18]	validation_0-auc:0.77561
[19]	validation_0-auc:0.77549
[20]	validation_0-auc:0.77561
[21]	validation_0-auc:0.77548
[22]	validation_0-auc:0.77570
[23]	validation_0-auc:0.77602
[24]	validation_0-auc:0.77627
[25]	validation_0-auc:0.77669
[26]	validation_0-auc:0.77729
[27]	validation_0-auc:0.77734
[28]	validation_0-auc:0.77726
[29]	validation_0-auc:0.77788
[30]	validation_0-auc:0.77807
[31]	validation_0-auc:0.77844
[32]	validation_0-auc:0.77888
[33]	validation_0-auc:0.77934
[34]	validation_0-auc:0.77924
[35]	validation_0-auc:0.77935
[36]	validation_0-auc:0.77948
[37]	validation_0-auc:0.77952
[38]	validation_0-auc:0.77975
[39]	validation_0-auc:0.78007
[40]	validation_0-auc:0.77996
[41]	validat

BayesSearchCV(cv=[(array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([   22,    41,    47, ..., 20276, 20284, 20289])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([    3,    31,    45, ..., 20267, 20271, 20286])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([    6,    10,    27, ..., 20252, 20260, 20288])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([   14,    48,    66, ..., 20268, 20272, 20283])),
                  (array([    0,     1,     2, ...,...
                                      missing=nan, monotone_constraints=None,
                                      n_estimators=100, n_jobs=None,
                                      num_parallel_tree=None, predictor=None,
                                      random_state=None, ...),
              n_iter=10, n_jobs=-1, random_state=42, 

In [73]:
est = bayes.best_estimator_
print(bayes.best_params_)
y_pred = est.predict_proba(X_val)[:,1]
y_pred_acc = est.predict(X_val)
roc_auc_score(y_val, y_pred)

OrderedDict([('colsample_bytree', 0.9), ('learning_rate', 0.039), ('max_depth', 5), ('n_estimators', 300), ('objective', 'binary:logistic'), ('subsample', 0.9)])


0.7818236981553567

In [64]:
bayes.best_score_

0.7819768454822354

In [28]:
cv_scores = cross_val_score(est, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
fig = px.line(x=list(range(1, len(cv_scores)+1)), y=cv_scores, title="Cross Validation Scores")
fig.update_layout(xaxis_title="Fold", yaxis_title="ROC AUC Score", showlegend=False)



In [71]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
def threshold_search(y_true, y_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)
    thresholds = np.append(thresholds, 1.001)
    roc_auc = roc_auc_score(y_true, y_proba)
    best_score = 0
    best_th = 0
    for th in thresholds:
        y_pred = (y_proba >= th).astype(int)
        score = roc_auc_score(y_true, y_pred)
        if score > best_score:
            best_score = score
            best_th = th
    search_result = {'threshold': best_th , 'roc_auc': best_score}
    return search_result


In [70]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
thresholds[0]
roc_auc_score(y_val, y_pred)

1.8564765

In [72]:
threshold_search(y_val, y_pred)

{'threshold': 0.23334084451198578, 'roc_auc': 0.7202021522098093}

In [None]:
params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

bayes_rf = BayesSearchCV(rf, params, scoring='roc_auc', cv=cv, n_iter=10, n_jobs=-1, verbose=1, random_state=42)

In [None]:
bayes_rf.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


BayesSearchCV(cv=[(array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([   22,    41,    47, ..., 20276, 20284, 20289])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([    3,    31,    45, ..., 20267, 20271, 20286])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([    6,    10,    27, ..., 20252, 20260, 20288])),
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([   14,    48,    66, ..., 20268, 20272, 20283])),
                  (array([    0,     1,     2, ...,...
                  (array([    0,     1,     2, ..., 20297, 20298, 20299]),
                   array([    4,    13,    16, ..., 20232, 20237, 20282]))],
              estimator=RandomForestRegressor(), n_iter=10, n_jobs=-1,
              random_state=42, scoring='roc_auc',
              search_spaces={'bootstrap': [True, False],
           

In [None]:
est_rf = bayes_rf.best_estimator_
print(bayes_rf.best_params_)
y_pred = est_rf.predict(X_val)
roc_auc_score(y_val, y_pred)

OrderedDict([('bootstrap', True), ('max_depth', 10), ('max_features', 'sqrt'), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 200)])


0.7812558325688891

**Chosen Model, XGBoost**

In [74]:
submission = pd.read_csv("submission_features.csv")
sub_X = submission.drop(columns=["ID"])
sub_X = sub_X[X_train.columns[:]]


In [75]:
full_fit_X = pd.concat([X_train, X_val])
full_fit_y = pd.concat([y_train, y_val])

est.fit(full_fit_X, full_fit_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.039, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [76]:
sub_pred = est.predict_proba(sub_X)[:,1]

In [77]:
preds_sub = pd.DataFrame(sub_pred, columns=["DEFAULT_JULY"])
preds_sub["ID"] = submission["ID"]
preds_sub = preds_sub[["ID", "DEFAULT_JULY"]]
preds_sub.head()

Unnamed: 0,ID,DEFAULT_JULY
0,2774,0.124091
1,15339,0.382245
2,26485,0.360445
3,6657,0.07372
4,7917,0.092362


In [81]:
preds_sub.to_csv("Naime_Johnny_C1_submission.csv", index=False)

In [78]:
preds_sub.sort_values(by="DEFAULT_JULY", ascending=True, ignore_index=True).head

<bound method NDFrame.head of         ID  DEFAULT_JULY
0    17142      0.015306
1     5735      0.016786
2    24657      0.018890
3     7890      0.019226
4     9331      0.019452
..     ...           ...
995   6300      0.886430
996  18310      0.894601
997    184      0.895973
998  18535      0.905958
999   5351      0.912757

[1000 rows x 2 columns]>

In [79]:
preds_sub = pd.DataFrame(sub_pred, columns=["DEFAULT_JULY"])
preds_sub["ID"] = submission["ID"]
preds_sub = preds_sub[["ID", "DEFAULT_JULY"]]
preds_sub.head()

Unnamed: 0,ID,DEFAULT_JULY
0,2774,0.124091
1,15339,0.382245
2,26485,0.360445
3,6657,0.07372
4,7917,0.092362


In [80]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

Best Threshold=0.233341


In [81]:
preds_sub["DEFAULT_JULY_01"] = preds_sub["DEFAULT_JULY"].apply(lambda x: 1 if x > best_thresh else 0)


In [82]:
preds_sub.value_counts("DEFAULT_JULY_01")

DEFAULT_JULY_01
0    708
1    292
dtype: int64

In [83]:
nondefault = preds_sub[preds_sub["DEFAULT_JULY_01"] == 0].count()[0]
default = preds_sub[preds_sub["DEFAULT_JULY_01"] == 1].count()[0]
print("Non-default: ", nondefault)
print("Default: ", default)

cost = 5000
benefit = 10000

profit = (benefit * nondefault) - (cost * default)
print("Profit: ", profit)

Non-default:  708
Default:  292
Profit:  5620000


In [122]:
confusion_matrix(y_val, y_pred_acc)

array([[6462,  301],
       [1332,  605]], dtype=int64)