In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

### Main dataset

* Prepare data

In [8]:
# Load the cleaned main dataset from disk.
app_train_poly = pd.read_csv('../data/cleanedData/cleaned_df.csv')

# The label is the TARGET column; the features are every other column except the
# SK_ID_CURR identifier column.
y = app_train_poly['TARGET']
X = app_train_poly.drop(columns=['SK_ID_CURR', 'TARGET'])

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

* Random forest algorithm

In [9]:
# Baseline random forest fitted on the training split (fixed seed for repeatable runs).
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
forest.fit(X_train, y_train)
y_pred_rf = forest.predict(X_test)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_rf = forest.predict_proba(X_train)
y_proba_test_rf = forest.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step; the previous
# round(x, 4) * 100 leaked float artifacts such as 72.28999999999999%.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_rf[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_rf[:, 1]):.2%}")
#pd.Series(forest.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

Roc score for train: 100.0%
Roc score for test: 72.28999999999999%


* XGBoost algorithm

In [10]:
# Baseline XGBoost classifier restricted to two boosting rounds (n_estimators=2).
xgb = XGBClassifier(n_estimators=2, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_xgb = xgb.predict_proba(X_train)
y_proba_test_xgb = xgb.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step, avoiding the float
# artifacts that round(x, 4) * 100 can produce.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_xgb[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_xgb[:, 1]):.2%}")
#pd.Series(xgb.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

Roc score for train: 73.18%
Roc score for test: 72.47%


* XGBoost optimization

In [11]:
# Hand-picked XGBoost hyper-parameters for the tuned model.
# NOTE(review): the model is scored as a binary problem (ROC AUC on column 1), yet the
# objective is the multi-class 'multi:softprob' with num_class=2 and eval_metric 'merror'.
# 'binary:logistic' with 'auc' may be the more direct setup; confirm before changing,
# since it would alter the reported scores.
# NOTE(review): 'seed' here and random_state=42 below both look like RNG seeds;
# confirm which one XGBoost honours.
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'seed': 2018,
    'eval_metric': 'merror',
    'objective': 'multi:softprob',
    'num_class': 2
}

xgb_after_opt = XGBClassifier(random_state=42, **xgb_params)
xgb_after_opt.fit(X_train, y_train)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_xgb_opt = xgb_after_opt.predict_proba(X_train)
y_proba_test_xgb_opt = xgb_after_opt.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step; the previous
# round(x, 4) * 100 leaked float artifacts such as 75.27000000000001%.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_xgb_opt[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_xgb_opt[:, 1]):.2%}")

#pd.Series(xgb_after_opt.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh')
#plt.show()

Roc score for train: 75.27000000000001%
Roc score for test: 74.94%


* LGBM algorithm

In [27]:
lgb_clf = LGBMClassifier(objective='binary', random_state=42)

# Replace every non-alphanumeric character in each column name with "_"
# (presumably to satisfy LightGBM's restrictions on feature names; confirm).
# The renaming is applied to copies of BOTH splits: the shared X_train is no longer
# mutated in place (which left it out of sync with X_test for any cell re-run later),
# and the model is fitted and evaluated on identically named features.
X_train_lgb, X_test_lgb = (
    frame.rename(columns=lambda col: "".join(ch if ch.isalnum() else "_" for ch in str(col)))
    for frame in (X_train, X_test)
)

lgb_clf.fit(X_train_lgb, y_train)
y_pred_lgb = lgb_clf.predict(X_test_lgb)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_lgb = lgb_clf.predict_proba(X_train_lgb)
y_proba_test_lgb = lgb_clf.predict_proba(X_test_lgb)

print('\n')
# The '%' format type scales by 100 and rounds in one step, avoiding the float
# artifacts that round(x, 4) * 100 can produce.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_lgb[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_lgb[:, 1]):.2%}")
#pd.Series(lgb_clf.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

[LightGBM] [Info] Number of positive: 1493, number of negative: 12828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13474
[LightGBM] [Info] Number of data points in the train set: 14321, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.104252 -> initscore=-2.150843
[LightGBM] [Info] Start training from score -2.150843


Roc score for train: 99.15%
Roc score for test: 74.24%


In [17]:
# Calculate the ROC AUC scores as percentages.
# Scale first and round last: round(x, 4) * 100 can reintroduce float noise
# (e.g. 72.28999999999999), whereas round(x * 100, 2) yields a clean two-decimal value.
random_forest_train_score = round(roc_auc_score(y_train, y_proba_train_rf[:, 1]) * 100, 2)
random_forest_test_score = round(roc_auc_score(y_test, y_proba_test_rf[:, 1]) * 100, 2)
# NOTE(review): this row uses the baseline XGBoost probabilities (y_proba_*_xgb), while the
# "All datasets" summary uses the tuned model's (y_proba_*_xgb_opt); confirm which is intended.
xgboost_train_score = round(roc_auc_score(y_train, y_proba_train_xgb[:, 1]) * 100, 2)
xgboost_test_score = round(roc_auc_score(y_test, y_proba_test_xgb[:, 1]) * 100, 2)
lgbm_train_score = round(roc_auc_score(y_train, y_proba_train_lgb[:, 1]) * 100, 2)
lgbm_test_score = round(roc_auc_score(y_test, y_proba_test_lgb[:, 1]) * 100, 2)

# Data for the table: one row per algorithm (name, train score, test score)
table_data = [
    ["Random forest", random_forest_train_score, random_forest_test_score],
    ["XGBoost", xgboost_train_score, xgboost_test_score],
    ["LGBM", lgbm_train_score, lgbm_test_score]
]

# Table headers
headers = ["Algorithm", "Roc Train Score %", "Roc Test Score %"]

print(tabulate(table_data, headers, tablefmt="grid"))

+---------------+---------------------+--------------------+
| Algorithm     |   Roc Train Score % |   Roc Test Score % |
| Random forest |              100    |              72.29 |
+---------------+---------------------+--------------------+
| XGBoost       |               73.18 |              72.47 |
+---------------+---------------------+--------------------+
| LGBM          |               79.25 |              75.38 |
+---------------+---------------------+--------------------+


### All datasets

* Prepare data

In [19]:
# Load the cleaned dataset that combines all source tables.
all_df = pd.read_csv('../data/cleanedData/all_df.csv')

# The label is the TARGET column; the features are every other column except the
# SK_ID_CURR identifier column.
y = all_df['TARGET']
X = all_df.drop(columns=['SK_ID_CURR', 'TARGET'])

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

* Random forest algorithm

In [20]:
# Baseline random forest fitted on the training split (fixed seed for repeatable runs).
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
forest.fit(X_train, y_train)
y_pred_rf = forest.predict(X_test)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_rf = forest.predict_proba(X_train)
y_proba_test_rf = forest.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step, avoiding the float
# artifacts that round(x, 4) * 100 can produce.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_rf[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_rf[:, 1]):.2%}")
#pd.Series(forest.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

Roc score for train: 100.0%
Roc score for test: 72.02%


* XGBoost algorithm

In [21]:
# Baseline XGBoost classifier restricted to two boosting rounds (n_estimators=2).
xgb = XGBClassifier(n_estimators=2, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_xgb = xgb.predict_proba(X_train)
y_proba_test_xgb = xgb.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step, avoiding the float
# artifacts that round(x, 4) * 100 can produce.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_xgb[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_xgb[:, 1]):.2%}")
#pd.Series(xgb.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

Roc score for train: 75.53%
Roc score for test: 69.87%


* Optimization (XGBoost)

In [None]:
# Settings held fixed during the max_depth sweep; only the tree depth varies below.
xgb_params = {
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'seed': 2018,
    'eval_metric': 'merror',
    'objective': 'multi:softprob',
    'num_class': 2
}

max_depths = [2, 3, 4, 5]

# NOTE(review): candidates are compared on the test split, so the chosen depth is tuned
# on the same data later used for the final score; consider a validation split or
# cross-validation on X_train instead.
for depth in max_depths:
    xgb_opt = XGBClassifier(max_depth=depth, **xgb_params)
    xgb_opt.fit(X_train, y_train)

    y_proba_train = xgb_opt.predict_proba(X_train)
    y_proba_test = xgb_opt.predict_proba(X_test)

    # Label every score with the depth that produced it; bare numbers could only be
    # matched to a candidate by counting output lines.
    print(f"max_depth={depth}")
    print(f"  train ROC AUC: {roc_auc_score(y_train, y_proba_train[:, 1])}")
    print(f"  test ROC AUC:  {roc_auc_score(y_test, y_proba_test[:, 1])}")
    print()


0.7727788665993742
0.745416953594566

0.8268888008920539
0.7516321461761146

0.8932810552769802
0.7504822060131174

0.9539035559562753
0.746567729469891



In [None]:
# Settings held fixed during the learning-rate (eta) sweep; only eta varies below.
xgb_params = {
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'seed': 2018,
    'eval_metric': 'merror',
    'objective': 'multi:softprob',
    'num_class': 2
}
etas = [0.08, 0.1, 0.12]

# NOTE(review): candidates are compared on the test split, so the chosen eta is tuned
# on the same data later used for the final score; consider a validation split or
# cross-validation on X_train instead.
for eta in etas:
    xgb_opt = XGBClassifier(eta=eta, **xgb_params)
    xgb_opt.fit(X_train, y_train)

    y_proba_train = xgb_opt.predict_proba(X_train)
    y_proba_test = xgb_opt.predict_proba(X_test)

    # Label every score with the eta that produced it; bare numbers could only be
    # matched to a candidate by counting output lines.
    print(f"eta={eta}")
    print(f"  train ROC AUC: {roc_auc_score(y_train, y_proba_train[:, 1])}")
    print(f"  test ROC AUC:  {roc_auc_score(y_test, y_proba_test[:, 1])}")
    print()


0.8725430503977506
0.7520809236961216

0.8932810552769802
0.7504822060131174

0.9099287737327777
0.7513154112038938



In [22]:
# Hyper-parameters used for the tuned XGBoost model on the combined dataset.
# NOTE(review): the model is scored as a binary problem (ROC AUC on column 1), yet the
# objective is the multi-class 'multi:softprob' with num_class=2 and eval_metric 'merror'.
# 'binary:logistic' with 'auc' may be the more direct setup; confirm before changing,
# since it would alter the reported scores.
# NOTE(review): 'seed' here and random_state=42 below both look like RNG seeds;
# confirm which one XGBoost honours.
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'seed': 2018,
    'eval_metric': 'merror',
    'objective': 'multi:softprob',
    'num_class': 2
}

xgb_after_opt = XGBClassifier(random_state=42, **xgb_params)
xgb_after_opt.fit(X_train, y_train)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_xgb_opt = xgb_after_opt.predict_proba(X_train)
y_proba_test_xgb_opt = xgb_after_opt.predict_proba(X_test)

# The '%' format type scales by 100 and rounds in one step; the previous
# round(x, 4) * 100 leaked float artifacts such as 83.02000000000001%.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_xgb_opt[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_xgb_opt[:, 1]):.2%}")

#pd.Series(xgb_after_opt.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh')
#plt.show()

Roc score for train: 83.02000000000001%
Roc score for test: 75.79%


* LGBM algorithm

In [26]:
lgb_clf = LGBMClassifier(objective='binary', random_state=42)

# Replace every non-alphanumeric character in each column name with "_"
# (presumably to satisfy LightGBM's restrictions on feature names; confirm).
# The renaming is applied to copies of BOTH splits: the shared X_train is no longer
# mutated in place (which left it out of sync with X_test for any cell re-run later),
# and the model is fitted and evaluated on identically named features.
X_train_lgb, X_test_lgb = (
    frame.rename(columns=lambda col: "".join(ch if ch.isalnum() else "_" for ch in str(col)))
    for frame in (X_train, X_test)
)

lgb_clf.fit(X_train_lgb, y_train)
y_pred_lgb = lgb_clf.predict(X_test_lgb)

# Class-probability estimates for both splits; column 1 is the one scored below.
y_proba_train_lgb = lgb_clf.predict_proba(X_train_lgb)
y_proba_test_lgb = lgb_clf.predict_proba(X_test_lgb)

print('\n')
# The '%' format type scales by 100 and rounds in one step, avoiding the float
# artifacts that round(x, 4) * 100 can produce.
print(f"Roc score for train: {roc_auc_score(y_train, y_proba_train_lgb[:, 1]):.2%}")
print(f"Roc score for test: {roc_auc_score(y_test, y_proba_test_lgb[:, 1]):.2%}")
#pd.Series(lgb_clf.feature_importances_, index=X_train.columns)[:20].sort_values().plot(kind='barh');

[LightGBM] [Info] Number of positive: 1493, number of negative: 12828
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13474
[LightGBM] [Info] Number of data points in the train set: 14321, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.104252 -> initscore=-2.150843
[LightGBM] [Info] Start training from score -2.150843


Roc score for train: 99.15%
Roc score for test: 74.24%


In [25]:
# ROC AUC scores as percentages for the summary table.
# Scale first and round last: round(x, 4) * 100 can reintroduce float noise
# (e.g. 83.02000000000001), whereas round(x * 100, 2) yields a clean two-decimal value.
random_forest_train_score = round(roc_auc_score(y_train, y_proba_train_rf[:, 1]) * 100, 2)
random_forest_test_score = round(roc_auc_score(y_test, y_proba_test_rf[:, 1]) * 100, 2)
# NOTE(review): this row uses the tuned XGBoost probabilities (y_proba_*_xgb_opt), while the
# "Main dataset" summary uses the baseline model's (y_proba_*_xgb); confirm which is intended.
xgboost_train_score = round(roc_auc_score(y_train, y_proba_train_xgb_opt[:, 1]) * 100, 2)
xgboost_test_score = round(roc_auc_score(y_test, y_proba_test_xgb_opt[:, 1]) * 100, 2)
lgbm_train_score = round(roc_auc_score(y_train, y_proba_train_lgb[:, 1]) * 100, 2)
lgbm_test_score = round(roc_auc_score(y_test, y_proba_test_lgb[:, 1]) * 100, 2)

# One row per algorithm: name, train score, test score.
table_data = [
    ["Random forest", random_forest_train_score, random_forest_test_score],
    ["XGBoost", xgboost_train_score, xgboost_test_score],
    ["LGBM", lgbm_train_score, lgbm_test_score]
]

headers = ["Algorithm", "Roc Train Score %", "Roc Test Score %"]

print(tabulate(table_data, headers, tablefmt="grid"))

+---------------+---------------------+--------------------+
| Algorithm     |   Roc Train Score % |   Roc Test Score % |
| Random forest |              100    |              72.02 |
+---------------+---------------------+--------------------+
| XGBoost       |               83.02 |              75.79 |
+---------------+---------------------+--------------------+
| LGBM          |               99.15 |              74.24 |
+---------------+---------------------+--------------------+
