In [10]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn import ensemble
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
import xgboost as xgb
import lightgbm as lgb
import numpy as np
from custom_xgb.custom_xgb import CustomXGBoost
from utils import evaluate

In [11]:
TRAIN_PATH = 'data/train.csv'
TEST_PATH = 'data/test.csv'

In [12]:
np.random.seed(0)

In [13]:
df = pd.read_csv(TRAIN_PATH)

cat_cols = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
num_cols = df.columns.difference(cat_cols)

X_train, y_train = df.iloc[:,:-1], df.iloc[:,-1]

print(y_train[y_train==1].count())
print(y_train[y_train==0].count())
X_train

7108
25891


Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32994,35.0,male,High School,87428.0,10,MORTGAGE,12888.0,MEDICAL,6.47,0.15,7.0,664,Yes
32995,26.0,female,Bachelor,91318.0,6,MORTGAGE,8000.0,VENTURE,14.92,0.09,5.0,590,Yes
32996,23.0,female,Bachelor,79749.0,0,RENT,16800.0,PERSONAL,11.28,0.21,2.0,632,No
32997,25.0,male,High School,51450.0,1,RENT,6000.0,MEDICAL,12.48,0.12,3.0,661,No


In [14]:
ohe = OneHotEncoder(drop='first')
transformer = Pipeline([
    ('ohe',ColumnTransformer(
        [('cat_trans', ohe, cat_cols)], 
        remainder='passthrough'
    )),
    ('scaler',StandardScaler()),
])

transformer.fit(X_train)
X_train = transformer.transform(X_train)

In [15]:
df = pd.read_csv(TEST_PATH)

X_test, y_test = transformer.transform(df.iloc[:,:-1]), df.iloc[:,-1]

# Training & Hyper-parameter Tuning

In [16]:
params = {
    'lr': [0.3],
    'n_estimators': [50],
    'reg_lambda': [1.0, 1.5],
    'row_subsample_ratio': [0.8],
    'max_depth': [4],
}
kf = StratifiedKFold(n_splits=2)
custom_xgb = GridSearchCV(
    CustomXGBoost(lr=0.3, n_estimators = 50, max_depth=5, row_subsample_ratio=0.5, reg_lambda=1.5),
    param_grid=params, 
    cv=kf, 
    scoring='f1',
    ).fit(X_train, y_train)
print('Best parameters:', custom_xgb.best_params_)
print('Best score:', custom_xgb.best_score_)
# ~ 3mins

Best parameters: {'lr': 0.3, 'max_depth': 4, 'n_estimators': 50, 'reg_lambda': 1.5, 'row_subsample_ratio': 0.8}
Best score: 0.7930617116514047


In [None]:
params = {
    'learning_rate': [0.1],
    'n_estimators': [200],
    'reg_lambda': [0, 0.1, 1],
    'row_subsample_ratio': [0.5, 0.8, 1.0],
    'num_leaves': [20,31],
}
kf = StratifiedKFold(n_splits=3)
lgbm = GridSearchCV(
    lgb.LGBMClassifier(**params, verbosity=-1),
    param_grid=params, 
    cv=kf, 
    scoring='f1',
    n_jobs=2,
    ).fit(X_train, y_train)
print('Best parameters:', lgbm.best_params_)
print('Best score:', lgbm.best_score_)
# ~ 30s

Best parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'reg_lambda': 0, 'row_subsample_ratio': 0.8}
Best score: 0.7712581670666765


In [None]:
params = {
    'learning_rate': [0.1],
    'n_estimators': [200],
    'reg_lambda': [0.5, 1.0, 1.5],
    'subsample': [0.5, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}
kf = StratifiedKFold(n_splits=3)
std_xgb = GridSearchCV(
    xgb.XGBClassifier(
        tree_method='hist',
        objective='binary:logistic',
        # class_weights={1:7108./32999, 0:25891./32999}
    ),
    param_grid=params, 
    cv=kf, 
    scoring='f1',
    n_jobs=2,
    ).fit(X_train, y_train)
print('Best parameters:', std_xgb.best_params_)
print('Best score:', std_xgb.best_score_)
# ~ 30s

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 1.0, 'subsample': 0.8}
Best score: 0.7759916833634044


In [19]:
# params = {
#     'learning_rate': [0.5, 1, 1.5],
#     'n_estimators': [100],
# }
# kf = StratifiedKFold(n_splits=3)
# ada = GridSearchCV(
#     ensemble.AdaBoostClassifier(algorithm='SAMME'),
#     param_grid=params,
#     cv=kf, 
#     scoring='accuracy',
#     n_jobs=2,
#     ).fit(X_train, y_train)
# print('Best parameters:', ada.best_params_)
# print('Best score:', ada.best_score_)
# ~ 30s

In [20]:
# rf = ensemble.RandomForestClassifier(n_estimators=100, criterion='entropy')
# rf = rf.fit(X_train, y_train)

In [22]:
log_reg = LogisticRegressionCV(scoring='f1')
log_reg = log_reg.fit(X_train, y_train)
log_reg.get_params()

{'Cs': 10,
 'class_weight': None,
 'cv': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'refit': True,
 'scoring': 'f1',
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0}

# Training Meta Classifier

In [23]:
meta_classifier = ensemble.StackingClassifier([
    ('lgbm', lgbm),
    ('std_xgb', std_xgb),
    ('custom_xgb', custom_xgb),
    # ('rf', rf),
    # ('ada', ada),
    ('logreg', log_reg),
], cv='prefit')

meta_classifier.fit(X_train, y_train)

# Results

## Individual Models

### Custom XGBoost

In [None]:
print('Train metrics:')
evaluate(custom_xgb, X_train, y_train)
print('\n')
print('Test metrics:')
evaluate(custom_xgb, X_test, y_test)
1

Train metrics:
Accuracy: 0.923088578441771
F1: 0.8108510955433
Precision: 0.8621236133122029
Recall: 0.7653348339898706


Test metrics:
Accuracy: 0.923
F1: 0.828125
Precision: 0.8961352657004831
Recall: 0.7697095435684648


(0.923, 0.828125, 0.8961352657004831, 0.828125)

### XGBoost (From Library)

In [None]:
print('Train metrics:')
evaluate(std_xgb, X_train, y_train)
print('\n')
print('Test metrics:')
evaluate(std_xgb, X_test, y_test)
1

Train metrics:
Accuracy: 0.9305130458498743
F1: 0.828893366166704
Precision: 0.8825679326235499
Recall: 0.7813731007315701


Test metrics:
Accuracy: 0.9283333333333333
F1: 0.8396120850428944
Precision: 0.911336032388664
Recall: 0.7783540802213001


(0.9283333333333333, 0.8396120850428944, 0.911336032388664, 0.8396120850428944)

### LightGBM

In [None]:
print('Train metrics:')
evaluate(lgbm, X_train, y_train)
print('\n')
print('Test metrics:')
evaluate(lgbm, X_test, y_test)
1

Train metrics:
Accuracy: 0.9482105518349041
F1: 0.8716292345827387
Precision: 0.9350523771152297
Recall: 0.8162633652222847


Test metrics:
Accuracy: 0.9330833333333334
F1: 0.8510480430346874
Precision: 0.9179671868747499
Recall: 0.793222683264177


(0.9330833333333334,
 0.8510480430346874,
 0.9179671868747499,
 0.8510480430346874)

### Logistic Regression

In [None]:
print('Train metrics:')
evaluate(log_reg, X_train, y_train)
print('\n')
print('Test metrics:')
evaluate(log_reg, X_test, y_test)
1

Train metrics:
Accuracy: 0.8954816812630686
F1: 0.7514592491172444
Precision: 0.7702762594179347
Recall: 0.7335396736072032


Test metrics:
Accuracy: 0.90075
F1: 0.7872834434720486
Precision: 0.8141854451422239
Recall: 0.7621023513139695


(0.90075, 0.7872834434720486, 0.8141854451422239, 0.7872834434720486)

## Stacked Result

In [28]:
y_train_pred = meta_classifier.predict(X_train)
y_train_pred

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [29]:
print('Train metrics:')
evaluate(meta_classifier, X_train, y_train)
print('\n')
print('Test metrics:')
evaluate(meta_classifier, X_test, y_test)
0

Train metrics:
Accuracy: 0.9550289402709173
F1: 0.8915363250986698
Precision: 0.9277456647398844
Recall: 0.8580472706809229


Test metrics:
Accuracy: 0.9300833333333334
F1: 0.848964896489649
Precision: 0.8854675178370259
Recall: 0.8153526970954357


0