In [1]:
# Base & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score

# Oversampling
from imblearn.combine import  SMOTETomek

# Modeling
from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression

# Scoring
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

# BayesianOptimizer
from bayes_opt import BayesianOptimization

# Saving Model
import joblib

# Data Generate

In [2]:
# Data Load
# Read the feature table, the label table and the test features, then mirror
# each frame as a plain NumPy array for the estimators below.
# NOTE(review): the paths are relative to the notebook's working directory.
train, label, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/part/one-hot_encoding/iterative/part_one_iter_train.csv',
        'data/raw_label.csv',
        'data/one_hot_test.csv',
    )
)

train_x, train_y, test_x = (np.array(frame) for frame in (train, label, test))

In [3]:
# Data split
# Hold out 30% of the rows for validation. Stratifying on the label keeps the
# class ratio equal in both parts; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.3, 'random_state': 17, 'stratify': train_y}
x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, **split_options)

In [4]:
# Oversampling - train
# Only the training split is resampled; x_val / y_val are left untouched so the
# validation scores are measured on the original class distribution.
# `fit_resample` is the supported entry point; the older `fit_sample` alias has
# been removed from recent imbalanced-learn releases.
tomek = SMOTETomek(random_state = 17, n_jobs = -1)
tomek_x_train, tomek_y_train = tomek.fit_resample(x_train, y_train)
print(tomek_x_train.shape, tomek_y_train.shape)

(17868, 53) (17868,)


In [5]:
# Making tomek_s_train & x_s_val for Task2
# Task2 models may only use the columns that also exist in the test file, so
# the resampled training rows and the validation rows are projected onto
# test.columns (the arrays are re-labelled with train.columns first).
task2_columns = test.columns
tomek_s_train = np.array(pd.DataFrame(tomek_x_train, columns = train.columns)[task2_columns])
x_s_val = np.array(pd.DataFrame(x_val, columns = train.columns)[task2_columns])

# Generate Hyperparameters by Model

In [6]:
# Search spaces for BayesianOptimization: each entry maps a hyperparameter name
# to its (lower, upper) bound. The optimiser samples floats; the objective
# functions round the integer-valued ones before building a model.

# Catboost
cb_pbounds = { 'n_estimators' : (50, 500),
               'learning_rate' : (0.001, 0.5),
               'max_depth' : (3, 10),
               'random_strength' : (0, 0.9),
               'l2_leaf_reg' : (1, 50) }

# XGB
xgb_pbounds = { 'n_estimators' : (50, 500),
                'learning_rate' : (0.001, 0.5),
                'max_depth' : (3, 10),
                'subsample' : (0.7, 1),
                'colsample_bytree' : (0.7, 1),
                'reg_alpha' : (0.01, 10),
                'reg_lambda' : (0.01, 10),
                'gamma' : (0.01, 10),
                'min_child_weight' : (1, 15) }

# LGBM
# NOTE(review): min_split_gain in (10, 100) is a very aggressive pruning range — confirm it is intended.
lgbm_pbounds = { 'n_estimators' : (50, 500),
                 'learning_rate' : (0.001, 0.5),
                 'max_depth' : (3, 10),
                 'min_child_samples' : (5,50),
                 'subsample' : (0.7, 1),
                 'colsample_bytree' : (0.7, 1),
                 'reg_alpha' : (0.01, 10),
                 'reg_lambda' : (0.01, 10),
                 'min_child_weight' : (1, 15),
                 'min_split_gain' : (10,100) }

# ExtraTree
# min_samples_split starts at 2: the objective rounds it to an int, and an
# integer min_samples_split below 2 is rejected by scikit-learn's tree
# ensembles, which would turn the CV score into NaN/an error for those samples.
et_pbounds = { 'n_estimators' : (50, 500), 'max_depth' : (3, 10),
               'min_samples_split' : (2, 15), 'min_samples_leaf' : (2, 15) }

# Logistic
lr_pbounds = { 'max_iter' : (50, 500), 'C' : (0.01, 1) }

# Task1 : Making a Label predict_proba at y_train

### Generate Function for Bayesian Optimizer (Classifier)

In [10]:
def cb_clf_opt(n_estimators, learning_rate, max_depth, random_strength, l2_leaf_reg):
    """Bayesian-optimisation objective for the Task1 CatBoost classifier.

    Builds a ``CatBoostClassifier`` from the sampled hyperparameters and scores
    it with 5-fold stratified cross-validation on ``tomek_x_train`` /
    ``tomek_y_train``.

    Parameters
    ----------
    n_estimators, max_depth : float
        Sampled as floats by the optimiser; rounded to the nearest int here.
    learning_rate, random_strength, l2_leaf_reg : float
        Passed to CatBoost unchanged.

    Returns
    -------
    float
        Mean macro-F1 across the five folds (the value to maximise).
    """
    
    params = { 'n_estimators' : int(round(n_estimators)),
               'learning_rate' : learning_rate,
               'max_depth' : int(round(max_depth)),
               'random_strength' : random_strength,
               # was `max_iter`, a name that does not exist in this function
               'l2_leaf_reg' : l2_leaf_reg }
    
    cb = CatBoostClassifier(**params)
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(cb, tomek_x_train, tomek_y_train, scoring = 'f1_macro', cv = skf, n_jobs = -1)
    
    return np.mean(score)

def xgb_clf_opt(n_estimators, learning_rate, max_depth, subsample, colsample_bytree, reg_alpha, reg_lambda, gamma, min_child_weight):
    """Score one XGBoost classifier configuration for the Task1 search.

    Every argument arrives as a float from the optimiser; the tree count and
    depth are rounded to integers, the rest are forwarded as-is. The candidate
    is evaluated with 5-fold stratified CV (shuffled, seed 17) on the resampled
    training data and the mean macro-F1 is returned.
    """
    candidate = XGBClassifier(
        n_estimators = int(round(n_estimators)),
        learning_rate = learning_rate,
        max_depth = int(round(max_depth)),
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        gamma = gamma,
        min_child_weight = min_child_weight,
        n_jobs = -1,
    )
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
    fold_scores = cross_val_score(
        candidate, tomek_x_train, tomek_y_train,
        scoring = 'f1_macro', cv = folds, n_jobs = -1,
    )
    return np.mean(fold_scores)

def lgbm_clf_opt(n_estimators, learning_rate, max_depth, min_child_samples, subsample, colsample_bytree, 
                 reg_alpha, reg_lambda, min_child_weight, min_split_gain):
    """Score one LightGBM classifier configuration for the Task1 search.

    ``n_estimators``, ``max_depth`` and ``min_child_samples`` are rounded to
    integers; the remaining sampled values are forwarded unchanged. Returns the
    mean macro-F1 of a 5-fold stratified CV (shuffled, seed 17) on the
    resampled training data.
    """
    candidate = LGBMClassifier(
        n_estimators = int(round(n_estimators)),
        learning_rate = learning_rate,
        max_depth = int(round(max_depth)),
        min_child_samples = int(round(min_child_samples)),
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        min_child_weight = min_child_weight,
        min_split_gain = min_split_gain,
        n_jobs = -1,
    )
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
    fold_scores = cross_val_score(
        candidate, tomek_x_train, tomek_y_train,
        scoring = 'f1_macro', cv = folds, n_jobs = -1,
    )
    return np.mean(fold_scores)

def et_clf_opt(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Score one ExtraTrees classifier configuration for the Task1 search.

    All four sampled values are rounded to the nearest integer before the
    model is built. Returns the mean macro-F1 of a 5-fold stratified CV
    (shuffled, seed 17) on the resampled training data.
    """
    sampled = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    }
    # The optimiser works in a continuous space; every setting here is integer-valued.
    int_settings = {name: int(round(value)) for name, value in sampled.items()}

    candidate = ExtraTreesClassifier(n_jobs = -1, **int_settings)
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
    fold_scores = cross_val_score(
        candidate, tomek_x_train, tomek_y_train,
        scoring = 'f1_macro', cv = folds, n_jobs = -1,
    )
    return np.mean(fold_scores)

def lr_clf_opt(max_iter, C):
    """Bayesian-optimisation objective for the Task1 logistic regression.

    The first parameter is named ``max_iter`` so that it matches the keys of
    ``lr_pbounds`` (the optimiser calls the objective with those keys as
    keyword arguments) and the name used in the body; it was previously
    declared as ``n_estimators``, which left ``max_iter`` undefined.

    Parameters
    ----------
    max_iter : float
        Iteration cap, rounded to the nearest int.
    C : float
        Inverse regularisation strength, passed through unchanged.

    Returns
    -------
    float
        Mean macro-F1 of a 5-fold stratified CV on the resampled training data.
    """
    
    params = { 'max_iter' : int(round(max_iter)), 'C' : C, 'n_jobs' : -1 }
    
    lr = LogisticRegression(**params)
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(lr, tomek_x_train, tomek_y_train, scoring = 'f1_macro', cv = skf, n_jobs = -1)
    
    return np.mean(score)

### Generate BayesianOptimization

In [11]:
# One optimiser per Task1 model family: each pairs an objective function with
# its search space and shares the same seed.
BO_cb_clf, BO_xgb_clf, BO_lgbm_clf, BO_et_clf, BO_lr_clf = (
    BayesianOptimization(f = objective, pbounds = bounds, random_state=17)
    for objective, bounds in (
        (cb_clf_opt, cb_pbounds),
        (xgb_clf_opt, xgb_pbounds),
        (lgbm_clf_opt, lgbm_pbounds),
        (et_clf_opt, et_pbounds),
        (lr_clf_opt, lr_pbounds),
    )
)

### Optimizer Maximize

In [13]:
# Catboost Maximize
# 50 random warm-up probes, then 150 guided iterations with the 'poi' acquisition.
# NOTE(review): newer bayes_opt releases may not accept `acq` on maximize() — confirm against the installed version.
BO_cb_clf.maximize(
    init_points=50,
    n_iter=150,
    acq = 'poi',
)

|   iter    |  target   | l2_lea... | learni... | max_depth | n_esti... | random... |
-------------------------------------------------------------------------------------
| [95m2        [0m | [95m0.9739   [0m | [95m47.34    [0m | [95m0.03096  [0m | [95m9.048    [0m | [95m444.8    [0m | [95m0.04607  [0m |
| [95m3        [0m | [95m0.9752   [0m | [95m32.97    [0m | [95m0.2763   [0m | [95m7.183    [0m | [95m267.6    [0m | [95m0.2547   [0m |
| [0m4        [0m | [0m0.974    [0m | [0m15.59    [0m | [0m0.2812   [0m | [0m5.772    [0m | [0m404.9    [0m | [0m0.3766   [0m |
| [0m5        [0m | [0m0.9728   [0m | [0m8.051    [0m | [0m0.0763   [0m | [0m3.387    [0m | [0m373.1    [0m | [0m0.2631   [0m |
| [0m6        [0m | [0m0.9731   [0m | [0m10.74    [0m | [0m0.4159   [0m | [0m6.976    [0m | [0m87.05    [0m | [0m0.4905   [0m |
| [0m7        [0m | [0m0.9728   [0m | [0m8.789    [0m | [0m0.3387   [0m | [0m3.829    [0m | [

| [0m59       [0m | [0m0.9693   [0m | [0m35.76    [0m | [0m0.03196  [0m | [0m9.832    [0m | [0m185.8    [0m | [0m0.5912   [0m |
| [0m60       [0m | [0m0.975    [0m | [0m39.98    [0m | [0m0.4377   [0m | [0m7.942    [0m | [0m348.5    [0m | [0m0.4312   [0m |
| [0m61       [0m | [0m0.975    [0m | [0m18.59    [0m | [0m0.4772   [0m | [0m7.264    [0m | [0m435.6    [0m | [0m0.7924   [0m |
| [0m62       [0m | [0m0.97     [0m | [0m9.781    [0m | [0m0.1092   [0m | [0m3.248    [0m | [0m117.7    [0m | [0m0.1038   [0m |
| [0m63       [0m | [0m0.9736   [0m | [0m45.28    [0m | [0m0.4901   [0m | [0m6.569    [0m | [0m250.9    [0m | [0m0.6162   [0m |
| [0m64       [0m | [0m0.9737   [0m | [0m7.079    [0m | [0m0.3497   [0m | [0m6.076    [0m | [0m345.1    [0m | [0m0.7173   [0m |
| [0m65       [0m | [0m0.9727   [0m | [0m27.87    [0m | [0m0.1533   [0m | [0m5.304    [0m | [0m144.7    [0m | [0m0.7087   [0m |
| [0m

| [0m117      [0m | [0m0.9756   [0m | [0m4.435    [0m | [0m0.1745   [0m | [0m7.126    [0m | [0m372.7    [0m | [0m0.1458   [0m |
| [0m118      [0m | [0m0.9531   [0m | [0m26.13    [0m | [0m0.00447  [0m | [0m9.288    [0m | [0m424.7    [0m | [0m0.1655   [0m |
| [0m119      [0m | [0m0.9755   [0m | [0m4.051    [0m | [0m0.1852   [0m | [0m6.66     [0m | [0m372.8    [0m | [0m0.7373   [0m |
| [0m120      [0m | [0m0.9752   [0m | [0m46.12    [0m | [0m0.2889   [0m | [0m8.484    [0m | [0m363.1    [0m | [0m0.1357   [0m |
| [0m121      [0m | [0m0.9746   [0m | [0m46.07    [0m | [0m0.3548   [0m | [0m7.638    [0m | [0m494.3    [0m | [0m0.5338   [0m |
| [95m122      [0m | [95m0.9772   [0m | [95m1.929    [0m | [95m0.2464   [0m | [95m9.697    [0m | [95m291.9    [0m | [95m0.3514   [0m |
| [0m123      [0m | [0m0.9763   [0m | [0m4.157    [0m | [0m0.21     [0m | [0m7.541    [0m | [0m373.9    [0m | [0m0.7419   [0m |

| [0m175      [0m | [0m0.9736   [0m | [0m18.25    [0m | [0m0.03035  [0m | [0m9.985    [0m | [0m279.9    [0m | [0m0.4545   [0m |
| [0m176      [0m | [0m0.971    [0m | [0m22.47    [0m | [0m0.2191   [0m | [0m3.077    [0m | [0m109.1    [0m | [0m0.5347   [0m |
| [0m177      [0m | [0m0.9748   [0m | [0m17.78    [0m | [0m0.4574   [0m | [0m7.654    [0m | [0m281.0    [0m | [0m0.6082   [0m |
| [0m178      [0m | [0m0.9741   [0m | [0m13.26    [0m | [0m0.4208   [0m | [0m4.439    [0m | [0m336.9    [0m | [0m0.4135   [0m |
| [0m179      [0m | [0m0.9743   [0m | [0m2.367    [0m | [0m0.4799   [0m | [0m9.302    [0m | [0m358.2    [0m | [0m0.4284   [0m |
| [0m180      [0m | [0m0.9734   [0m | [0m38.97    [0m | [0m0.07933  [0m | [0m9.875    [0m | [0m214.0    [0m | [0m0.7928   [0m |
| [0m181      [0m | [0m0.9759   [0m | [0m1.962    [0m | [0m0.1515   [0m | [0m8.232    [0m | [0m289.8    [0m | [0m0.2042   [0m |
| [0m

| [0m233      [0m | [0m0.9739   [0m | [0m21.57    [0m | [0m0.485    [0m | [0m6.858    [0m | [0m263.1    [0m | [0m0.09545  [0m |
| [0m234      [0m | [0m0.9766   [0m | [0m1.392    [0m | [0m0.1942   [0m | [0m8.668    [0m | [0m289.5    [0m | [0m0.08316  [0m |
| [0m235      [0m | [0m0.9736   [0m | [0m14.7     [0m | [0m0.1374   [0m | [0m5.246    [0m | [0m133.0    [0m | [0m0.1908   [0m |
| [0m236      [0m | [0m0.9736   [0m | [0m45.17    [0m | [0m0.3711   [0m | [0m4.168    [0m | [0m481.0    [0m | [0m0.3645   [0m |
| [0m237      [0m | [0m0.9743   [0m | [0m48.25    [0m | [0m0.3601   [0m | [0m8.591    [0m | [0m151.1    [0m | [0m0.4667   [0m |
| [0m238      [0m | [0m0.9766   [0m | [0m1.923    [0m | [0m0.2381   [0m | [0m8.752    [0m | [0m289.1    [0m | [0m0.7409   [0m |
| [0m239      [0m | [0m0.9749   [0m | [0m9.467    [0m | [0m0.1918   [0m | [0m5.751    [0m | [0m393.6    [0m | [0m0.4987   [0m |
| [0m

In [14]:
# XGB Maximize
# 50 random warm-up probes, then 150 guided iterations with the 'poi' acquisition.
BO_xgb_clf.maximize(
    init_points=50,
    n_iter=150,
    acq = 'poi',
)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9708   [0m | [0m0.7884   [0m | [0m5.311    [0m | [0m0.09657  [0m | [0m3.475    [0m | [0m12.02    [0m | [0m345.4    [0m | [0m6.379    [0m | [0m5.76     [0m | [0m0.7117   [0m |
| [0m2        [0m | [0m0.9532   [0m | [0m0.8073   [0m | [0m9.457    [0m | [0m0.03096  [0m | [0m9.048    [0m | [0m13.28    [0m | [0m73.04    [0m | [0m6.528    [0m | [0m5.522    [0m | [0m0.8793   [0m |
| [95m3        [0m | [95m0.9748   [0m | [95m0.8451   [0m | [95m2.837    [0m | [95m0.1496   [0m | [95m6.931    [0m | [95m6.545    [0m | [95m404.9    [0m | [95m4.191    [0m | [95m1.448    [0m | [95m0.7453   [0m |
| [0m4        [0m | [0m0.9726   [0m | [0m0.7166   

| [0m37       [0m | [0m0.9722   [0m | [0m0.9391   [0m | [0m5.488    [0m | [0m0.1533   [0m | [0m5.304    [0m | [0m3.946    [0m | [0m404.4    [0m | [0m4.048    [0m | [0m7.102    [0m | [0m0.7863   [0m |
| [0m38       [0m | [0m0.9747   [0m | [0m0.7858   [0m | [0m3.34     [0m | [0m0.2532   [0m | [0m9.453    [0m | [0m11.1     [0m | [0m170.1    [0m | [0m0.384    [0m | [0m4.428    [0m | [0m0.7869   [0m |
| [0m39       [0m | [0m0.974    [0m | [0m0.9316   [0m | [0m4.012    [0m | [0m0.09393  [0m | [0m9.218    [0m | [0m12.33    [0m | [0m204.0    [0m | [0m2.884    [0m | [0m6.203    [0m | [0m0.9314   [0m |
| [0m40       [0m | [0m0.9739   [0m | [0m0.9106   [0m | [0m8.793    [0m | [0m0.2943   [0m | [0m4.977    [0m | [0m9.462    [0m | [0m340.0    [0m | [0m0.7467   [0m | [0m0.9894   [0m | [0m0.8237   [0m |
| [0m41       [0m | [0m0.9746   [0m | [0m0.8013   [0m | [0m3.453    [0m | [0m0.2471   [0m | [0m5.214 

| [0m74       [0m | [0m0.9749   [0m | [0m0.9658   [0m | [0m1.716    [0m | [0m0.2157   [0m | [0m7.208    [0m | [0m6.075    [0m | [0m405.3    [0m | [0m2.532    [0m | [0m2.498    [0m | [0m0.7547   [0m |
| [0m75       [0m | [0m0.9383   [0m | [0m0.7646   [0m | [0m2.31     [0m | [0m0.001216 [0m | [0m6.293    [0m | [0m3.762    [0m | [0m246.8    [0m | [0m2.707    [0m | [0m5.811    [0m | [0m0.9249   [0m |
| [0m76       [0m | [0m0.9733   [0m | [0m0.7551   [0m | [0m2.109    [0m | [0m0.3882   [0m | [0m6.785    [0m | [0m14.24    [0m | [0m168.1    [0m | [0m5.187    [0m | [0m8.813    [0m | [0m0.7337   [0m |
| [0m77       [0m | [0m0.9732   [0m | [0m0.864    [0m | [0m6.886    [0m | [0m0.06715  [0m | [0m6.487    [0m | [0m8.187    [0m | [0m338.6    [0m | [0m1.507    [0m | [0m6.243    [0m | [0m0.8767   [0m |
| [0m78       [0m | [0m0.9709   [0m | [0m0.9026   [0m | [0m8.195    [0m | [0m0.3745   [0m | [0m6.534 

| [0m111      [0m | [0m0.9753   [0m | [0m0.9079   [0m | [0m1.679    [0m | [0m0.1713   [0m | [0m7.342    [0m | [0m5.778    [0m | [0m404.7    [0m | [0m3.917    [0m | [0m0.4521   [0m | [0m0.8816   [0m |
| [95m112      [0m | [95m0.9758   [0m | [95m0.7807   [0m | [95m1.099    [0m | [95m0.206    [0m | [95m7.783    [0m | [95m5.639    [0m | [95m404.8    [0m | [95m3.448    [0m | [95m2.583    [0m | [95m0.9521   [0m |
| [0m113      [0m | [0m0.9739   [0m | [0m0.8499   [0m | [0m6.035    [0m | [0m0.3013   [0m | [0m9.178    [0m | [0m3.151    [0m | [0m415.7    [0m | [0m3.853    [0m | [0m0.3479   [0m | [0m0.8482   [0m |
| [0m114      [0m | [0m0.9733   [0m | [0m0.9082   [0m | [0m7.132    [0m | [0m0.2147   [0m | [0m6.14     [0m | [0m13.16    [0m | [0m436.3    [0m | [0m2.305    [0m | [0m1.004    [0m | [0m0.9025   [0m |
| [0m115      [0m | [0m0.9752   [0m | [0m0.9713   [0m | [0m0.9514   [0m | [0m0.08108  [0m |



| [0m120      [0m | [0m0.9749   [0m | [0m0.9627   [0m | [0m0.1561   [0m | [0m0.1064   [0m | [0m7.044    [0m | [0m6.021    [0m | [0m407.1    [0m | [0m3.32     [0m | [0m3.376    [0m | [0m0.8046   [0m |
| [0m121      [0m | [0m0.9751   [0m | [0m0.7445   [0m | [0m1.987    [0m | [0m0.2922   [0m | [0m7.19     [0m | [0m5.127    [0m | [0m404.8    [0m | [0m2.339    [0m | [0m2.404    [0m | [0m0.9908   [0m |
| [0m122      [0m | [0m0.9737   [0m | [0m0.7211   [0m | [0m6.834    [0m | [0m0.332    [0m | [0m5.87     [0m | [0m12.09    [0m | [0m436.0    [0m | [0m2.593    [0m | [0m0.7125   [0m | [0m0.8883   [0m |
| [0m123      [0m | [0m0.9753   [0m | [0m0.7979   [0m | [0m3.57     [0m | [0m0.1006   [0m | [0m7.864    [0m | [0m6.491    [0m | [0m403.6    [0m | [0m2.944    [0m | [0m3.009    [0m | [0m0.9265   [0m |
| [0m124      [0m | [0m0.9752   [0m | [0m0.876    [0m | [0m2.285    [0m | [0m0.1163   [0m | [0m7.165 

KeyboardInterrupt: 

In [None]:
# LGBM Maximize
# 50 random warm-up probes, then 150 guided iterations with the 'poi' acquisition.
BO_lgbm_clf.maximize(
    init_points=50,
    n_iter=150,
    acq = 'poi',
)

In [None]:
# ExtraTrees Maximize
# 50 random warm-up probes, then 150 guided iterations with the 'poi' acquisition.
BO_et_clf.maximize(
    init_points=50,
    n_iter=150,
    acq = 'poi',
)

In [None]:
# Logistic Maximize
# 50 random warm-up probes, then 150 guided iterations with the 'poi' acquisition.
BO_lr_clf.maximize(
    init_points=50,
    n_iter=150,
    acq = 'poi',
)

### Change Max parameters from float to int

In [None]:
# Catboost / XGB Max Parameters
# The optimiser reports every best value as a float; the integer-valued
# settings are cast in place so the dicts can be splatted into the estimators.
cb_clf_max_params = BO_cb_clf.max['params']
xgb_clf_max_params = BO_xgb_clf.max['params']

for best_params in (cb_clf_max_params, xgb_clf_max_params):
    for int_key in ('n_estimators', 'max_depth'):
        best_params[int_key] = int(round(best_params[int_key]))

In [None]:
# LGBM / ExtraTrees / Logistic Max Parameters
# Same float-to-int clean-up as for CatBoost and XGB, with the integer-valued
# keys listed per model family.
lgbm_clf_max_params = BO_lgbm_clf.max['params']
et_clf_max_params = BO_et_clf.max['params']
lr_clf_max_params = BO_lr_clf.max['params']

for best_params, int_keys in (
    (lgbm_clf_max_params, ('n_estimators', 'max_depth', 'min_child_samples')),
    (et_clf_max_params, ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')),
    (lr_clf_max_params, ('max_iter',)),
):
    for int_key in int_keys:
        best_params[int_key] = int(round(best_params[int_key]))

### Retraining the Model with best parameters

In [None]:
# Rebuild CatBoost and XGB with their best parameters, then fit both on the
# full resampled training set.
tune_cb_clf = CatBoostClassifier(**cb_clf_max_params)
tune_xgb_clf = XGBClassifier(**xgb_clf_max_params)

tune_cb_clf.fit(tomek_x_train, tomek_y_train)
tune_xgb_clf.fit(tomek_x_train, tomek_y_train)

In [None]:
# Rebuild LGBM, ExtraTrees and logistic regression with their best parameters,
# then fit each on the full resampled training set.
tune_lgbm_clf = LGBMClassifier(**lgbm_clf_max_params)
tune_et_clf = ExtraTreesClassifier(**et_clf_max_params)
tune_lr_clf = LogisticRegression(**lr_clf_max_params)

tune_lgbm_clf.fit(tomek_x_train, tomek_y_train)
tune_et_clf.fit(tomek_x_train, tomek_y_train)
tune_lr_clf.fit(tomek_x_train, tomek_y_train)

### Validation Check

In [None]:
# Predict x_val by Tuned Model
# Hard class predictions on the untouched validation split.
tune_cb_clf_pred, tune_xgb_clf_pred = (
    tuned_clf.predict(x_val) for tuned_clf in (tune_cb_clf, tune_xgb_clf)
)

In [None]:
# Hard class predictions on the validation split for the remaining Task1 models.
tune_lgbm_clf_pred, tune_et_clf_pred, tune_lr_clf_pred = (
    tuned_clf.predict(x_val) for tuned_clf in (tune_lgbm_clf, tune_et_clf, tune_lr_clf)
)

In [None]:
# Validation Score
# Macro-averaged F1 on the validation split. Only CatBoost and XGB are scored;
# the LGBM / ExtraTrees / Logistic lines are kept disabled.
macro_average = {'average': 'macro'}
tune_cb_clf_val_score = f1_score(y_val, tune_cb_clf_pred, **macro_average)
tune_xgb_clf_val_score = f1_score(y_val, tune_xgb_clf_pred, **macro_average)
#tune_lgbm_clf_val_score = f1_score(y_val, tune_lgbm_clf_pred, average = 'macro')
#tune_et_clf_val_score = f1_score(y_val, tune_et_clf_pred, average = 'macro')
#tune_lr_clf_val_score = f1_score(y_val, tune_lr_clf_pred, average = 'macro')
print(f'Catboost F1 score:{tune_cb_clf_val_score}\nXGB F1 score:{tune_xgb_clf_val_score}')
#print(f'LGBM F1 score:{tune_lgbm_clf_val_score}\nExtraTrees F1 score:{tune_et_clf_val_score}\nLogistic F1 score:{tune_lr_clf_val_score}')

### Generate predict_proba

In [None]:
# Positive-class probability (column 1 of predict_proba) for every resampled
# training row. Slicing the returned 2-D array replaces the previous
# `np.arrary(<generator>)` form, which misspelled `np.array` and would not have
# produced a 1-D numeric array from a generator anyway.
tune_cb_clf_pred_proba = tune_cb_clf.predict_proba(tomek_x_train)[:, 1]
tune_xgb_clf_pred_proba = tune_xgb_clf.predict_proba(tomek_x_train)[:, 1]
#tune_lgbm_clf_pred_proba = tune_lgbm_clf.predict_proba(tomek_x_train)[:, 1]
#tune_et_clf_pred_proba = tune_et_clf.predict_proba(tomek_x_train)[:, 1]
#tune_lr_clf_pred_proba = tune_lr_clf.predict_proba(tomek_x_train)[:, 1]

## Saving Task1 Model

In [None]:
# Persist the tuned Task1 classifiers. Only CatBoost and XGB are saved; the
# other three dumps stay disabled.
joblib.dump(value = tune_cb_clf, filename = 'model_save/tuning/task1_cb_clf.pkl')
joblib.dump(value = tune_xgb_clf, filename = 'model_save/tuning/task1_xgb_clf.pkl')
#joblib.dump(tune_lgbm_clf, 'model_save/tuning/task1_lgbm_clf.pkl')
#joblib.dump(tune_et_clf, 'model_save/tuning/task1_et_clf.pkl')
#joblib.dump(tune_lr_clf, 'model_save/tuning/task1_lr_clf.pkl')

# Task2 : Predict Test Label by Regressor

### Generate Function for Bayesian Optimizer (Regressor)

In [None]:
def cb_reg_opt(n_estimators, learning_rate, max_depth, random_strength, l2_leaf_reg):
    """Bayesian-optimisation objective for the Task2 CatBoost regressor.

    Builds a ``CatBoostRegressor`` from the sampled hyperparameters
    (``n_estimators`` and ``max_depth`` rounded to ints) and scores it with a
    shuffled 5-fold CV on ``tomek_s_train``.

    Returns
    -------
    float
        Mean negative MSE across the folds (closer to zero is better, so the
        optimiser can maximise it).

    NOTE(review): the CV target was a literal ``***`` placeholder, which is a
    syntax error. ``tomek_y_train`` is used because the refit cell fits the
    Task2 models on (tomek_s_train, tomek_y_train). If the Task1 predict_proba
    output is the intended target, change it here and in the refit cell — confirm.
    """
    
    params = { 'n_estimators' : int(round(n_estimators)),
               'learning_rate' : learning_rate,
               'max_depth' : int(round(max_depth)),
               'random_strength' : random_strength,
               'l2_leaf_reg' : l2_leaf_reg }
    
    cb = CatBoostRegressor(**params)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(cb, tomek_s_train, tomek_y_train, scoring = 'neg_mean_squared_error', cv = skf, n_jobs = -1)
    
    return np.mean(score)

def xgb_reg_opt(n_estimators, learning_rate, max_depth, subsample, colsample_bytree, 
                reg_alpha, reg_lambda, gamma, min_child_weight):
    """Bayesian-optimisation objective for the Task2 XGBoost regressor.

    Builds an ``XGBRegressor`` from the sampled hyperparameters
    (``n_estimators`` and ``max_depth`` rounded to ints) and scores it with a
    shuffled 5-fold CV on ``tomek_s_train``.

    Returns
    -------
    float
        Mean negative MSE across the folds (closer to zero is better).

    NOTE(review): the CV target was a literal ``***`` placeholder, which is a
    syntax error. ``tomek_y_train`` is used because the refit cell fits the
    Task2 models on (tomek_s_train, tomek_y_train). If the Task1 predict_proba
    output is the intended target, change it here and in the refit cell — confirm.
    """
    
    params = { 'n_estimators' : int(round(n_estimators)),
               'learning_rate' : learning_rate,
               'max_depth' : int(round(max_depth)),
               'subsample' : subsample,
               'colsample_bytree' : colsample_bytree,
               'reg_alpha' : reg_alpha,
               'reg_lambda' : reg_lambda,
               'gamma' : gamma,
               'min_child_weight' : min_child_weight, 'n_jobs' : -1 }
    
    xgb = XGBRegressor(**params)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(xgb, tomek_s_train, tomek_y_train, scoring = 'neg_mean_squared_error', cv = skf, n_jobs = -1)
    
    return np.mean(score)

def lgbm_reg_opt(n_estimators, learning_rate, max_depth, min_child_samples, subsample, colsample_bytree, 
                reg_alpha, reg_lambda, min_child_weight, min_split_gain):
    """Bayesian-optimisation objective for the Task2 LightGBM regressor.

    Builds an ``LGBMRegressor`` from the sampled hyperparameters
    (``n_estimators``, ``max_depth`` and ``min_child_samples`` rounded to ints)
    and scores it with a shuffled 5-fold CV on ``tomek_s_train``.

    Returns
    -------
    float
        Mean negative MSE across the folds (closer to zero is better).

    NOTE(review): the CV target was a literal ``***`` placeholder, which is a
    syntax error. ``tomek_y_train`` is used because the refit cell fits the
    Task2 models on (tomek_s_train, tomek_y_train). If the Task1 predict_proba
    output is the intended target, change it here and in the refit cell — confirm.
    """
    
    params = { 'n_estimators' : int(round(n_estimators)),
               'learning_rate' : learning_rate,
               'max_depth' : int(round(max_depth)),
               'min_child_samples' : int(round(min_child_samples)),
               'subsample' : subsample,
               'colsample_bytree' : colsample_bytree,
               'reg_alpha' : reg_alpha,
               'reg_lambda' : reg_lambda,
               'min_child_weight' : min_child_weight,
               'min_split_gain' : min_split_gain, 'n_jobs' : -1 }
    
    lgbm = LGBMRegressor(**params)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(lgbm, tomek_s_train, tomek_y_train, scoring = 'neg_mean_squared_error', cv = skf, n_jobs = -1)
    
    return np.mean(score)

def et_reg_opt(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Bayesian-optimisation objective for the Task2 ExtraTrees regressor.

    All four sampled values are rounded to ints before the
    ``ExtraTreesRegressor`` is built; the model is scored with a shuffled
    5-fold CV on ``tomek_s_train``.

    Returns
    -------
    float
        Mean negative MSE across the folds (closer to zero is better).

    NOTE(review): the CV target was a literal ``***`` placeholder, which is a
    syntax error. ``tomek_y_train`` is used because the refit cell fits the
    Task2 models on (tomek_s_train, tomek_y_train). If the Task1 predict_proba
    output is the intended target, change it here and in the refit cell — confirm.
    """
    
    params = { 'n_estimators' : int(round(n_estimators)),
               'max_depth' : int(round(max_depth)),
               'min_samples_split' : int(round(min_samples_split)),
               'min_samples_leaf' : int(round(min_samples_leaf)), 'n_jobs' : -1 }
    
    et = ExtraTreesRegressor(**params)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(et, tomek_s_train, tomek_y_train, scoring = 'neg_mean_squared_error', cv = skf, n_jobs = -1)
    
    return np.mean(score)

def lr_reg_opt(max_iter, C):
    """Bayesian-optimisation objective for the Task2 logistic-regression model.

    The first parameter is named ``max_iter`` so that it matches the keys of
    ``lr_pbounds`` and the name used in the body; it was previously declared
    as ``n_estimators``, which left ``max_iter`` undefined.

    Parameters
    ----------
    max_iter : float
        Iteration cap, rounded to the nearest int.
    C : float
        Inverse regularisation strength, passed through unchanged.

    Returns
    -------
    float
        Mean negative MSE of a shuffled 5-fold CV on ``tomek_s_train``.

    NOTE(review): the CV target was a literal ``***`` placeholder, which is a
    syntax error. ``tomek_y_train`` is used because the refit cell fits the
    Task2 models on (tomek_s_train, tomek_y_train). LogisticRegression is a
    classifier, so it needs discrete labels as its target — confirm this model
    belongs in the regressor line-up.
    """
    
    params = { 'max_iter' : int(round(max_iter)), 'C' : C, 'n_jobs' : -1 }
    
    lr = LogisticRegression(**params)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 17)
    score = cross_val_score(lr, tomek_s_train, tomek_y_train, scoring = 'neg_mean_squared_error', cv = skf, n_jobs = -1)
    
    return np.mean(score)

### Generate BayesianOptimization

In [None]:
# One optimiser per Task2 model family, reusing the Task1 search spaces and
# the same seed.
BO_cb_reg, BO_xgb_reg, BO_lgbm_reg, BO_et_reg, BO_lr_reg = (
    BayesianOptimization(f = objective, pbounds = bounds, random_state=17)
    for objective, bounds in (
        (cb_reg_opt, cb_pbounds),
        (xgb_reg_opt, xgb_pbounds),
        (lgbm_reg_opt, lgbm_pbounds),
        (et_reg_opt, et_pbounds),
        (lr_reg_opt, lr_pbounds),
    )
)

### Optimizer Maximize

In [None]:
# Run the five Task2 searches one after another with identical settings:
# 50 random warm-up probes, then 150 guided iterations with the 'ei'
# acquisition and xi = 0.01.
for task2_optimizer in (BO_cb_reg, BO_xgb_reg, BO_lgbm_reg, BO_et_reg, BO_lr_reg):
    task2_optimizer.maximize(init_points=50, n_iter=150, acq = 'ei', xi = 0.01)

### Change Max parameters from float to int

In [None]:
# Best Task2 parameters per model family.
# The optimiser reports every value as a float; the integer-valued settings
# are cast in place so each dict can be splatted into its estimator.
cb_reg_max_params = BO_cb_reg.max['params']
xgb_reg_max_params = BO_xgb_reg.max['params']
lgbm_reg_max_params = BO_lgbm_reg.max['params']
et_reg_max_params = BO_et_reg.max['params']
lr_reg_max_params = BO_lr_reg.max['params']

for best_params, int_keys in (
    (cb_reg_max_params, ('n_estimators', 'max_depth')),
    (xgb_reg_max_params, ('n_estimators', 'max_depth')),
    (lgbm_reg_max_params, ('n_estimators', 'max_depth', 'min_child_samples')),
    (et_reg_max_params, ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')),
    (lr_reg_max_params, ('max_iter',)),
):
    for int_key in int_keys:
        best_params[int_key] = int(round(best_params[int_key]))

### Retraining the Model with best parameters

In [None]:
# Refit every Task2 model with its best parameters on the test-column subset
# of the resampled training data.
tune_cb_reg = CatBoostRegressor(**cb_reg_max_params)
tune_cb_reg.fit(tomek_s_train,tomek_y_train)

tune_xgb_reg = XGBRegressor(**xgb_reg_max_params)
tune_xgb_reg.fit(tomek_s_train,tomek_y_train)

tune_lgbm_reg = LGBMRegressor(**lgbm_reg_max_params)
tune_lgbm_reg.fit(tomek_s_train,tomek_y_train)

tune_et_reg = ExtraTreesRegressor(**et_reg_max_params)
tune_et_reg.fit(tomek_s_train,tomek_y_train)

# Class name fixed: it was misspelled `LogisticRegressionr`, which is not defined.
tune_lr_reg = LogisticRegression(**lr_reg_max_params)
tune_lr_reg.fit(tomek_s_train,tomek_y_train)

### Validation Check

In [None]:
# Predict x_s_val by Tuned Model
# Raw Task2 model outputs on the validation rows (test-column subset).
(
    tune_cb_reg_val_pred,
    tune_xgb_reg_val_pred,
    tune_lgbm_reg_val_pred,
    tune_et_reg_val_pred,
    tune_lr_reg_val_pred,
) = (
    tuned_reg.predict(x_s_val)
    for tuned_reg in (tune_cb_reg, tune_xgb_reg, tune_lgbm_reg, tune_et_reg, tune_lr_reg)
)

In [None]:
# Visualize Validation predict
cb_val_pred = pd.DataFrame({'predict':tune_cb_reg_val_pred})
xgb_val_pred = pd.DataFrame({'predict':tune_xgb_reg_val_pred})
lgbm_val_pred = pd.DataFrame({'predict':tune_lgbm_reg_val_pred})
et_val_pred = pd.DataFrame({'predict':tune_et_reg_val_pred})
lr_val_pred = pd.DataFrame({'predict':tune_lr_reg_val_pred})

# Draw all five densities in ONE 3x2 figure. Calling plt.show() after every
# subplot (as before) renders the figure early, so the later panels end up in
# separate default-sized figures instead of the intended grid.
val_panels = [
    (cb_val_pred, 'Catboost Predict KDE (val)'),
    (xgb_val_pred, 'XGB Predict KDE (val)'),
    (lgbm_val_pred, 'LGBM Predict KDE (val)'),
    (et_val_pred, 'ExtraTrees Predict KDE (val)'),
    (lr_val_pred, 'Logistic Predict KDE (val)'),
]

fig, axes = plt.subplots(3, 2, figsize=(20,10))
panel_axes = axes.ravel()

for panel_ax, (pred_frame, panel_title) in zip(panel_axes, val_panels):
    sns.kdeplot(pred_frame['predict'], ax=panel_ax)
    panel_ax.set_title(panel_title)

# Five panels in a six-slot grid: hide the unused axes.
for unused_ax in panel_axes[len(val_panels):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Check the rows around the 8.5% rank of the validation predictions
# Each model's predictions are sorted from highest to lowest and positions
# 510..520 are printed so a cut-off value can be read off.
# NOTE(review): 510-520 is the 8.5% mark only for a vector of roughly 6,000
# rows — confirm it is the right window for the validation split.
rank_window = slice(510, 521)
for val_frame in (cb_val_pred, xgb_val_pred, lgbm_val_pred, et_val_pred, lr_val_pred):
    print(val_frame['predict'].sort_values(ascending = False).iloc[rank_window])

In [None]:
# predict transform by threshold
# The cut-off was a literal `*` placeholder (a syntax error). It is now derived
# from the data: rows at or above the (1 - rate) quantile of a model's
# predictions become 1, everything below becomes 0.
# NOTE(review): 0.085 is taken from the "8.5%" rank inspected in the previous
# cell — confirm the rate, or replace the quantile with a hand-picked value.
val_positive_rate = 0.085

cb_trans_val_pred, xgb_trans_val_pred, lgbm_trans_val_pred, et_trans_val_pred = (
    (np.asarray(scores) >= np.quantile(scores, 1 - val_positive_rate)).astype(int)
    for scores in (
        tune_cb_reg_val_pred,
        tune_xgb_reg_val_pred,
        tune_lgbm_reg_val_pred,
        tune_et_reg_val_pred,
    )
)

# LogisticRegression.predict returns class labels rather than scores, so a
# quantile cut is meaningless here; the labels are used as they are.
# NOTE(review): assumes the labels are 0/1 — confirm.
lr_trans_val_pred = np.asarray(tune_lr_reg_val_pred).astype(int)

In [None]:
# Validation F1-macro Score & classification report
# Every score is computed from the thresholded validation labels.
tune_cb_reg_val_score = f1_score(y_val, cb_trans_val_pred, average = 'macro')
tune_xgb_reg_val_score = f1_score(y_val, xgb_trans_val_pred, average = 'macro')
# was `tune_lgbm_reg_pred`, which is the raw TEST prediction defined in a later
# cell, not the thresholded validation labels
tune_lgbm_reg_val_score = f1_score(y_val, lgbm_trans_val_pred, average = 'macro')
tune_et_reg_val_score = f1_score(y_val, et_trans_val_pred, average = 'macro')
tune_lr_reg_val_score = f1_score(y_val, lr_trans_val_pred, average = 'macro')

print(f'Catboost F1 score : {tune_cb_reg_val_score}\n', classification_report(y_val, cb_trans_val_pred))
print(f'XGB F1 score : {tune_xgb_reg_val_score}\n', classification_report(y_val, xgb_trans_val_pred))
print(f'LGBM F1 score : {tune_lgbm_reg_val_score}\n', classification_report(y_val, lgbm_trans_val_pred))
print(f'ExtraTrees F1 score : {tune_et_reg_val_score}\n', classification_report(y_val, et_trans_val_pred))
print(f'Logistic F1 score : {tune_lr_reg_val_score}\n', classification_report(y_val, lr_trans_val_pred))

### Generate Test Predict

In [None]:
# Predict target by test_x
# Raw Task2 model outputs for the test rows.
(
    tune_cb_reg_pred,
    tune_xgb_reg_pred,
    tune_lgbm_reg_pred,
    tune_et_reg_pred,
    tune_lr_reg_pred,
) = (
    tuned_reg.predict(test_x)
    for tuned_reg in (tune_cb_reg, tune_xgb_reg, tune_lgbm_reg, tune_et_reg, tune_lr_reg)
)

In [None]:
# Visualize target predict
cb_pred = pd.DataFrame({'predict':tune_cb_reg_pred})
xgb_pred = pd.DataFrame({'predict':tune_xgb_reg_pred})
lgbm_pred = pd.DataFrame({'predict':tune_lgbm_reg_pred})
et_pred = pd.DataFrame({'predict':tune_et_reg_pred})
lr_pred = pd.DataFrame({'predict':tune_lr_reg_pred})

# Draw all five densities in ONE 3x2 figure. Calling plt.show() after every
# subplot (as before) renders the figure early, so the later panels end up in
# separate default-sized figures instead of the intended grid.
test_panels = [
    (cb_pred, 'Catboost Predict KDE (predict)'),
    (xgb_pred, 'XGB Predict KDE (predict)'),
    (lgbm_pred, 'LGBM Predict KDE (predict)'),
    (et_pred, 'ExtraTrees Predict KDE (predict)'),
    (lr_pred, 'Logistic Predict KDE (predict)'),
]

fig, axes = plt.subplots(3, 2, figsize=(20,10))
panel_axes = axes.ravel()

for panel_ax, (pred_frame, panel_title) in zip(panel_axes, test_panels):
    sns.kdeplot(pred_frame['predict'], ax=panel_ax)
    panel_ax.set_title(panel_title)

# Five panels in a six-slot grid: hide the unused axes.
for unused_ax in panel_axes[len(test_panels):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Print the rows around the 8.5% rank of the test predictions, per model
# Each model's predictions are sorted from highest to lowest and positions
# 510..520 are printed so a cut-off value can be read off.
# NOTE(review): 510-520 is the 8.5% mark only for a vector of roughly 6,000
# rows — confirm against the test size.
rank_window = slice(510, 521)
for test_frame in (cb_pred, xgb_pred, lgbm_pred, et_pred, lr_pred):
    print(test_frame['predict'].sort_values(ascending = False).iloc[rank_window])

In [None]:
# Transform predict by Threshold
# The cut-off was a literal `*` placeholder (a syntax error). It is now derived
# from the data: rows at or above the (1 - rate) quantile of a model's
# predictions become 1, everything below becomes 0.
# NOTE(review): 0.085 is taken from the "8.5%" rank inspected in the previous
# cell — confirm the rate, or replace the quantile with a hand-picked value.
test_positive_rate = 0.085

cb_trans_reg_pred, xgb_trans_reg_pred, lgbm_trans_reg_pred, et_trans_reg_pred = (
    (np.asarray(scores) >= np.quantile(scores, 1 - test_positive_rate)).astype(int)
    for scores in (
        tune_cb_reg_pred,
        tune_xgb_reg_pred,
        tune_lgbm_reg_pred,
        tune_et_reg_pred,
    )
)

# LogisticRegression.predict returns class labels rather than scores, so a
# quantile cut is meaningless here; the labels are used as they are.
# NOTE(review): assumes the labels are 0/1 — confirm.
lr_trans_reg_pred = np.asarray(tune_lr_reg_pred).astype(int)

## Saving Task2 Model

In [None]:
# Saving Model
# Persist the tuned Task2 models.
# NOTE(review): `xxx` in the file names looks like a placeholder for the Task1
# classifier that was used — fill in before running.
joblib.dump(tune_cb_reg, 'model_save/tuning/task1_xxx_clf_task2_cb_reg.pkl')
joblib.dump(tune_xgb_reg, 'model_save/tuning/task1_xxx_clf_task2_xgb_reg.pkl')
joblib.dump(tune_lgbm_reg, 'model_save/tuning/task1_xxx_clf_task2_lgbm_reg.pkl')
# was `tune_rt_reg` / `..._rt_reg.pkl`: no such variable exists, the
# ExtraTrees model is `tune_et_reg`; the file name is aligned with it.
joblib.dump(tune_et_reg, 'model_save/tuning/task1_xxx_clf_task2_et_reg.pkl')
joblib.dump(tune_lr_reg, 'model_save/tuning/task1_xxx_clf_task2_lr_reg.pkl')

## to_csv Submission

In [None]:
# Write one submission file per Task2 model: an ID column plus the thresholded
# 0/1 label.
# NOTE(review): `test_id` is not defined anywhere in the visible notebook — it
# must be loaded (e.g. from the raw test file) before this cell can run.
for task2_labels, submission_path in (
    (cb_trans_reg_pred, 'predict/modeling/xx_clf_cb_reg_.csv'),
    (xgb_trans_reg_pred, 'predict/modeling/xx_clf_xgb_reg_.csv'),
    (lgbm_trans_reg_pred, 'predict/modeling/xx_clf_lgbm_reg_.csv'),
    (et_trans_reg_pred, 'predict/modeling/xx_clf_et_reg_.csv'),
    (lr_trans_reg_pred, 'predict/modeling/xx_clf_lr_reg_.csv'),
):
    pd.DataFrame({'ID': test_id, 'Y_LABEL': task2_labels}).to_csv(submission_path, index=False)