### Import Module

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('font',family='Malgun Gothic')

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.preprocessing import PowerTransformer

from sklearn.feature_selection import SelectPercentile

import klib

import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean

from sklearn.base import ClassifierMixin
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('/Users/jody/Downloads/code/11_15/data.csv' , encoding='cp949')
cust_tr = data.CLNT_ID

data_te = pd.read_csv('/Users/jody/Downloads/code/11_15/data_te.csv' , encoding='cp949')
cust_te = data_te.CLNT_ID 

In [3]:
X_train = pd.read_csv('/Users/jody/Downloads/code/11_15/train.csv' , encoding='cp949').drop(columns='Unnamed: 0', axis=1)
X_test = pd.read_csv('/Users/jody/Downloads/code/11_15/test.csv' , encoding='cp949').drop(columns='Unnamed: 0', axis=1)

y_target =  pd.read_csv('y_train.csv').LABEL

In [4]:
klib.missingval_plot(X_train)

No missing values found in the dataset.


In [5]:
klib.missingval_plot(X_test)

No missing values found in the dataset.


### Data Split

In [6]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_target, test_size=0.3, stratify = y_target, random_state=0)

In [7]:
X_train.shape, X_dev.shape, y_train.shape, y_dev.shape

((105000, 581), (45000, 581), (105000,), (45000,))

In [24]:
seed = 0
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
models = []

## Modeling

### RandomForest

In [25]:
pbounds = { 'n_estimators': (100,500),
            'max_depth': (10,31), 
            'max_features': (0.8,0.99),
            'min_samples_leaf': (1, 10)}

def rf_opt(n_estimators, max_depth, max_features, min_samples_leaf):
    
    params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'min_samples_leaf' : int(round(min_samples_leaf))
    }

    rf = RandomForestClassifier(**params, n_jobs=-1, random_state=seed)
    score = cross_val_score(rf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    
    return np.mean(score)


BO_rf = BayesianOptimization(f = rf_opt, pbounds = pbounds, random_state=seed)

In [26]:
BO_rf.maximize(init_points=5, n_iter=10)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.292   [0m | [0m 21.53   [0m | [0m 0.9359  [0m | [0m 6.425   [0m | [0m 318.0   [0m |
| [95m 2       [0m | [95m-1.29    [0m | [95m 18.9    [0m | [95m 0.9227  [0m | [95m 4.938   [0m | [95m 456.7   [0m |
| [0m 3       [0m | [0m-1.295   [0m | [0m 30.24   [0m | [0m 0.8729  [0m | [0m 8.126   [0m | [0m 311.6   [0m |
| [0m 4       [0m | [0m-1.306   [0m | [0m 21.93   [0m | [0m 0.9759  [0m | [0m 1.639   [0m | [0m 134.9   [0m |
| [0m 5       [0m | [0m-1.301   [0m | [0m 10.42   [0m | [0m 0.9582  [0m | [0m 8.003   [0m | [0m 448.0   [0m |
| [0m 6       [0m | [0m-1.293   [0m | [0m 19.58   [0m | [0m 0.8425  [0m | [0m 2.639   [0m | [0m 458.4   [0m |
| [95m 7       [0m | [95m-1.289   [0m | [95m 22.57   [0m | [95m 0.9072  [0m | [95m 9.188   [0m | [95m 4

In [27]:
BO_rf.max

{'target': -1.2869782559214826,
 'params': {'max_depth': 17.49706140199854,
  'max_features': 0.8344093325309263,
  'min_samples_leaf': 9.95841332528359,
  'n_estimators': 460.37562757702636}}

In [28]:
max_params = BO_rf.max['params']

max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['min_samples_leaf'] = int(round(max_params['min_samples_leaf']))

max_params

{'max_depth': 17,
 'max_features': 0.8344093325309263,
 'min_samples_leaf': 10,
 'n_estimators': 460}

In [29]:
rf_clf = RandomForestClassifier(**max_params,  n_jobs=-1, random_state=seed)

scores = cross_val_score(rf_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(rf_clf)

[-1.28713265 -1.28221271 -1.28933151 -1.28417528 -1.28511298]
최대성능: -1.2822127099628977
평균성능: -1.2855930254235948


### ExtraTrees

In [30]:
pbounds = { 'n_estimators': (50, 500),
            'max_depth': (10,35),    
            'max_features': (0.8,0.95),
            'min_samples_split': (2, 5),
            'min_samples_leaf': (5, 10)
            }

def extra_opt(n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf):

    params = {
        'n_estimators': int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'max_features' : max_features,
        'min_samples_leaf': int(round(min_samples_leaf)),
        'min_samples_split': int(round(min_samples_split)),
        'n_jobs' : -1
    }
    
    extra = ExtraTreesClassifier(bootstrap = True, oob_score=True, **params)
    score = cross_val_score(extra, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    
    return np.mean(score)

BO_extra = BayesianOptimization(f = extra_opt, pbounds = pbounds, random_state=seed) 

In [None]:
BO_extra.maximize(init_points=5, n_iter=10)

In [None]:
BO_extra.max

In [None]:
max_params = BO_extra.max['params']

max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['min_samples_leaf'] = int(round(max_params['min_samples_leaf']))
max_params['min_samples_split'] = int(round(max_params['min_samples_split']))

max_params

In [None]:
extra_clf = ExtraTreesClassifier(bootstrap = True, oob_score=True, **max_params)

scores = cross_val_score(extra_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(extra_clf)

### Lgbm

In [None]:
pbounds = { 'learning_rate': (0.01, 0.1),
            'n_estimators': (100, 1000),
            'max_depth': (11,35),   
            'subsample': (0.7,0.9), 
            'colsample_bytree': (0.7,0.9),   
            'num_leaves': (8,128,256,512),
            'min_child_weight': (1, 7)
          }


def lgbm_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves, min_child_weight):

    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree' : colsample_bytree,
        'num_leaves' : int(round(num_leaves)),
        'min_child_weight' : min_child_weight,
        'n_jobs' : -1
    }
    
    lgbm = LGBMClassifier(**params, random_state=seed)
    
    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=seed)

    score = cross_val_score(lgbm, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    
    return np.mean(score)

BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = pbounds, random_state=seed) 

In [None]:
BO_lgbm.maximize(init_points=5, n_iter=15)

In [None]:
BO_lgbm.max

In [None]:
max_params = BO_lgbm.max['params']

max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['num_leaves'] = int(round(max_params['num_leaves']))

max_params

In [None]:
lgbm_clf = LGBMClassifier(**max_params, random_state=seed)

scores = cross_val_score(lgbm_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(lgbm_clf)

### Logistic Regression

In [None]:
pbounds = { 'C': (0.01,1)}

def lr_opt(C):    
    params = {'C' : C}
    lr= LogisticRegression(**params, n_jobs=-1, random_state=seed)      
    score = cross_val_score(lr, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)    
    return np.mean(score)

BO_lr = BayesianOptimization(f = lr_opt, pbounds = pbounds, random_state=seed)

In [None]:
BO_lr.maximize(init_points=5, n_iter=10)

In [None]:
BO_lr.max

In [None]:
max_params = BO_lr.max['params']

max_params

In [None]:
lr_clf = LogisticRegression(**max_params,  n_jobs=-1, random_state=seed)
scores = cross_val_score(lr_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(lr_clf)

### Multi Layer Perceptron

In [None]:
pbounds = { 'batch_size' :(50,200),
          'learning_rate_init':(0.0001,0.001)}

def mlp_opt(batch_size,learning_rate_init):

    params = {
        'batch_size' : int(round(batch_size)),
        'learning_rate' : ('adaptive'),
         'learning_rate_init':learning_rate_init,
         'activation': ('logistic'),
         'max_iter' : (10000),
         'power_t' :(0.5),
         'solver': ('adam'),
        'alpha': (0.0001),
        'hidden_layer_sizes':(2)

    }
    
    mlp = MLPClassifier(**params,random_state=seed)
    score = cross_val_score(mlp, X_train, y_train, scoring='neg_log_loss', cv=skf,n_jobs=-1)
    
    return np.mean(score)

BO_mlp= BayesianOptimization(f = mlp_opt, pbounds = pbounds, random_state=seed)  

In [None]:
BO_mlp.maximize(init_points=5, n_iter=15)

In [None]:
BO_mlp.max

In [None]:
max_params = BO_mlp.max['params']

max_params['batch_size'] = int(round(max_params['batch_size']))

In [None]:
mlp_clf = MLPClassifier(**max_params,learning_rate='adaptive',
         activation= 'logistic',
         max_iter = 10000,
         power_t =0.5,
         solver='adam',
        alpha=0.0001,
        hidden_layer_sizes=2)

scores = cross_val_score(mlp_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(mlp_clf)

### XGB

In [22]:
pbounds = { 'learning_rate': (0.05, 1.5),
            'n_estimators': (100,300),
            'max_depth': (5,15),   
            'subsample': (0.8,0.95),  
            'colsample': (0.75,0.95),   
            'gamma': (0, 5)}

def xgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample, gamma):
    
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample': colsample,   
        'gamma': gamma,
        'n_jobs' : -1
    }
    
    xgb = XGBClassifier(**params) 
    skf = StratifiedKFold(n_splits=5 , shuffle=True, random_state=seed)
    score = cross_val_score(xgb, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    return np.mean(score)
BO_xgb = BayesianOptimization(f = xgb_opt, pbounds = pbounds, random_state=seed)

In [23]:
BO_xgb.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsample |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.569   [0m | [0m 0.8598  [0m | [0m 3.576   [0m | [0m 0.924   [0m | [0m 10.45   [0m | [0m 184.7   [0m | [0m 0.8969  [0m |
| [0m 2       [0m | [0m-1.897   [0m | [0m 0.8375  [0m | [0m 4.459   [0m | [0m 1.447   [0m | [0m 8.834   [0m | [0m 258.3   [0m | [0m 0.8793  [0m |
| [95m 3       [0m | [95m-1.259   [0m | [95m 0.8636  [0m | [95m 4.628   [0m | [95m 0.153   [0m | [95m 5.871   [0m | [95m 104.0   [0m | [95m 0.9249  [0m |
| [0m 4       [0m | [0m-1.977   [0m | [0m 0.9056  [0m | [0m 4.35    [0m | [0m 1.469   [0m | [0m 12.99   [0m | [0m 192.3   [0m | [0m 0.9171  [0m |


KeyboardInterrupt: 

In [16]:
BO_xgb.max

{}

In [None]:
max_params = BO_xgb.max['params']

max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))

max_params

In [None]:
xgb_clf = XGBClassifier(**max_params)

scores = cross_val_score(xgb_clf, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(xgb_clf)

### SVC

In [None]:
pbounds = { 'C':(0.1,10),
          'gamma':(0.1,10)}


def svm_opt(C,gamma):
    params = {'C' : C,
              'gamma' : gamma}
    
    svm = SVC(**params)
    skf = StratifiedKFold(n_splits=5 , shuffle=False, random_state=seed)
    score = cross_val_score(svm, X_train, y_train, scoring='neg_log_loss', cv=skf)
    return np.mean(score)

BO_svm = BayesianOptimization(f = svm_opt, pbounds = pbounds, random_state=seed)

In [None]:
BO_svm.maximize(init_points=5, n_iter=10)

In [None]:
BO_svm.max
max_params = BO_svm.max['params']

max_params

In [None]:
svm_clf = SVC(**max_params)
scores = cross_val_score(svm_clf, X_train, y_train, scoring='neg_log_loss', cv=skf)
print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

models.append(svm_clf)

### Voting

In [None]:
estimators = [(model.__class__.__name__, model) for model in models]
estimators

### Soft voting

In [39]:
voting_clf = VotingClassifier(estimators = estimators, voting='soft', n_jobs=-1)
voting_clf.fit(X_train, y_train)
pred = voting_clf.predict_proba(X_dev)
test_score = log_loss(y_dev, pred)

print(test_score)

1.2571629708649552


In [42]:
pred = pd.DataFrame(voting_clf.predict_proba(X_test))

result = pd.concat([cust_te , pred], axis =1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']
result

result.to_csv('voting_1115.csv',index=False)

### Layer Stacking

In [40]:
stk_layer2 = StackingClassifier(estimators = [('lr', lr_clf), ('lgb', lgbm_clf)],
                            final_estimator = lr_clf, cv=skf, n_jobs=-1)

stk_clf = StackingClassifier(estimators = [('extra', extra_clf), ('rf', rf_clf)],
                            final_estimator = stk_layer2, cv=skf, n_jobs=-1)

stk_clf.fit(X_train, y_train)
pred = stk_clf.predict_proba(X_dev)

test_score = log_loss(y_dev, pred)
print(test_score)

1.2779105960425103


In [41]:
pred = pd.DataFrame(stk_clf.predict_proba(X_test))

result = pd.concat([cust_te , pred], axis =1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']
result

result.to_csv('1115_s.csv',index=False)

### END