# Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, KFold, StratifiedKFold
from sklearn.metrics import *
import itertools

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from tqdm import tqdm, notebook
import time

from bayes_opt import BayesianOptimization

# Read Data

In [2]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

# Data

In [3]:
# fill NA
def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata
# feature engineering
def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].sum(axis=1)
    
    return pdata

def w_score(data):
    pdata = data.copy()
    wr = []
    wf = []
    for i in range(1,14):
        wr.append(f'wr_{i:02d}')
    for i in range(1,4):
        wf.append(f'wf_{i:02d}')
    
    pdata['wr'] = pdata[wr].sum(axis=1)
    pdata['wf'] = pdata[wf].sum(axis=1)
    
    return pdata

def TIPI(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata

# drop outlier
def drop_outlier(data, datatype):
    
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'
    
    pdata = data.copy()
    if datatype=='train':
        
        out_arr = []
        out_arr.append( data[data.familysize>=16].index )
        out_arr.append( data[data.wr<=3].index )
        out_arr.append( data[data.wf>=2].index )

        out = []
        for outarr in out_arr:
            out = np.union1d(out, outarr)

        pdata = data.drop(out)
    
    return pdata
# feature banding
def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 6,'familysize'] = 6
    
    return pdata
# categorical value to numerical value
def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,4,5,6],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[11,10,0,1,2,3,4,5,6,7,8,9],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata
# drop feature
def drop_feature(data, feature_arr):
    arr = feature_arr + ['index'] 
    
    """for i in range(20):
        arr.append('Q'+chr(i+97)+'A')
    for i in range(20):
        arr.append('Q'+chr(i+97)+'E')
    for i in range(1,14):
        arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        arr.append(f'tp{i:02d}')"""
    

    pdata = data.drop(arr,axis=1)
    
    return pdata


In [4]:
def preprocess(data, datatype, feature_arr):
    
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata, feature_arr)
    # unify type of data
    pdata = pdata.astype(np.int)
    
    return pdata

# LGBM BO

In [32]:
def lgbm_cv(max_depth, num_leaves, n_estimators, learning_rate, weight, col_sample, feature_frac, bagging_frac, l1, l2):
    param = { 'max_depth' : int(round(max_depth)), 
            'num_leaves' : int(round(num_leaves)),
            'n_estimators' : int(round(n_estimators)),
            'learning_rate' : learning_rate,
            'min_child_weight' : int(round(weight)),
            'colsample_bytree' : col_sample,
            'feature_fraction' : max(min(feature_frac,1),0),
            'bagging_fraction' : max(min(bagging_frac,1),0),
            'lambda_l1' : max(l1,0),
            'lambda_l2' : max(l2,0),
            'objective' : 'binary',
            'boosting_type' : 'dart'
            }
    
    model = LGBMClassifier(**param)
    scoring = {'roc_auc_score' : make_scorer(roc_auc_score)}
    result = cross_validate(model, train_x, train_y, cv=6, scoring=scoring)
    auc_score = result['test_roc_auc_score'].mean()
    
    return auc_score

In [33]:
pbounds = {'max_depth' : (2,50), 
            'num_leaves' : (100,1000),
            'n_estimators' : (100,1000),
            'learning_rate' : (0.005,0.02),
            'weight' : (10,200),
            'col_sample' : (0,0.99),
            'feature_frac' : (0.0001,0.99),
            'bagging_frac' : (0.0001,0.99),
            'l1' : (0,0.99),
            'l2' : (0,0.99)}

In [34]:
lgbmB = BayesianOptimization(f=lgbm_cv, pbounds=pbounds, verbose=2, random_state=0)

In [35]:
lgbmB.maximize(init_points=10, n_iter=25, acq='ei', xi=0.01)

|   iter    |  target   | baggin... | col_sa... | featur... |    l1     |    l2     | learni... | max_depth | n_esti... | num_le... |  weight   |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6973  [0m | [0m 0.5434  [0m | [0m 0.708   [0m | [0m 0.5968  [0m | [0m 0.5394  [0m | [0m 0.4194  [0m | [0m 0.01469 [0m | [0m 23.0    [0m | [0m 902.6   [0m | [0m 967.3   [0m | [0m 82.85   [0m |
| [0m 2       [0m | [0m 0.6903  [0m | [0m 0.7838  [0m | [0m 0.5236  [0m | [0m 0.5624  [0m | [0m 0.9163  [0m | [0m 0.07033 [0m | [0m 0.006307[0m | [0m 2.97    [0m | [0m 849.4   [0m | [0m 800.3   [0m | [0m 175.3   [0m |


KeyboardInterrupt: 

In [None]:
lgbmB.max

# XGB BO

In [5]:
class my_XGBClassifier(XGBClassifier):
    def predict(self, X):
        return XGBClassifier.predict_proba(self, X)[:,1]  

In [36]:
def xgb_cv(max_depth, 
           n_estimators, learning_rate, weight, 
           #col_sample, subsample, 
           reg_l, reg_a):
    param = { 'max_depth' : 8, #int(round(max_depth)), 
            'n_estimators' : int(round(n_estimators)),
            'learning_rate' : learning_rate,
            'min_child_weight' : int(round(weight)),
            'colsample_bytree' : 0.8, #col_sample,
            'subsample' : 0.8, #subsample,
            'reg_lambda' : max(reg_l,0),
            'reg_alpha' : max(reg_a,0),
            'verbosity' : 0,
            'objective' : 'binary:logistic',
            'booster' : 'gbtree', 
            #'tree_method' : 'gpu_hist'
            }
    train_x = preprocess(train,'train',BAD)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    
    model = my_XGBClassifier(**param)
    scoring = {'roc_auc_score' : make_scorer(roc_auc_score)}
    NFOLD = 5
    kfold = StratifiedKFold(n_splits = NFOLD, shuffle=True, random_state=42)
    result = cross_validate(model, train_x, train_y, cv=kfold, scoring=scoring)
    auc_score = result['test_roc_auc_score'].mean()
    
    return auc_score

In [37]:
BAD = []
pbounds = {'max_depth' : (7.5,10.5), 
            'n_estimators' : (150,550),
            'learning_rate' : (0.005,0.025),
            'weight' : (0,20),
            #'col_sample' : (0.5,0.99),
            #'subsample' : (0.5,0.99),
            'reg_l' : (0,0.99),
            'reg_a' : (0,0.99)}

In [38]:
xgbB = BayesianOptimization(f=xgb_cv, pbounds=pbounds, verbose=2, random_state=42)

In [31]:
xgbB.maximize(init_points=3, n_iter=5, acq='ei', xi=0.01)

|   iter    |  target   | learni... | max_depth | n_esti... |   reg_a   |   reg_l   |  weight   |
-------------------------------------------------------------------------------------------------


TypeError: '>' not supported between instances of 'dict' and 'dict'

In [None]:
xgbB.max