# Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from tqdm import notebook
import time

# Read Data

In [2]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

# Data

In [3]:
# fill NA
def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata
# feature engineering
def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].sum(axis=1)
    
    return pdata

def w_score(data):
    pdata = data.copy()
    wr = []
    wf = []
    for i in range(1,14):
        wr.append(f'wr_{i:02d}')
    for i in range(1,4):
        wf.append(f'wf_{i:02d}')
    
    pdata['wr'] = pdata[wr].sum(axis=1)
    pdata['wf'] = pdata[wf].sum(axis=1)
    
    return pdata

def TIPI(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata

# drop outlier
def drop_outlier(data, datatype):
    
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'
    
    pdata = data.copy()
    if datatype=='train':
        
        out_arr = []
        out_arr.append( data[data.familysize>=16].index )
        out_arr.append( data[data.wr<=3].index )
        out_arr.append( data[data.wf>=2].index )

        out = []
        for outarr in out_arr:
            out = np.union1d(out, outarr)

        pdata = data.drop(out)
    
    return pdata
# feature banding
def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 6,'familysize'] = 6
    
    return pdata
# categorical value to numerical value
def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,4,5,6],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[11,10,0,1,2,3,4,5,6,7,8,9],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata
# drop feature
def drop_feature(data, feature_arr):
    arr = feature_arr + ['index'] 
    
    """for i in range(20):
        arr.append('Q'+chr(i+97)+'A')
    for i in range(20):
        arr.append('Q'+chr(i+97)+'E')
    for i in range(1,14):
        arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        arr.append(f'tp{i:02d}')"""
    

    pdata = data.drop(arr,axis=1)
    
    return pdata


In [4]:
def preprocess(data, datatype, feature_arr):
    
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata, feature_arr)
    # unify type of data
    pdata = pdata.astype(np.int)
    
    return pdata

# XGBoost

In [5]:
opt = { 'max_depth' : 8,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8}

In [6]:
opt_arr_1 = ['QbA','QcA','QjA','QhA','QmA','QtA','QlA','QkA','QoA',
 'QsA','QrA', 'QeA',  'QnA','QgA', 'QdA'] + \
['QaE','QbE','QcE','QeE','QgE','QmE','QfE','QiE','QoE',
 'QlE','QrE','QpE','QnE','QtE','QsE','QkE'] + \
['tp01','tp04','tp05','tp09','tp10','tp02', 'tp08'] +\
['wf_01', 'wf_03'] + \
['wr_06','wr_09', 'wr_11','wr_07', 'wr_12','wr_13'] +\
['tp_score_4','hand']

opt_arr_2 = ['QjA','QaE', 'QbA', 'QeE', 'QfE', 'QhA', 'QiE', 'tp09', 'tp_score_4',
      'QbE', 'QtA', 'tp01', 'tp_score_2',
       'QmA', 'QmE', 'tp04',
       'QgE', 'QkA', 'QoE', 'QsA',
       'QlE', 'QoE', 'QrE', 'wf_03',
       'QoA', 'QlA', 'QsE', 'tp10',
       'QpE', 'tp08', 'wf_01',
       'QkE', 'QrA', 'wr_05', 'wr_09', 'wr_10', 'wr_11',
       'QgA', 'QtE', 'hand', 'tp06', 'QeA', 'wr_06', 'wr_12',
       'wr_03', 'wr_07', 'QdA', 'QdE', 'QnE', 'wr_13'
      ]

# XGB CV

In [7]:
def cross_validation(param, arr, seed):
    
    if seed==False:
        train_data, val_data = train_test_split(train, test_size=0.14998, 
                                                shuffle=False)
    else:
        train_data, val_data = train_test_split(train, test_size=0.14998, 
                                                random_state=seed, shuffle=True)
        
    train_x = preprocess(train_data,'train',arr)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val_data,'test',arr)
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    start = time.time()
    model = XGBClassifier(**param)
    model.fit(train_x,train_y,verbose=False)
    print(f'Training time : {time.time()-start}s')
    
    pred = model.predict_proba(val_x)[:,1]
    val_auc = roc_auc_score(val_y, pred)
    
    print(f'Validation auc : {val_auc:.6f}')
    
    return val_auc

In [11]:
auc1 = cross_validation(opt_gpu, [], 42)

Training time : 37.05744123458862s
Validation auc : 0.771314


In [12]:
auc1 = cross_validation(opt, opt_arr_1, 42)

Training time : 2.6593611240386963s
Validation auc : 0.771421


In [27]:
def kfold_cv(param, arr, NFOLD):
    
    train_x = train.copy()
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    
    train_copy = train.copy()
    
    kfold = KFold(n_splits = NFOLD, shuffle=True, random_state=0)
    model_arr = []
    auc_arr = []
    
    for i, (train_index, val_index) in notebook.tqdm(enumerate(kfold.split(train_copy)), total=kfold.get_n_splits()):
        if i in [0,2,6]:
            train_x, val_x = train_copy.iloc[train_index,:], train_copy.iloc[val_index,:]

            train_x = preprocess(train_x, 'train', arr)
            train_y = train_x['voted']
            train_x = train_x.drop(['voted'],axis=1)

            val_x = preprocess(val_x, 'test', arr)
            val_y = val_x['voted']
            val_x = val_x.drop(['voted'],axis=1)

            start = time.time()
            model = XGBClassifier(**param)
            model.fit(train_x,train_y,verbose=False)
            print(f'Training time : {time.time()-start:.2f}s')
            model_arr.append(model)
    
            pred = model.predict_proba(val_x)[:,1]
            val_auc = roc_auc_score(val_y, pred)
            auc_arr.append(val_auc)
            print(f'{val_auc:.6f}')
        
    
    print(f'{sum(auc_arr)/len(auc_arr):.6f}')
    
    return model_arr

In [33]:
opt_gpu = { 'max_depth' : 9,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8,
        'tree_method' : 'gpu_hist'
        # ,'predictor' : 'gpu_predictor'
       }
past_gpu = { 'max_depth' : 8,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8,
        'tree_method' : 'gpu_hist'
        # ,'predictor' : 'gpu_predictor'
       }
test_gpu = { 'max_depth' : 10,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8,
        'tree_method' : 'gpu_hist'
        # ,'predictor' : 'gpu_predictor'
       }

In [30]:
model_list1 = kfold_cv(opt_gpu, [], 7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))

Training time : 57.56s
0.772166
Training time : 57.01s
0.766914
Training time : 57.73s
0.777601

0.772227


In [29]:
model_list2 = kfold_cv(past_gpu, [], 7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))

Training time : 37.05s
0.772120
Training time : 37.24s
0.765992
Training time : 38.12s
0.777169

0.771760


In [32]:
model_list3 = kfold_cv(test_gpu, [], 7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))

Training time : 80.75s
0.772206
Training time : 82.34s
0.766648
Training time : 81.10s
0.777216

0.772023


In [34]:
model_list3 = kfold_cv(test_gpu, [], 7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))

Training time : 82.89s
0.771729
Training time : 85.16s
0.766839
Training time : 86.03s
0.776597

0.771721
