# Import

In [4]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

# Read Data

In [5]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

# Data

In [3]:
# fill NA
def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata
# feature engineering
def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].sum(axis=1)
    
    return pdata

def w_score(data):
    pdata = data.copy()
    wr = []
    wf = []
    for i in range(1,14):
        wr.append(f'wr_{i:02d}')
    for i in range(1,4):
        wf.append(f'wf_{i:02d}')
    
    pdata['wr'] = pdata[wr].sum(axis=1)
    pdata['wf'] = pdata[wf].sum(axis=1)
    
    return pdata

def TIPI(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata

# drop outlier
def drop_outlier(data, datatype):
    
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'
    
    pdata = data.copy()
    if datatype=='train':
        
        out_arr = []
        out_arr.append( data[data.familysize>=16].index )
        out_arr.append( data[data.wr<=3].index )
        out_arr.append( data[data.wf>=2].index )

        out = []
        for outarr in out_arr:
            out = np.union1d(out, outarr)

        pdata = data.drop(out)
    
    return pdata
# feature banding
def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 6,'familysize'] = 6
    
    return pdata
# categorical value to numerical value
def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,4,5,6],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[11,10,0,1,2,3,4,5,6,7,8,9],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata
# drop feature
def drop_feature(data, feature_arr):
    arr = feature_arr + ['index'] 
    
    """for i in range(20):
        arr.append('Q'+chr(i+97)+'A')
    for i in range(20):
        arr.append('Q'+chr(i+97)+'E')
    for i in range(1,14):
        arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        arr.append(f'tp{i:02d}')"""
    

    pdata = data.drop(arr,axis=1)
    
    return pdata


In [4]:
def preprocess(data, datatype, feature_arr):
    
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata, feature_arr)
    # unify type of data
    pdata = pdata.astype(np.int)
    
    return pdata

In [5]:
def train_auc(model_arr, data, label):
    score = np.zeros((data.shape[0],2))
    num_model = len(model_arr)
    for i in range(num_model):
        score += model_arr[i].predict_proba(data)
    pred = np.divide(score,num_model)[:,1]
    
    return roc_auc_score(label, pred)

# XGBoost

In [15]:
train_data, val_data = train_test_split(train,test_size=0.14998, shuffle=False)

In [16]:
def grid_search(max_depth_arr, n_estimators_arr, learning_rate_arr, min_child_arr,col_sample_arr):

    opt_auc = 0
    opt_para = {}
    
    for max_depth in max_depth_arr:
        for n_estimators in n_estimators_arr:
            for learning_rate in learning_rate_arr:
                for min_child in min_child_arr:
                    for col_sample in col_sample_arr:
                        print(f'{max_depth}_{n_estimators}_{learning_rate:.3f}_{min_child:02d}_{col_sample:.2f}', end=' ')
                        param = { 'max_depth' : max_depth,
                                'n_estimators' : n_estimators,
                                'learning_rate' : learning_rate,
                                 'min_child_weight' : min_child,
                                 'colsample_bytree' : col_sample,
                                'verbosity' : 0,
                                'objective' : 'binary:logistic',
                                'booster' : 'gbtree',
                                'subsample' : 0.8}

                        model = XGBClassifier(**param)
                        model.fit(train_x, train_y, verbose=False)
                        auc = train_auc([model], val_x, val_y) 
                        print('\033[34m' + f'{auc:.6f}' + '\033[0m', end=' ')

                        if (auc>opt_auc):
                            opt_auc = auc
                            opt_para = param
                            print('\033[31m' + f'{max_depth}_{n_estimators}_{learning_rate:.3f}_{min_child:02d}_{col_sample:.2f}' + '\033[0m')
                        else:
                            print()
    
    print('-'*30)
    print(f'{opt_para} = ' + '\033[34m' + f'{opt_auc:.6f}' + '\033[0m')
    
    return opt_auc, opt_para

In [10]:
auc, para = grid_search(max_depth_arr=[8,9],n_estimators_arr=[200,250,300],learning_rate_arr=[0.010], min_child_arr=range(1,11), col_sample_arr=[0.75,0.8,0.85])

8_200_0.010_01_0.75 [34m0.768783[0m [31m8_200_0.010_01_0.75[0m
8_200_0.010_01_0.80 [34m0.768233[0m 
8_200_0.010_01_0.85 [34m0.768189[0m 
8_200_0.010_02_0.75 [34m0.768878[0m [31m8_200_0.010_02_0.75[0m
8_200_0.010_02_0.80 [34m0.768168[0m 
8_200_0.010_02_0.85 [34m0.768210[0m 
8_200_0.010_03_0.75 [34m0.768450[0m 
8_200_0.010_03_0.80 [34m0.767992[0m 
8_200_0.010_03_0.85 [34m0.768350[0m 
8_200_0.010_04_0.75 [34m0.768546[0m 
8_200_0.010_04_0.80 [34m0.767918[0m 
8_200_0.010_04_0.85 [34m0.767939[0m 
8_200_0.010_05_0.75 [34m0.768700[0m 
8_200_0.010_05_0.80 [34m0.768138[0m 
8_200_0.010_05_0.85 [34m0.768314[0m 
8_200_0.010_06_0.75 [34m0.768796[0m 
8_200_0.010_06_0.80 [34m0.768319[0m 
8_200_0.010_06_0.85 [34m0.768236[0m 
8_200_0.010_07_0.75 [34m0.768590[0m 
8_200_0.010_07_0.80 [34m0.767794[0m 
8_200_0.010_07_0.85 [34m0.768094[0m 
8_200_0.010_08_0.75 [34m0.768623[0m 
8_200_0.010_08_0.80 [34m0.767572[0m 
8_200_0.010_08_0.85 [34m0.767744[0m 
8_200_0.

In [29]:
def grid_search_lgbm(max_depth_arr, leaves_arr, n_estimators_arr, learning_rate_arr, weight_arr, col_sample_arr):
    opt_auc = 0
    opt_para = {}
   
    train_x = preprocess(train,'train')
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val,'test')
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    for max_depth in max_depth_arr:
        for num_leaves in leaves_arr:
            for n_estimators in n_estimators_arr:
                for learning_rate in learning_rate_arr:
                    #for samples in samples_arr:
                    for weight in weight_arr:
                        for col_sample in col_sample_arr:
                            print(f'{num_leaves}_{n_estimators}_{learning_rate:.3f}_{weight:03d}_{col_sample:.2f}', end=' ')
                            param = { 'num_leaves' : num_leaves,
                                    'max_depth' : max_depth,
                                    'n_estimators' : n_estimators,
                                    'learning_rate' : learning_rate,
                                    #'min_child_samples' : samples,
                                    'min_child_weight' : weight,
                                    'colsample_bytree' : col_sample,
                                    'verbosity' : -1,
                                    'objective' : 'binary',
                                    'boosting_type' : 'dart',
                                    'subsample' : 0.8,
                                    'max_depth' : -1,
                                    'force_row_wise' : True}

                            model = LGBMClassifier(**param)
                            model.fit(train_x, train_y, verbose=False)
                            auc = train_auc([model], val_x, val_y) 
                            print('\033[34m' + f'{auc:.6f}' + '\033[0m', end=' ')

                            if (auc>opt_auc):
                                opt_auc = auc
                                opt_para = param
                                print('\033[31m' + f'{num_leaves}_{n_estimators}_{learning_rate:.3f}_{weight:03d}_{col_sample:.2f}' + '\033[0m')
                            else:
                                print()
    
    print('-'*30)
    print(f'{opt_para} = ' + '\033[34m' + f'{opt_auc:.6f}' + '\033[0m')
    
    return opt_auc, opt_para

In [30]:
auc, para = grid_search_lgbm(max_depth_arr = [7], leaves_arr=[300], n_estimators_arr=[300], learning_rate_arr=[0.01], weight_arr=[50], col_sample_arr=[0.8])

300_300_0.010_050_0.80 [34m0.767353[0m [31m300_300_0.010_050_0.80[0m
------------------------------
{'num_leaves': 300, 'max_depth': -1, 'n_estimators': 300, 'learning_rate': 0.01, 'min_child_weight': 50, 'colsample_bytree': 0.8, 'verbosity': -1, 'objective': 'binary', 'boosting_type': 'dart', 'subsample': 0.8, 'force_row_wise': True} = [34m0.767353[0m
