# Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import *
import itertools

import os

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from keras.utils.np_utils import to_categorical 
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *

from tqdm import tqdm, notebook
import time

# Read Data

In [2]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

In [3]:
val = train.iloc[38703:,:]
train = train.iloc[:38703,:]

In [4]:
def drop_outlier(data):
    outlier_id = np.where(data['familysize']>=100)[0]
    pdata = data.drop(outlier_id)
    
    return pdata

def drop_feature(data):
    feature_arr = ['index'] # urban, Q_A, Q_E, wr_, wf_
    for i in range(20):
        feature_arr.append('Q'+chr(i+97)+'A')
    for i in range(1,14):
        feature_arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        feature_arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        feature_arr.append(f'tp{i:02d}')

    pdata = data.drop(feature_arr,axis=1)
    
    return pdata

def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,2,2,2],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[3,3,1,0,2,2,3,3,2,3,3,3],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 4,'familysize'] = 4
    
    return pdata

def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata

def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].mean(axis=1)
    
    return pdata

def C_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(1,14):
        Answers.append(f'wr_{i:02d}')
    for i in range(1,4):
        pdata[f'wf_{i:02d}'] = -pdata[f'wf_{i:02d}']
        Answers.append(f'wf_{i:02d}')
    
    pdata['C_score'] = pdata[Answers].mean(axis=1)
    
    return pdata

def TIPI_score(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata


In [5]:
def preprocess(data):
    
    pdata = data.copy()
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = cat_num(pdata)
    pdata = E_band(pdata,10)
    pdata = Mach_score(pdata)
    pdata = C_score(pdata)
    pdata = TIPI_score(pdata)
    
    pdata = drop_feature(pdata)
    pdata = pdata.astype(np.float32)
    
    return pdata

In [6]:
def train_auc(model_arr, data, label):
    score = np.zeros((data.shape[0],2))
    num_model = len(model_arr)
    for i in range(num_model):
        score += model_arr[i].predict_proba(data)
    pred = np.divide(score,num_model)[:,1]
    
    return roc_auc_score(label, pred)

# XGBoost

# CV

In [15]:
def grid_search(max_depth_arr, n_estimators_arr, learning_rate_arr):
    opt_auc = 0
    opt_para = []
   
    train_x = drop_outlier(train)
    train_x = preprocess(train_x)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val)
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    for max_depth in max_depth_arr:
        for n_estimators in n_estimators_arr:
            for learning_rate in learning_rate_arr:
                
                print(f'{max_depth}_{n_estimators}_{learning_rate}', end=' ')
                model = XGBClassifier(max_depth=max_depth,learning_rate=learning_rate, booster='gbtree',n_estimators=n_estimators, objective='binary:logistic')
                model.fit(train_x, train_y, verbose=False)
                auc = train_auc([model], val_x, val_y) 
                print('\033[34m' + f'{auc:.4f}' + '\033[0m')
                
                if (auc>opt_auc):
                    opt_auc = auc
                    opt_para = [max_depth, n_estimators, learning_rate]
    print('\033[41m' + f'{opt_para} : {opt_auc:.4f}' + '\033[0m')
    
    return opt_auc, opt_para

In [16]:
auc, para = grid_search(max_depth_arr=[4,5,6],n_estimators_arr=[400,500,600],learning_rate_arr=[0.015,0.02,0.025])

4_400_0.015 [34m0.7652[0m
4_400_0.02 [34m0.7653[0m
4_400_0.025 [34m0.7656[0m
4_500_0.015 [34m0.7651[0m
4_500_0.02 [34m0.7654[0m
4_500_0.025 [34m0.7652[0m
4_600_0.015 [34m0.7652[0m
4_600_0.02 [34m0.7655[0m
4_600_0.025 [34m0.7652[0m
5_400_0.015 [34m0.7657[0m
5_400_0.02 [34m0.7654[0m
5_400_0.025 [34m0.7649[0m
5_500_0.015 [34m0.7654[0m
5_500_0.02 [34m0.7657[0m
5_500_0.025 [34m0.7647[0m
5_600_0.015 [34m0.7652[0m
5_600_0.02 [34m0.7652[0m
5_600_0.025 [34m0.7642[0m
6_400_0.015 [34m0.7650[0m
6_400_0.02 [34m0.7644[0m
6_400_0.025 [34m0.7644[0m
6_500_0.015 [34m0.7648[0m
6_500_0.02 [34m0.7640[0m
6_500_0.025 [34m0.7636[0m
6_600_0.015 [34m0.7641[0m
6_600_0.02 [34m0.7637[0m
6_600_0.025 [34m0.7621[0m
[41m0.7657 : [5, 500, 0.02][0m
