# Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import *
import itertools

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from tqdm import tqdm, notebook
import time

# Read Data

In [2]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

# Data

In [3]:
# fill NA
def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata
# feature engineering
def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].sum(axis=1)
    
    return pdata

def w_score(data):
    pdata = data.copy()
    wr = []
    wf = []
    for i in range(1,14):
        wr.append(f'wr_{i:02d}')
    for i in range(1,4):
        wf.append(f'wf_{i:02d}')
    
    pdata['wr'] = pdata[wr].sum(axis=1)
    pdata['wf'] = pdata[wf].sum(axis=1)
    
    return pdata

def TIPI(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata

# drop outlier
def drop_outlier(data, datatype):
    
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'
    
    pdata = data.copy()
    if datatype=='train':
        
        out_arr = []
        out_arr.append( data[data.familysize>=16].index )
        out_arr.append( data[data.wr<=3].index )
        out_arr.append( data[data.wf>=2].index )

        out = []
        for outarr in out_arr:
            out = np.union1d(out, outarr)

        pdata = data.drop(out)
    
    return pdata
# feature banding
def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 6,'familysize'] = 6
    
    return pdata
# categorical value to numerical value
def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,4,5,6],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[11,10,0,1,2,3,4,5,6,7,8,9],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata
# drop feature
def drop_feature(data, feature_arr):
    arr = feature_arr + ['index'] 
    
    """for i in range(20):
        arr.append('Q'+chr(i+97)+'A')
    for i in range(20):
        arr.append('Q'+chr(i+97)+'E')
    for i in range(1,14):
        arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        arr.append(f'tp{i:02d}')"""
    

    pdata = data.drop(arr,axis=1)
    
    return pdata


In [4]:
def preprocess(data, datatype, feature_arr):
    
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata, feature_arr)
    # unify type of data
    pdata = pdata.astype(np.int)
    
    return pdata

In [29]:
def train_models(types ,params, arr, seed):
    
    assert len(params)==len(types), 'Num mismatch'
    
    if seed==False:
        train_data, val_data = train_test_split(train, test_size=0.14998, shuffle=False)
    else:
        train_data, val_data = train_test_split(train, test_size=0.14998, random_state=seed)
    
    model_list = []
    
    train_x = preprocess(train_data,'train',arr)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val_data,'test',arr)
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    for i in range(len(types)):
        
        if types[i]=='xgb':
            model = XGBClassifier(**params[i])
            model.fit(train_x,train_y, verbose=False)
        elif types[i]=='lgbm':
            model = LGBMClassifier(**params[i])
            model.fit(train_x,train_y, verbose=False)
        else : 
            print('Wrong model type')
        
        model_list.append(model)
    
    return model_list, val_x, val_y

In [35]:
def train_models_(types ,params, arrs, seed):
    
    assert len(params)==len(types), 'Num mismatch'
    
    if seed==False:
        train_data, val_data = train_test_split(train, test_size=0.14998, shuffle=False)
    else:
        train_data, val_data = train_test_split(train, test_size=0.14998, random_state=seed)
    
    model_list = []
    val_x_list = []
    val_y_list = []
    
    for i in range(len(types)):
        train_x = preprocess(train_data,'train',arrs[i])
        train_y = train_x['voted']
        train_x = train_x.drop(['voted'],axis=1)
        val_x = preprocess(val_data,'test',arrs[i])
        val_y = val_x['voted']
        val_x = val_x.drop(['voted'],axis=1)
        
        if types[i]=='xgb':
            model = XGBClassifier(**params[i])
            model.fit(train_x,train_y, verbose=False)
        elif types[i]=='lgbm':
            model = LGBMClassifier(**params[i])
            model.fit(train_x,train_y, verbose=False)
        else : 
            print('Wrong model type')
        
        model_list.append(model)
        val_x_list.append(val_x)
        val_y_list.append(val_y)
    
    return model_list, val_x_list, val_y_list

In [83]:
def ensemble_model(types, params, arrs, seed, r):
    
    model_list, val_x_list, val_y_list = train_models_(types, params, arrs, seed)
    
    score = np.zeros((val_x_list[0].shape[0],2))

    num_model = len(model_list)
    for i in range(num_model):
        score += model_list[i].predict_proba(val_x_list[i])*r[i]
        
    return roc_auc_score(val_y_list[0], score[:,1])

In [6]:
opt = { 'max_depth' : 8,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8}
lgbm_opt= { 'max_depth' : 5, 
            'num_leaves' : 359,
            'n_estimators' : 581,
            'learning_rate' : 0.05,
            'min_child_weight' : 37,
            'colsample_bytree' : 0.67,
            'feature_fraction' : 0.77,
            'bagging_fraction' : 0.91,
            'lambda_l1' : 0.41,
            'lambda_l2' : 0.09,
            'objective' : 'binary',
            'boosting_type' : 'dart'
            }

In [102]:
opt_arr_1 = ['QbA','QcA','QjA','QhA','QmA','QtA','QlA','QkA','QoA',
 'QsA','QrA', 'QeA',  'QnA','QgA', 'QdA'] + \
['QaE','QbE','QcE','QeE','QgE','QmE','QfE','QiE','QoE',
 'QlE','QrE','QpE','QnE','QtE','QsE','QkE'] + \
['tp01','tp04','tp05','tp09','tp10','tp02', 'tp08'] +\
['wf_01', 'wf_03'] + \
['wr_06','wr_09', 'wr_11','wr_07', 'wr_12','wr_13'] +\
['tp_score_4','hand']

opt_arr_2 = ['QjA','QaE', 'QbA', 'QeE', 'QfE', 'QhA', 'QiE', 'tp09', 'tp_score_4',
      'QbE', 'QtA', 'tp01', 'tp_score_2',
       'QmA', 'QmE', 'tp04',
       'QgE', 'QkA', 'QoE', 'QsA',
       'QlE', 'QoE', 'QrE', 'wf_03',
       'QoA', 'QlA', 'QsE', 'tp10',
       'QpE', 'tp08', 'wf_01',
       'QkE', 'QrA', 'wr_05', 'wr_09', 'wr_10', 'wr_11',
       'QgA', 'QtE', 'hand', 'tp06', 'QeA', 'wr_06', 'wr_12',
       'wr_03', 'wr_07', 'QdA', 'QdE', 'QnE', 'wr_13'
      ]

opt_arr_3 = ['QaE', 'QbA', 'QbE', 'QfE', 'QgE', 'QhA', 'QjA', 'QkA', 'QmA', 'QmE', 
             'QqA', 'QrA', 'QrE', 'QsA', 'QtA', 'tp04', 'tp09', 'wf_03', 'wr_13', 
             'tp_score_4', 'QdA', 'QiE', 'QlA', 'QlE', 'QoA', 'QpE', 'tp01', 
             'wr_06', 'QnE', 'QoE', 'QsE', 'hand', 'tp10', 'wf_01', 'wr_12', 
             'QeA', 'QeE', 'wr_11', 'QcA', 'tp02', 'wr_10']

lgbm_opt_arr = ['tp_score_2', 'QaE', 'QgE', 'QhA', 'QjA', 'QmA', 'QoA', 'QoE', 
                'QpE', 'QrE', 'tp09', 'wf_02', 'tp_score_4', 'QbA', 'QfE', 'QiA', 
                'QlE', 'QtA', 'tp04', 'QsA', 'tp01', 'wr_13', 'QmE', 'QsE', 'wf_03']

In [93]:
def ensemble_cv(types, params, arrs, seed, r_bound):
    
    num_model = len(types)
    assert num_model==len(r_bound)+1, 'bound mismatch'
    model_list, val_x_list, val_y_list = train_models_(types, params, arrs, seed)
    
    opt_auc = 0
    opt_r = []
    
    if num_model==2:
        for r1 in r_bound[0]:
            if (r1<=1):
                r = [round(r1,2), round(1-r1,2)]

                score = np.zeros((val_x_list[0].shape[0],2))
                for i in range(num_model):
                    score += model_list[i].predict_proba(val_x_list[i])*r[i]

                auc = roc_auc_score(val_y_list[0], score[:,1])

                print(f'{r}', end=' ')
                print('\033[31m' + f'{auc:.4f}' + '\033[0m', end=' ')

                if auc>opt_auc :
                    opt_auc = auc
                    opt_r = r
                    print('\033[34m'+f'{r}'+'\033[0m')
                else:
                    print()    
    
    elif num_model==3:
        for r1 in r_bound[0]:
            for r2 in r_bound[1]:
                if (r1+r2<=1):
                    r = [round(r1,2), round(r2,2), round(1-r1-r2,2)]

                    score = np.zeros((val_x_list[0].shape[0],2))
                    for i in range(num_model):
                        score += model_list[i].predict_proba(val_x_list[i])*r[i]

                    auc = roc_auc_score(val_y_list[0], score[:,1])

                    print(f'{r}', end=' ')
                    print('\033[31m' + f'{auc:.4f}' + '\033[0m', end=' ')

                    if auc>opt_auc :
                        opt_auc = auc
                        opt_r = r
                        print('\033[34m'+f'{r}'+'\033[0m')
                    else:
                        print()    
    
    else:
        print('Too many models')
    
    return opt_auc, opt_r

In [105]:
auc1, r1 = ensemble_cv(['xgb','lgbm'], [opt,lgbm_opt], [opt_arr_3,lgbm_opt_arr], False, [np.linspace(0.7,0.9,21)])

[0.7, 0.3] [31m0.7755[0m [34m[0.7, 0.3][0m
[0.71, 0.29] [31m0.7755[0m 
[0.72, 0.28] [31m0.7755[0m 
[0.73, 0.27] [31m0.7755[0m 
[0.74, 0.26] [31m0.7755[0m [34m[0.74, 0.26][0m
[0.75, 0.25] [31m0.7755[0m [34m[0.75, 0.25][0m
[0.76, 0.24] [31m0.7755[0m [34m[0.76, 0.24][0m
[0.77, 0.23] [31m0.7755[0m 
[0.78, 0.22] [31m0.7755[0m 
[0.79, 0.21] [31m0.7755[0m 
[0.8, 0.2] [31m0.7755[0m 
[0.81, 0.19] [31m0.7754[0m 
[0.82, 0.18] [31m0.7754[0m 
[0.83, 0.17] [31m0.7754[0m 
[0.84, 0.16] [31m0.7754[0m 
[0.85, 0.15] [31m0.7754[0m 
[0.86, 0.14] [31m0.7754[0m 
[0.87, 0.13] [31m0.7754[0m 
[0.88, 0.12] [31m0.7754[0m 
[0.89, 0.11] [31m0.7753[0m 
[0.9, 0.1] [31m0.7753[0m 


In [106]:
auc1, r1

(0.775491177146977, [0.76, 0.24])

In [None]:
(0.774750587188821, [0.56, 0.44])
(0.7751170406650272, [0.62, 0.38])