# Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import *

In [2]:
from tqdm import notebook

import eli5
from eli5.sklearn import PermutationImportance



# Read Data

In [3]:
train_original = pd.read_csv('./open data/train.csv')
test_original = pd.read_csv('./open data/test_x.csv')
train = train_original.copy()
test = test_original.copy()

# Data

In [27]:
# fill NA
def fill_married(data):
    pdata = data.copy()
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='10s'),'married' ] = 1
    pdata.loc[ (pdata.married==0)&(pdata.age_group=='20s'),'married' ] = 1
    pdata.loc[pdata.married==0,'married'] = 2
    
    return pdata

def fill_education(data):
    pdata = data.copy()
    pdata.loc[(pdata.education==0)&(pdata.age_group=='10s'),'education'] = 2
    pdata.loc[pdata.education==0,'education'] = 3

    return pdata

def fill_engnat(data):
    pdata = data.copy()
    pdata.loc[pdata.engnat==0,'engnat'] = 1
    
    return pdata

def fill_hand(data):
    pdata = data.copy()
    pdata.loc[pdata.hand==0,'hand'] = 1
    
    return pdata
# feature engineering
def Mach_score(data):
    pdata = data.copy()
    Answers = []
    for i in range(20):
        Answers.append('Q'+chr(97+i)+'A')
    reverse_col = ['QeA','QfA','QkA','QqA','QrA','QaA','QdA','QgA','QiA','QnA']
    for col in reverse_col:
        pdata[col] = -pdata[col]
    pdata['Mach_score'] = pdata[Answers].sum(axis=1)
    
    return pdata

def w_score(data):
    pdata = data.copy()
    wr = []
    wf = []
    for i in range(1,14):
        wr.append(f'wr_{i:02d}')
    for i in range(1,4):
        wf.append(f'wf_{i:02d}')
    
    pdata['wr'] = pdata[wr].sum(axis=1)
    pdata['wf'] = pdata[wf].sum(axis=1)
    
    return pdata

def TIPI(data):
    pdata = data.copy()
    pdata['tp_score_1'] = pdata['tp01'] - pdata['tp06']
    pdata['tp_score_2'] = pdata['tp07'] - pdata['tp02']
    pdata['tp_score_3'] = pdata['tp03'] - pdata['tp08']
    pdata['tp_score_4'] = pdata['tp09'] - pdata['tp04']
    pdata['tp_score_5'] = pdata['tp05'] - pdata['tp10']
    
    return pdata

# drop outlier
def drop_outlier(data, datatype):
    
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'
    
    pdata = data.copy()
    if datatype=='train':
        
        out_arr = []
        out_arr.append( np.where(data['familysize']>=16)[0] )
        out_arr.append( np.where(data.wr<=3)[0] )
        out_arr.append( np.where(data.wf>=2)[0] )

        out = []
        for outarr in out_arr:
            out = np.union1d(out, outarr)

        pdata = data.drop(out)
    
    return pdata
# feature banding
def age_band(data):
    pdata = data.copy()
    pdata['age_group'].replace(['10s','20s','30s','40s','50s','60s','+70s'],[1,2,3,4,5,5,5],inplace=True)
    
    return pdata

def E_band(data, num_band):
    pdata = data.copy()
    for i in range(20):
        col = 'Q'+chr(i+97)+'E'
        pdata[col] = pd.qcut(pdata[col],num_band)
        unique = pdata[col].unique()
        pdata[col].replace(unique,range(num_band),inplace=True)
        
    return pdata

def family_band(data):
    pdata = data.copy()
    pdata.loc[pdata.familysize >= 6,'familysize'] = 6
    
    return pdata
# categorical value to numerical value
def cat_gender(data):
    feature = 'gender'
    pdata = data.copy()
    pdata[feature].replace(['Male','Female'],[0,1],inplace=True)
    
    return pdata

def cat_race(data):
    feature = 'race'
    pdata = data.copy()
    unique = ['White', 'Asian', 'Other', 'Black', 'Native American', 'Arab', 'Indigenous Australian']
    pdata[feature].replace(unique,[0,1,2,3,4,5,6],inplace=True)
    
    return pdata

def cat_religion(data):
    feature = 'religion'
    pdata = data.copy()
    unique = ['Other', 'Hindu', 'Agnostic', 'Atheist', 'Christian_Other',
       'Christian_Catholic', 'Muslim', 'Buddhist', 'Christian_Protestant',
       'Jewish', 'Christian_Mormon', 'Sikh']
    pdata[feature].replace(unique,[11,10,0,1,2,3,4,5,6,7,8,9],inplace=True)
    
    return pdata

def cat_num(data):
    pdata = data.copy()
    pdata = cat_gender(pdata)
    pdata = cat_race(pdata)
    pdata = cat_religion(pdata)
    
    return pdata
# drop feature
def drop_feature(data, feature_arr):
    feature_arr += ['index'] 
    
    for i in range(20):
        feature_arr.append('Q'+chr(i+97)+'A')
    for i in range(20):
        feature_arr.append('Q'+chr(i+97)+'E')
    for i in range(1,14):
        feature_arr.append(f'wr_{i:02d}')
    for i in range(1,4):
        feature_arr.append(f'wf_{i:02d}')
    for i in range(1,11):
        feature_arr.append(f'tp{i:02d}')
    

    pdata = data.drop(feature_arr,axis=1)
    
    return pdata


In [5]:
def preprocess(data, datatype, feature_arr):
    
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata, feature_arr)
    # unify type of data
    pdata = pdata.astype(np.int)
    
    return pdata

# XGBoost

In [6]:
val = train.iloc[38703:,:] # 15%
train = train.iloc[:38703,:]

In [7]:
opt = { 'max_depth' : 8,
        'n_estimators' : 200,
        'learning_rate' : 0.010,
         'min_child_weight' : 6,
         'colsample_bytree' : 0.8,
        'verbosity' : 0,
        'objective' : 'binary:logistic',
        'booster' : 'gbtree',
        'subsample' : 0.8}

# Permutation Importance

In [7]:
def permutation(param):
    train_x = preprocess(train,'train')
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val,'test')
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    model = XGBClassifier(**param)
    model.fit(train_x,train_y,verbose=False)
    
    PI = PermutationImportance(model, random_state=0).fit(val_x,val_y)
    return eli5.show_weights(PI, feature_names=val_x.columns.tolist(), top=100)

In [11]:
permutation(opt)

Weight,Feature
0.0624  ± 0.0048,education
0.0491  ± 0.0029,age_group
0.0252  ± 0.0060,race
0.0073  ± 0.0026,engnat
0.0038  ± 0.0027,married
0.0036  ± 0.0021,religion
0.0018  ± 0.0005,urban
0.0014  ± 0.0009,QdE
0.0013  ± 0.0014,tp_score_4
0.0012  ± 0.0038,tp_score_3


# XGB Permutation Importance

In [47]:
def permutation_importance(param, th, bad, verbose):
    
    train_x = preprocess(train,'train',bad)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val,'test',bad)
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    model = XGBClassifier(**param)
    model.fit(train_x,train_y,verbose=False)
    
    results = {}
    
    pred = model.predict_proba(val_x)[:,1]
    results['base_score'] = roc_auc_score(val_y, pred)
    
    if verbose:
        print(f'Base auc {results["base_score"]:.6f}')
    
    for col in notebook.tqdm(val_x.columns):
        freezed = val_x[col].copy()
        results[col] = 0.0
        for i in range(5):
            val_x[col] = np.random.permutation(val_x[col])
            pred_per = model.predict_proba(val_x)[:,1]
            results[col] += roc_auc_score(val_y, pred_per)
            val_x[col] = freezed
        results[col] = np.divide(results[col],5)
        
        if verbose:
            print(f'{col} - {results[col]:.6f}')
        
    bad_features = [ k for k in results if results[k]>results['base_score']+th ]
    
    # bad_features.remove('base_score')
    
    return results, bad_features

In [74]:
res, bad_features = permutation_importance(opt, 0.0001, ['tp_score_4','wf'], True)

Base auc 0.770830


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16.0), HTML(value='')))

age_group - 0.705975
education - 0.711832
engnat - 0.763512
familysize - 0.769585
gender - 0.769902
hand - 0.770855
married - 0.762402
race - 0.744827
religion - 0.764277
urban - 0.769677
Mach_score - 0.770525
wr - 0.768145
tp_score_1 - 0.767509
tp_score_2 - 0.770803
tp_score_3 - 0.768957
tp_score_5 - 0.770749



In [73]:
bad_features

[]

# LGBM Permutation Importance

In [77]:
def permutation_importance_lgbm(param, th, bad, verbose):
    
    train_x = preprocess(train,'train',bad)
    train_y = train_x['voted']
    train_x = train_x.drop(['voted'],axis=1)
    val_x = preprocess(val,'test',bad)
    val_y = val_x['voted']
    val_x = val_x.drop(['voted'],axis=1)
    
    model = LGBMClassifier(**param)
    model.fit(train_x,train_y,verbose=False)
    
    results = {}
    
    pred = model.predict_proba(val_x)[:,1]
    results['base_score'] = roc_auc_score(val_y, pred)
    
    if verbose:
        print(f'Base auc {results["base_score"]:.6f}')
    
    for col in notebook.tqdm(val_x.columns):
        freezed = val_x[col].copy()
        results[col] = 0.0
        for i in range(5):
            val_x[col] = np.random.permutation(val_x[col])
            pred_per = model.predict_proba(val_x)[:,1]
            results[col] += roc_auc_score(val_y, pred_per)
            val_x[col] = freezed
        results[col] = np.divide(results[col],5)
        
        if verbose:
            print(f'{col} - {results[col]:.6f}')
        
    bad_features = [ k for k in results if results[k]>results['base_score']+th ]
    
    # bad_features.remove('base_score')
    
    return results, bad_features

In [78]:
lgbm_opt= { 'max_depth' : 5, 
            'num_leaves' : 359,
            'n_estimators' : 581,
            'learning_rate' : 0.05,
            'min_child_weight' : 37,
            'colsample_bytree' : 0.67,
            'feature_fraction' : 0.77,
            'bagging_fraction' : 0.91,
            'lambda_l1' : 0.41,
            'lambda_l2' : 0.09,
            'objective' : 'binary',
            'boosting_type' : 'dart'
            }

In [97]:
results, bad_features = permutation_importance_lgbm(lgbm_opt,0.0001,['tp_score_2','tp_score_4','hand'],True)

Base auc 0.769609


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))

age_group - 0.705584
education - 0.698176
engnat - 0.764185
familysize - 0.769010
gender - 0.768824
married - 0.762123
race - 0.746075
religion - 0.762199
urban - 0.769079
Mach_score - 0.768072
wr - 0.766597
wf - 0.769211
tp_score_1 - 0.768088
tp_score_3 - 0.769000
tp_score_5 - 0.769584



In [98]:
bad_features

[]