In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score

In [2]:
# Read both raw CSV files once; the *_original frames are never touched again,
# and every later cell works on the `train` / `test` copies made here.
train_original, test_original = (
    pd.read_csv(csv_path)
    for csv_path in ('./open data/train.csv', './open data/test_x.csv')
)
train, test = train_original.copy(), test_original.copy()

In [3]:
# fill NA
def fill_married(data):
    """Replace the 0 code in ``married`` with a value chosen by age group.

    Rows whose ``married`` equals 0 become 1 when ``age_group`` is '10s' or
    '20s', and 2 for every other age group. All other rows are left as-is.

    Returns a modified copy; ``data`` itself is not changed.
    """
    filled = data.copy()
    is_zero = filled['married'] == 0
    is_young = filled['age_group'].isin(['10s', '20s'])
    filled.loc[is_zero & is_young, 'married'] = 1
    filled.loc[is_zero & ~is_young, 'married'] = 2
    return filled

def fill_education(data):
    """Replace the 0 code in ``education`` with a value chosen by age group.

    Rows whose ``education`` equals 0 become 2 when ``age_group`` is '10s'
    and 3 otherwise. Rows with a non-zero ``education`` are untouched.

    Returns a modified copy; ``data`` itself is not changed.
    """
    filled = data.copy()
    is_zero = filled['education'].eq(0)
    is_teen = filled['age_group'].eq('10s')
    filled.loc[is_zero & is_teen, 'education'] = 2
    filled.loc[is_zero & ~is_teen, 'education'] = 3
    return filled

def fill_engnat(data):
    """Return a copy of ``data`` in which every 0 in ``engnat`` is replaced by 1."""
    filled = data.copy()
    engnat = filled['engnat']
    # mask() swaps in 1 only where the condition holds and keeps other values.
    filled['engnat'] = engnat.mask(engnat == 0, 1)
    return filled

def fill_hand(data):
    """Return a copy of ``data`` in which every 0 in ``hand`` is replaced by 1."""
    filled = data.copy()
    hand = filled['hand']
    # mask() swaps in 1 only where the condition holds and keeps other values.
    filled['hand'] = hand.mask(hand == 0, 1)
    return filled
# feature engineering
def Mach_score(data):
    """Add a ``Mach_score`` column: the signed sum of the 20 ``Q?A`` answers.

    Ten of the answer columns (QeA, QfA, QkA, QqA, QrA, QaA, QdA, QgA, QiA,
    QnA) are negated before summing. The negated values are also what the
    returned frame holds in those columns.

    Returns a modified copy; ``data`` itself is not changed.
    """
    scored = data.copy()
    answer_cols = [f'Q{letter}A' for letter in 'abcdefghijklmnopqrst']
    negated_cols = ['QeA', 'QfA', 'QkA', 'QqA', 'QrA',
                    'QaA', 'QdA', 'QgA', 'QiA', 'QnA']
    # Flip the sign of the reverse-keyed items in one vectorised assignment.
    scored[negated_cols] = -scored[negated_cols]
    scored['Mach_score'] = scored[answer_cols].sum(axis=1)
    return scored

def w_score(data):
    """Add row-wise totals of the ``wr_*`` and ``wf_*`` columns.

    ``wr`` is the sum of ``wr_01`` .. ``wr_13`` and ``wf`` is the sum of
    ``wf_01`` .. ``wf_03``.

    Returns a modified copy; ``data`` itself is not changed.
    """
    scored = data.copy()
    wr_cols = [f'wr_{k:02d}' for k in range(1, 14)]
    wf_cols = [f'wf_{k:02d}' for k in range(1, 4)]
    return scored.assign(
        wr=scored[wr_cols].sum(axis=1),
        wf=scored[wf_cols].sum(axis=1),
    )

def TIPI(data):
    """Add five ``tp_score_N`` columns, each the difference of two ``tp`` items.

    tp_score_1 = tp01 - tp06, tp_score_2 = tp07 - tp02,
    tp_score_3 = tp03 - tp08, tp_score_4 = tp09 - tp04,
    tp_score_5 = tp05 - tp10.

    Returns a modified copy; ``data`` itself is not changed.
    """
    scored = data.copy()
    item_pairs = [
        ('tp01', 'tp06'),
        ('tp07', 'tp02'),
        ('tp03', 'tp08'),
        ('tp09', 'tp04'),
        ('tp05', 'tp10'),
    ]
    for number, (minuend, subtrahend) in enumerate(item_pairs, start=1):
        scored[f'tp_score_{number}'] = scored[minuend] - scored[subtrahend]
    return scored

# drop outlier
def drop_outlier(data, datatype):
    """Remove outlier rows from training data; leave test data untouched.

    A training row is dropped when any of these holds:
    ``familysize >= 16``, ``wr <= 3`` or ``wf >= 2``.

    Parameters
    ----------
    data : DataFrame
        Must contain ``familysize``, ``wr`` and ``wf`` when ``datatype`` is
        'train'.
    datatype : str
        Either 'train' (filter rows) or 'test' (return a plain copy).

    Returns
    -------
    DataFrame
        A new frame; ``data`` itself is not changed.

    Raises
    ------
    AssertionError
        If ``datatype`` is neither 'train' nor 'test'.
    """
    assert datatype == 'train' or datatype=='test', 'Wrong data type given'

    if datatype == 'test':
        return data.copy()

    # Select rows with a boolean mask instead of converting positions from
    # np.where() into labels for DataFrame.drop(): the two only coincide for
    # a default RangeIndex, so the old approach dropped the wrong rows (or
    # raised KeyError) on any frame that had been filtered or re-indexed.
    is_outlier = (
        (data['familysize'] >= 16)
        | (data['wr'] <= 3)
        | (data['wf'] >= 2)
    )
    return data.loc[~is_outlier].copy()
# feature banding
def age_band(data):
    """Encode ``age_group`` labels as integer bands.

    '10s' -> 1, '20s' -> 2, '30s' -> 3, '40s' -> 4, and '50s', '60s', '+70s'
    all -> 5. Values that are not one of these labels are passed through
    unchanged.

    Returns a modified copy; ``data`` itself is not changed.
    """
    band_of = {'10s': 1, '20s': 2, '30s': 3, '40s': 4,
               '50s': 5, '60s': 5, '+70s': 5}
    pdata = data.copy()
    # Assign the mapped column back explicitly. The previous
    # `pdata[col].replace(..., inplace=True)` mutated a temporary Series,
    # which silently does nothing once pandas Copy-on-Write is active.
    pdata['age_group'] = pdata['age_group'].map(
        lambda label: band_of.get(label, label)
    )

    return pdata

def E_band(data, num_band):
    """Replace each ``Q?E`` column (QaE .. QtE) by its quantile-band code.

    Each column is cut into ``num_band`` equal-frequency bins with
    ``pd.qcut`` and replaced by the bin number, 0 for the lowest-valued bin
    up to ``num_band - 1`` for the highest.

    Parameters
    ----------
    data : DataFrame
        Must contain the 20 columns ``QaE`` .. ``QtE``.
    num_band : int
        Number of quantile bins per column.

    Returns
    -------
    DataFrame
        A modified copy; ``data`` itself is not changed.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the quantile edges of a column are
        not unique.
    """
    pdata = data.copy()
    for letter in 'abcdefghijklmnopqrst':
        col = f'Q{letter}E'
        # labels=False yields ordinal bin numbers directly. The previous code
        # numbered the intervals in order of first appearance (`unique()`),
        # so codes were not monotone in the value and could differ between
        # the train and test frames.
        pdata[col] = pd.qcut(pdata[col], num_band, labels=False)

    return pdata

def family_band(data):
    """Return a copy of ``data`` with ``familysize`` capped at 4.

    Every value of 4 or more becomes 4; smaller values are kept.
    """
    capped = data['familysize'].clip(upper=4)
    return data.assign(familysize=capped)
# categorical value to numerical value
def cat_gender(data):
    """Encode ``gender`` as an integer: 'Male' -> 0, 'Female' -> 1.

    Values other than these two labels are passed through unchanged.

    Returns a modified copy; ``data`` itself is not changed.
    """
    feature = 'gender'
    code_of = {'Male': 0, 'Female': 1}
    pdata = data.copy()
    # Assign the mapped column back explicitly. The previous chained
    # `pdata[feature].replace(..., inplace=True)` mutated a temporary Series
    # and is a silent no-op under pandas Copy-on-Write.
    pdata[feature] = pdata[feature].map(lambda label: code_of.get(label, label))

    return pdata

def cat_race(data):
    """Encode ``race`` as an integer group.

    'White' -> 0, 'Asian' -> 1, 'Black' -> 3, and 'Other',
    'Native American', 'Arab', 'Indigenous Australian' all -> 2.
    Values outside this list are passed through unchanged.

    Returns a modified copy; ``data`` itself is not changed.
    """
    feature = 'race'
    code_of = {
        'White': 0,
        'Asian': 1,
        'Other': 2,
        'Black': 3,
        'Native American': 2,
        'Arab': 2,
        'Indigenous Australian': 2,
    }
    pdata = data.copy()
    # Assign the mapped column back explicitly. The previous chained
    # `pdata[feature].replace(..., inplace=True)` mutated a temporary Series
    # and is a silent no-op under pandas Copy-on-Write.
    pdata[feature] = pdata[feature].map(lambda label: code_of.get(label, label))

    return pdata

def cat_religion(data):
    """Encode ``religion`` as an integer group.

    'Atheist' -> 0, 'Agnostic' -> 1, the three listed Christian labels
    ('Christian_Other', 'Christian_Catholic', 'Christian_Protestant') -> 2,
    and every other listed label (including 'Christian_Mormon') -> 3.
    Values outside the list are passed through unchanged.

    Returns a modified copy; ``data`` itself is not changed.
    """
    feature = 'religion'
    code_of = {
        'Other': 3,
        'Hindu': 3,
        'Agnostic': 1,
        'Atheist': 0,
        'Christian_Other': 2,
        'Christian_Catholic': 2,
        'Muslim': 3,
        'Buddhist': 3,
        'Christian_Protestant': 2,
        'Jewish': 3,
        'Christian_Mormon': 3,
        'Sikh': 3,
    }
    pdata = data.copy()
    # Assign the mapped column back explicitly. The previous chained
    # `pdata[feature].replace(..., inplace=True)` mutated a temporary Series
    # and is a silent no-op under pandas Copy-on-Write.
    pdata[feature] = pdata[feature].map(lambda label: code_of.get(label, label))

    return pdata

def cat_num(data):
    """Apply the gender, race and religion encoders, in that order.

    Returns a new frame; ``data`` itself is not changed.
    """
    encoded = data.copy()
    for encoder in (cat_gender, cat_race, cat_religion):
        encoded = encoder(encoded)
    return encoded
# drop feature
def drop_feature(data):
    """Return ``data`` without the raw survey columns and the id column.

    Dropped columns: ``index``, ``wf``, ``QaA``..``QtA``, ``wr_01``..``wr_13``,
    ``wf_01``..``wf_03``, ``tp01``..``tp10`` and ``QaE``..``QtE``.
    A KeyError is raised by pandas if any of them is missing.
    """
    letters = 'abcdefghijklmnopqrst'
    unwanted = (
        ['index', 'wf']
        + [f'Q{letter}A' for letter in letters]
        + [f'wr_{k:02d}' for k in range(1, 14)]
        + [f'wf_{k:02d}' for k in range(1, 4)]
        + [f'tp{k:02d}' for k in range(1, 11)]
        + [f'Q{letter}E' for letter in letters]
    )
    return data.drop(columns=unwanted)


In [4]:
def preprocess(data, datatype):
    """Run the full feature pipeline on a raw train or test frame.

    Steps, in order: fill the 0 codes (married, education, engnat, hand),
    add engineered scores (Mach_score, wr/wf totals, TIPI differences),
    drop outlier rows (training data only), band age / family size / Q?E
    columns, encode the categorical columns, drop the raw columns, and cast
    everything to integers.

    Parameters
    ----------
    data : DataFrame
        Raw frame as read from the CSV file.
    datatype : str
        'train' or 'test'; forwarded to ``drop_outlier``.

    Returns
    -------
    DataFrame
        A new all-integer frame; ``data`` itself is not changed.
    """
    pdata = data.copy()
    # fill NA
    pdata = fill_married(pdata)
    pdata = fill_education(pdata)
    pdata = fill_engnat(pdata)
    pdata = fill_hand(pdata)
    # feature engineering
    pdata = Mach_score(pdata)
    pdata = w_score(pdata)
    pdata = TIPI(pdata)
    # drop outlier
    pdata = drop_outlier(pdata,datatype)
    # feature banding
    pdata = age_band(pdata)
    pdata = family_band(pdata)
    pdata = E_band(pdata,10)
    # categorical value to numerical value
    pdata = cat_num(pdata)
    # drop feature
    pdata = drop_feature(pdata)
    # unify type of data
    # `np.int` was only an alias of the builtin and was removed in
    # NumPy 1.24, where it raises AttributeError; use `int` directly.
    pdata = pdata.astype(int)

    return pdata

In [5]:
def train_auc(model_arr, data, label):
    """ROC AUC of the averaged class-1 probability of several models.

    Each model's ``predict_proba(data)`` output (two columns) is summed,
    divided by the number of models, and the second column is scored
    against ``label`` with ``roc_auc_score``.
    """
    proba_total = np.zeros((data.shape[0], 2))
    for model in model_arr:
        proba_total += model.predict_proba(data)
    mean_positive = np.divide(proba_total, len(model_arr))[:, 1]
    return roc_auc_score(label, mean_positive)

In [6]:
def submission(test_data, model_arr, file_name, index=None, out_dir='./submission/'):
    """Average the models' class-1 probabilities and write a submission CSV.

    Parameters
    ----------
    test_data : DataFrame
        Preprocessed features passed to each model's ``predict_proba``.
    model_arr : sequence
        Fitted models exposing ``predict_proba`` with two output columns.
    file_name : str
        File name without extension; ``.csv`` is appended.
    index : array-like, optional
        Row ids written to the ``index`` column. When omitted, falls back to
        the module-level ``test['index']`` (the original behaviour).
    out_dir : str, optional
        Target directory; created if it does not exist.

    Returns
    -------
    numpy.ndarray
        The averaged class-1 probabilities, one per row of ``test_data``.

    Raises
    ------
    ValueError
        If ``model_arr`` is empty, or ``index`` and the predictions differ in
        length.
    """
    import os

    num_model = len(model_arr)
    if num_model == 0:
        # Dividing by zero here would silently write a file full of NaN.
        raise ValueError('model_arr must contain at least one fitted model')

    score = np.zeros((test_data.shape[0], 2))
    for model in model_arr:
        score += model.predict_proba(test_data)
    pred = np.divide(score, num_model)[:, 1]

    if index is None:
        # Backward-compatible default: ids come from the global raw test frame.
        index = test['index']
    # Use plain values so pandas does not try to align on the Series index.
    index = np.asarray(index)
    if index.shape[0] != pred.shape[0]:
        raise ValueError(
            f'index has {index.shape[0]} rows but {pred.shape[0]} predictions were made'
        )

    os.makedirs(out_dir, exist_ok=True)
    result = DataFrame({'index': index, 'voted': pred})
    result.to_csv(os.path.join(out_dir, file_name + '.csv'), index=False)

    return pred

# RANSAC

In [7]:
# Keyword arguments unpacked into XGBClassifier(**param) by ransac().
param = dict(
    max_depth=8,
    n_estimators=200,
    learning_rate=0.1,
    min_child_weight=6,
    colsample_bytree=0.8,
    verbosity=0,
    objective='binary:logistic',
    booster='gbtree',
    subsample=0.8,
)

In [8]:
def ransac(num_iter, num_sample, param, random_state=None):
    """Fit many small-sample XGBoost models and keep the best-scoring one.

    On every iteration ``num_sample`` rows are drawn from the preprocessed
    training frame, an ``XGBClassifier(**param)`` is fitted on them, and the
    model is scored by ROC AUC on the *entire* preprocessed training frame
    (which includes the sampled rows). The model with the highest score so
    far is kept. Progress is printed, with each new best score in red.

    Parameters
    ----------
    num_iter : int
        Number of sample/fit/score rounds.
    num_sample : int
        Rows drawn (without replacement) per round.
    param : dict
        Keyword arguments for ``XGBClassifier``.
    random_state : int or None, optional
        Seed for the row sampling so a run can be repeated. ``None`` (the
        default) keeps the original unseeded behaviour. This does not seed
        XGBoost itself; set ``random_state`` inside ``param`` for that.

    Returns
    -------
    XGBClassifier or None
        The best model, or ``None`` if no round produced an AUC above 0
        (for example when ``num_iter`` is 0).
    """
    # One generator shared by all rounds: every round gets a different
    # sample, yet the whole sequence is reproducible for a given seed.
    rng = np.random.RandomState(random_state)

    best = None
    consensus = 0

    preprocessed = preprocess(train,'train')
    val_y = preprocessed['voted']
    val_x = preprocessed.drop(['voted'],axis=1)

    for i in range(num_iter):
        print(f'{i}', end=' ')
        train_sample = preprocessed.sample(n=num_sample, random_state=rng, axis=0)

        train_y = train_sample['voted']
        train_x = train_sample.drop(['voted'],axis=1)

        model = XGBClassifier(**param)
        model.fit(train_x, train_y, verbose=False)

        auc = train_auc([model],val_x,val_y)

        if (consensus < auc):
            consensus = auc
            best = model
            print('\033[31m' + f'{auc:.6f}' + '\033[0m')
        else:
            print(f'{auc:.6f}')

    return best

In [40]:
# Run 200 rounds of 1000-row samples and keep the best-scoring model.
# `param` must be passed by keyword: a positional argument after keyword
# arguments is a SyntaxError.
ransac_model = ransac(num_iter=200, num_sample=1000, param=param)

0 [31m0.762431[0m
1 0.760228
2 0.759583
3 0.760719
4 0.757064
5 0.757751
6 0.761758
7 0.757790
8 0.759629
9 [31m0.763490[0m
10 0.761580
11 0.761660
12 0.762541
13 0.761518
14 0.754403
15 0.762266
16 0.758283
17 0.761585
18 0.760387
19 0.760291
20 0.760000
21 0.760883
22 0.761406
23 0.761381
24 0.757761
25 0.759871
26 0.757934
27 0.761508
28 0.759333
29 0.760935
30 0.759978
31 0.760830
32 0.755960
33 0.759678
34 0.760109
35 0.759323
36 0.759856
37 0.756617
38 0.759054
39 0.759449
40 0.760056
41 0.762122
42 0.762468
43 0.759640
44 0.757213
45 0.761593
46 0.759633
47 0.762071
48 0.760818
49 0.761821
50 0.760405
51 0.763151
52 0.758293
53 0.761369
54 0.762555
55 0.761665
56 0.760186
57 0.759711
58 0.758415
59 0.759490
60 0.759560
61 0.759374
62 0.759700
63 0.756847
64 0.760245
65 0.761260
66 0.760347
67 0.756851
68 0.759542
69 0.756289
70 0.756910
71 0.760083
72 0.761856
73 0.761873
74 0.761480
75 0.760382
76 0.760989
77 0.759864
78 0.755864
79 0.757858
80 0.761213
81 0.762188
82 0.762

In [41]:
# Preprocess the raw test frame and write the selected model's predictions
# to ./submission/ransac_.csv; the returned probabilities are displayed.
submission(
    test_data=preprocess(test, 'test'),
    model_arr=[ransac_model],
    file_name='ransac_',
)

array([0.60888022, 0.80811137, 0.53799826, ..., 0.33056721, 0.45142457,
       0.64186668])

In [5]:
# Display the column labels of the working training frame.
train.columns

Index(['index', 'QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA',
       'QeE', 'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE', 'QjA',
       'QjE', 'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA', 'QnE', 'QoA',
       'QoE', 'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE', 'QsA', 'QsE', 'QtA',
       'QtE', 'age_group', 'education', 'engnat', 'familysize', 'gender',
       'hand', 'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04',
       'tp05', 'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'voted',
       'wf_01', 'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05',
       'wr_06', 'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'],
      dtype='object')

In [14]:
# Collect every training row that answered 7 on any of tp01..tp07. A row that
# matches several columns appears once per matching column.
# NOTE(review): range(1, 8) stops at tp07 although tp08..tp10 exist - confirm
# that this is intended.
# Build all slices first and concatenate once: growing a frame with concat
# inside the loop copies the accumulated rows on every pass, and seeding it
# with an empty DataFrame() relies on deprecated empty-entry dtype handling.
df = pd.concat(
    [train[train['tp'+f'{i:02d}']==7] for i in range(1,8)],
    axis=0,
)
df

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
96,96,1.0,561,1.0,2176,5.0,975,1.0,668,2.0,...,0,1,0,1,1,0,1,0,1,1
153,153,5.0,499,5.0,1557,5.0,1009,1.0,2636,5.0,...,1,1,1,1,1,0,1,1,1,1
168,168,3.0,1455,1.0,3144,1.0,973,1.0,1265,3.0,...,1,1,0,1,1,0,1,0,1,1
253,253,3.0,514,2.0,2151,3.0,2532,1.0,2097,2.0,...,1,0,1,1,1,1,1,0,1,1
320,320,2.0,1328,1.0,2907,5.0,2420,1.0,1770,5.0,...,1,1,1,0,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45106,45106,3.0,360,5.0,861,3.0,886,3.0,2507,1.0,...,0,0,0,1,1,0,1,0,1,0
45194,45194,1.0,775,5.0,1729,5.0,928,1.0,1271,1.0,...,1,1,0,1,1,0,1,0,1,1
45224,45224,1.0,197,5.0,401,5.0,601,1.0,452,1.0,...,1,1,0,1,1,1,1,1,1,1
45382,45382,2.0,535,4.0,1870,4.0,889,1.0,1126,5.0,...,1,1,0,1,1,0,1,1,1,1
