In [1]:
%matplotlib inline
import os, sys, time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from __future__ import print_function

In [2]:
# Read the raw train/test CSVs from ../data and report their dimensions.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
for message, frame in [('Shape of the training dataset: {}', df_train),
                       ('Shape of the test dataset: {}', df_test)]:
    print(message.format(frame.shape))
# Preview the first rows of the training data as the cell output.
df_train.head()

Shape of the training dataset: (595212, 59)
Shape of the test dataset: (892816, 58)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
# Is there class imbalance? Count the rows per value of 'target'.
# (The counts shown below are heavily skewed towards class 0.)
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [4]:
# what features are we looking at?
# there are 57 features in total (=59-2)
# Select features by name rather than by position, so the result does not
# depend on 'id' and 'target' being the first two columns.
all_features = [c for c in df_train.columns if c not in ('id', 'target')]

# Group the features by the last '_'-separated token of the column name:
#   '..._bin' -> bin_features, '..._cat' -> cat_features, anything else -> other_features.
bin_features = []
cat_features = []
other_features = []
for col in all_features:
    suffix = col.rsplit('_', 1)[-1]
    if suffix == 'bin':
        bin_features.append(col)
    elif suffix == 'cat':
        cat_features.append(col)
    else:
        other_features.append(col)

In [5]:
# number of features whose column name ends with the 'bin' token
len(bin_features)

17

In [6]:
# number of features whose column name ends with the 'cat' token
len(cat_features)

14

In [7]:
# how many missing values are we looking at?
# (this cell counts entries equal to -1, so it must run while df_train still
#  holds the raw -1 markers)
# percentage by classes: one row per feature, one column per target value.
# The column-wise sum is spelled out with axis=0 instead of np.sum(frame),
# whose implicit axis=None reduction is deprecated in recent pandas.
df_tmp = (
    df_train.groupby('target')[all_features]
    .apply(lambda grp: 100 * (grp == -1).sum(axis=0) / len(grp))
    .T
)
# percentage over all classes, attached as the 'total' column, then sort so
# the features with the most missing values come first
df_tmp = (
    df_tmp
    .join(((df_train[all_features] == -1).sum(axis=0) / df_train.shape[0] * 100).rename('total'))
    .sort_values(by='total', ascending=False)
)
df_tmp.head()

Unnamed: 0,0,1,total
ps_car_03_cat,69.358067,61.998709,69.089837
ps_car_05_cat,45.002772,38.960081,44.782531
ps_reg_03,18.25784,14.105283,18.10649
ps_car_14,7.130901,7.942288,7.160474
ps_car_07_cat,1.846673,4.139393,1.930237


## Dealing with missing values
Before we dive deep into the model, we need to deal with the missing values first. The simplest method is just to ignore all the samples that have **any** missing value in them. However, by doing so, we reduce the total sample size from ~600k to less than 100k.

Therefore, we need a better way to impute the missing values. Jingfei's current methodology is to treat the missing value as a new category, for the cases of binary and categorical features. (How about continuous features?)

Another possibility is to guess the missing values intelligently. To start, we can fill the missing binary and categorical values with the most frequent entries, and fill the missing continuous values with their medians.

There is much more that can be done with regard to data imputation: for example, training regression models that predict the missing variable from the other variables. However, we need to keep in mind how to carry the regression over to test data or new data with missing entries.

We have to pay special attention to those features with many missing values, namely, `ps_car_03_cat`, `ps_car_05_cat`, `ps_reg_03`, etc.

### Metric

When comparing different imputation methods, we need to have a unified metric to evaluate their success. The simplest metric can be the testing error - the same way we evaluate different models, but in this case the model should stay the same. The default choice will be xgboost.

In [8]:
# replace -1 with nan for easier handling
# np.nan is used directly: the `np.float` alias was removed in NumPy 1.24, so
# np.float('NaN') raises AttributeError there.
# Note: only df_train is converted here; df_test still encodes missing as -1.
df_train = df_train.replace({-1: np.nan})

### Method 1: mode and median
In the simplest case, let's fill the missing binary / categorical values with their mode, and continuous variables with their median.

In [9]:
# Per-feature fill values computed from the training data only, kept in a dict
# so that the very same values can be applied to the test data.
fills = {}  # make the dict so that we can fill test data
for c in all_features:
    if c.endswith(('bin', 'cat')):
        # binary / categorical feature: most frequent value
        fills[c] = df_train[c].mode()[0]
    else:
        # every other feature: median
        fills[c] = df_train[c].median()
df_train_impute_1 = df_train.fillna(fills)
# df_test still uses -1 as its missing-value marker (only df_train was
# converted to NaN above), so fillna() alone would change nothing. Convert a
# copy to NaN first; df_test itself is left untouched.
df_test_impute_1 = df_test.replace({-1: np.nan}).fillna(fills)

In [10]:
%run '../py/models_ccy.py'
clf = my_xgb(df_train_impute_1[all_features], 
             df_train_impute_1['target'].astype(int, inplace=True))
clf.input_scaling()

Splitted training/test, and applied standard scaler.
Total number of training samples: 476169


In [11]:
# Train the model. my_xgb comes from ../py/models_ccy.py (not shown here);
# judging from the log below it presumably trains with early stopping on
# 'valid-gini' - TODO confirm against that module.
# The call's return value is echoed as the cell output.
clf.fit()



class weight is: 26
[0]	train-gini:0.227972	valid-gini:0.199426
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.410172	valid-gini:0.278516
[200]	train-gini:0.496464	valid-gini:0.273532
Stopping. Best iteration:
[108]	train-gini:0.417843	valid-gini:0.279451

class weight is: 26
[0]	train-gini:0.235514	valid-gini:0.177081
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.414915	valid-gini:0.264368
[200]	train-gini:0.494858	valid-gini:0.261383
Stopping. Best iteration:
[102]	train-gini:0.416635	valid-gini:0.264705

class weight is: 26
[0]	train-gini:0.232941	valid-gini:0.19843
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.409771	valid-gini:0.26

[[0.49275228,
  0.3952139,
  0.44457743,
  0.38601995,
  0.51504874,
  0.24977329,
  0.346508,
  0.22830074,
  0.41809183,
  0.22086844,
  0.3818714,
  0.3290292,
  0.44831023,
  0.36122799,
  0.54664618,
  0.41294217,
  0.43500444,
  0.29922941,
  0.40096989,
  0.45494899,
  0.32034487,
  0.22252688,
  0.44073802,
  0.58820611,
  0.52586806,
  0.47213647,
  0.3915655,
  0.27095994,
  0.49400902,
  0.52204895,
  0.49422154,
  0.40196195,
  0.61050409,
  0.69032371,
  0.4787623,
  0.46116045,
  0.34536833,
  0.63142592,
  0.44585261,
  0.48952681,
  0.46197391,
  0.50591481,
  0.37942967,
  0.55994046,
  0.48449552,
  0.60591805,
  0.45757186,
  0.37442774,
  0.61657226,
  0.57476634,
  0.63169938,
  0.54840487,
  0.43650064,
  0.47918549,
  0.34840107,
  0.71812296,
  0.44663131,
  0.63592368,
  0.44391158,
  0.45548919,
  0.3482686,
  0.52125704,
  0.56534058,
  0.65490329,
  0.5542407,
  0.53777122,
  0.40371272,
  0.48724622,
  0.36859149,
  0.50321221,
  0.56379879,
  0.32098237,
 

## Feature engineering