EDA on Springleaf Kaggle Challenge
===

[1. Read the Data](#Read-the-Data)  
[2. Data Overview](#Data-Overview)  
[3. Dataset Cleaning](#Dataset-Cleaning)  
[4. Determine Types](#Determin-Types)  
[5. Go Through](#Go-Through)  
[6. Categorical Features](#Categorical-Features)  

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import seaborn

In [2]:
# label the text on figure
def autolabel(arrayA):
    """Write every value of a 2-D array onto the current figure as a text label.

    Parameters
    ----------
    arrayA : array-like
        Two-dimensional collection of numbers; it is converted with ``np.array``.
        The value at ``[i, j]`` is drawn at x=j, y=i.
    """
    arrayA = np.array(arrayA)
    for i in range(arrayA.shape[0]):
        for j in range(arrayA.shape[1]):
            # "%.2f" keeps two decimals; the previous "%2f" was only a minimum
            # field width and printed six decimals per cell.
            plt.text(j, i, "%.2f" % arrayA[i, j], ha='center', va='bottom', color='w')

# plot per-class histograms of one feature
def hist_it(feat):
    """Overlay normalized, unit-width histograms of `feat` for target 0 and target 1.

    Parameters
    ----------
    feat : pandas.Series
        Feature values; it is indexed with boolean masks built from the
        notebook-level target series `Y`.
    """
    plt.figure(figsize=(16, 4))
    # One bin per integer step, shared by both classes so the bars line up.
    bins = range(int(feat.min()), int(feat.max() + 2))
    # `density` replaces the `normed` keyword, which current matplotlib no longer accepts.
    feat[Y == 0].hist(bins=bins, density=True, alpha=0.8)
    feat[Y == 1].hist(bins=bins, density=True, alpha=0.5)
    plt.ylim((0, 1))

def gt_matrix(feats, sz=16):
    """Plot, for every pair of features, the share of rows where one exceeds the other.

    Parameters
    ----------
    feats : sequence of str
        Column names of the notebook-level `train` frame to compare pairwise.
    sz : int, default 16
        Width and height of the square figure.
    """
    fractions = []
    for row_idx, row_feat in enumerate(feats):
        row = []
        for col_idx, col_feat in enumerate(feats):
            # Only rows where both features are present take part in the comparison.
            both_present = train[row_feat].notnull() & train[col_feat].notnull()
            left = train.loc[both_present, row_feat].values
            right = train.loc[both_present, col_feat].values
            # On and below the diagonal ties count as hits; above it they do not.
            hits = (left >= right) if row_idx >= col_idx else (left > right)
            row.append(hits.mean())
        fractions.append(row)

    tick_positions = range(len(feats))
    plt.figure(figsize=(sz, sz))
    plt.imshow(fractions, interpolation='None')
    plt.xticks(tick_positions, feats, rotation=90)
    plt.yticks(tick_positions, feats, rotation=0)
    autolabel(fractions)

In [3]:
def hist_it1(feat):
    """Overlay normalized 100-bin histograms of `feat` for target 0 and target 1.

    Parameters
    ----------
    feat : pandas.Series
        Feature values; it is indexed with boolean masks built from the
        notebook-level target series `Y`.
    """
    plt.figure(figsize=(16, 4))
    # Both classes share the full value range of the feature so the bins coincide.
    value_range = (feat.min(), feat.max())
    # `density` replaces the `normed` keyword, which current matplotlib no longer accepts.
    feat[Y == 0].hist(bins=100, range=value_range, density=True, alpha=0.5)
    feat[Y == 1].hist(bins=100, range=value_range, density=True, alpha=0.5)
    plt.ylim((0, 1))

## Read the Data

In [4]:
# load both competition files
train = pd.read_csv('data/springleaf/train.csv')
test = pd.read_csv('data/springleaf/test.csv')

# keep the target labels and the test IDs under their own names
Y = train['target']
test_ID = test['ID']

## Data Overview

In [5]:
# report (rows, columns) for both frames
for label, frame in (('Train shape :', train), ('Test shape :', test)):
    print(label, frame.shape)

Train shape : (145231, 1934)
Test shape : (145232, 1933)


In [6]:
# preview the first five rows of the training frame
train.head()

Unnamed: 0,ID,VAR_0001,VAR_0002,VAR_0003,VAR_0004,VAR_0005,VAR_0006,VAR_0007,VAR_0008,VAR_0009,...,VAR_1926,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933,VAR_1934,target
0,2,H,224,0,4300,C,0.0,0.0,False,False,...,98,98,998,999999998,998,998,9998,9998,IAPS,0
1,4,H,7,53,4448,B,1.0,0.0,False,False,...,98,98,998,999999998,998,998,9998,9998,IAPS,0
2,5,H,116,3,3464,C,0.0,0.0,False,False,...,98,98,998,999999998,998,998,9998,9998,IAPS,0
3,7,H,240,300,3200,C,0.0,0.0,False,False,...,98,98,998,999999998,998,998,9998,9998,RCC,0
4,8,R,72,261,2000,N,0.0,0.0,False,False,...,98,98,998,999999998,998,998,9998,9998,BRANCH,1


In [7]:
# preview the first five rows of the test frame
test.head()

Unnamed: 0,ID,VAR_0001,VAR_0002,VAR_0003,VAR_0004,VAR_0005,VAR_0006,VAR_0007,VAR_0008,VAR_0009,...,VAR_1925,VAR_1926,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933,VAR_1934
0,1,R,360,25,2251,B,2.0,2.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS
1,3,R,74,192,3274,C,2.0,3.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS
2,6,R,21,36,3500,C,1.0,1.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS
3,9,R,8,2,1500,B,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS
4,10,H,91,39,84500,C,8.0,3.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS


In [8]:
# number of NaNs in each row (object) of train; show the first 15 rows
train.isnull().sum(axis=1).head(15)

0     25
1     19
2     24
3     24
4     24
5     24
6     24
7     24
8     16
9     24
10    22
11    24
12    17
13    24
14    24
dtype: int64

In [9]:
# number of NaNs in each column of train; show the first 15 columns
train.isnull().sum(axis=0).head(15)

ID           0
VAR_0001     0
VAR_0002     0
VAR_0003     0
VAR_0004     0
VAR_0005     0
VAR_0006    56
VAR_0007    56
VAR_0008    56
VAR_0009    56
VAR_0010    56
VAR_0011    56
VAR_0012    56
VAR_0013    56
VAR_0014    56
dtype: int64

## Dataset Cleaning

### Remove constant features
#### It is convenient to concatenate train and test into one dataframe and do all feature engineering on it.

In [10]:
# stack train and test row-wise into one frame
# (test has no 'target' column, so that column is NaN for the test rows)
trainset = pd.concat([train, test], axis=0)

In [11]:
# dropna=False makes nunique treat NaN as a distinct value
# nunique: count the distinct values in each column
feats_counts = train.nunique(dropna=False)

In [12]:
# the columns with a single distinct value are constant -> should be removed
feats_counts.sort_values().head(10)

VAR_0213    1
VAR_0207    1
VAR_0840    1
VAR_0847    1
VAR_1428    1
VAR_1165    2
VAR_0438    2
VAR_1164    2
VAR_1163    2
VAR_1162    2
dtype: int64

In [13]:
# columns whose distinct-value count (NaN included) is exactly one
constant_features = feats_counts[feats_counts == 1].index.tolist()
print(constant_features)

# remove them from the combined frame, in place
trainset.drop(columns=constant_features, inplace=True)

['VAR_0207', 'VAR_0213', 'VAR_0840', 'VAR_0847', 'VAR_1428']


### Remove duplicated features

In [16]:
# fill every missing value in the combined frame with the literal string 'NaN' (in place)
trainset.fillna('NaN', inplace = True)

In [None]:
# integer-encode every feature: equal values within a column get the same code
train_enc = pd.DataFrame(index=train.index)

# column names come from the cleaned combined frame, values from train
for col in tqdm_notebook(trainset.columns):
    train_enc[col] = pd.factorize(train[col])[0]

In [None]:
# map each duplicated column to the first column whose encoding it matches
dup_cols = {}

for i, c1 in enumerate(tqdm_notebook(train_enc.columns)):
    for c2 in train_enc.columns[i + 1:]:
        # skip columns already recorded as duplicates of an earlier column
        if c2 not in dup_cols and np.all(train_enc[c1] == train_enc[c2]):
            # was `dup_col[c2]`, an undefined name that raised NameError
            dup_cols[c2] = c1

In [None]:
# show the mapping: duplicated column -> the earlier column it matches
dup_cols

In [None]:
# save the results because it takes a lot of time
# `cPickle` exists only on Python 2; on Python 3 the plain `pickle` module is the equivalent
import pickle

# pickle output is bytes, so the file must be opened in binary mode;
# the `with` block closes the handle once the dump has finished
with open('dup_cols.p', 'wb') as dup_cols_file:
    pickle.dump(dup_cols, dup_cols_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# drop the duplicated columns from trainset (was misspelled `trainste`, a NameError)
trainset.drop(list(dup_cols), axis=1, inplace=True)

## Determin Types

In [None]:
# examine the number of unique values
# how many unique values are there in each column? (NaNs are not counted)
nunique = train.nunique(dropna=True)
nunique

In [None]:
# histogram of the share of unique values per column (unique count / number of rows)
plt.figure(figsize=(14, 6))
unique_share = nunique.astype(float) / train.shape[0]
_ = plt.hist(unique_share, bins=100)

In [None]:
# find the columns with a huge number of unique values
# all integers -> probably counts
n_rows = train.shape[0]
mask = nunique.astype(float) / n_rows > 0.8
train.loc[:, mask]

In [None]:
# no float values, all integers -> count of something
# the 23rd row, with values like 9999 and -99999, looks like it encodes NaNs
# sequential column names -> some columns are grouped together
# (the first term used the undefined name `unique`; the series is called `nunique`)
mask = (nunique.astype(float) / train.shape[0] < 0.8) & (nunique.astype(float) / train.shape[0] > 0.4)
train.loc[:25, mask]

In [None]:
# frequency of each distinct value of VAR_0015
train['VAR_0015'].value_counts()

In [None]:
# split columns by dtype: object columns are categorical, everything else is numeric
cat_cols = list(train.select_dtypes(include=['object']).columns)
# `exclude` rather than `include`: with `include`, num_cols was an exact copy of cat_cols
num_cols = list(train.select_dtypes(exclude=['object']).columns)

## Go Through

In [None]:
# replace the literal string 'NaN' with -999 in train (in place)
# NOTE(review): the 'NaN' strings were filled into `trainset`, not `train` -- confirm this is not a no-op here
train.replace('NaN', -999, inplace = True)

In [None]:
# calculate how many times one feature is greater than the other
# select the first 42 numeric features
feats = num_cols[:42]

# build 'mean(feat1 > feat2)' plot
gt_matrix(feats, 16)

# each column appears to correspond to cumulative counts
# e.g. feature one is the count in the first month, feature two the total count over the first two months
# could build new features -> differences between consecutive values

In [None]:
# checking out var_0002, var_0003 features
hist_it(train['VAR_0002'])
# ylim takes (bottom, top); the previous 3-tuple (0, 0, .05) was a typo for (0, 0.05)
plt.ylim((0, 0.05))
plt.xlim((-10, 1010))

hist_it(train['VAR_0003'])
# same typo here: (0, 0, .03) -> (0, 0.03)
plt.ylim((0, 0.03))
plt.xlim((-10, 1010))

In [None]:
# frequency of each distinct value of VAR_0002
train['VAR_0002'].value_counts()

In [None]:
# frequency of each distinct value of VAR_0003
train['VAR_0003'].value_counts()

In [None]:
# VAR_0004: look at the remainder after dividing by 50
train['VAR_0004_mod50'] = train['VAR_0004'].mod(50)
hist_it(train['VAR_0004_mod50'])
plt.ylim((0, 0.6))

## Categorical Features

In [None]:
# features 200, 237, 274 look like geographical data -> make use of them
# (the stray extra comma in `train.loc[:, ,cat_cols]` was a SyntaxError)
train.loc[:, cat_cols].head().T

In [None]:
# datetime columns can be used too
date_cols = [
    'VAR_0073', 'VAR_0075',
    'VAR_0156', 'VAR_0157', 'VAR_0158', 'VAR_0159',
    'VAR_0166', 'VAR_0167', 'VAR_0168', 'VAR_0169',
    'VAR_0176', 'VAR_0177', 'VAR_0178', 'VAR_0179',
    'VAR_0204',
    'VAR_0217',
]

# parse each date column in train, then in test, with the same explicit format
for c in date_cols:
    for frame in (train, test):
        frame[c] = pd.to_datetime(frame[c], format='%d%b%y:%H:%M:%S')

In [None]:
c1 = 'VAR_0217'
c2 = 'VAR_0073'

# one date is strictly greater than the other
# so the difference can be a good feature
# horizontal line looks like NaN -> create a new binary feature
# which will serve as an indicator that our time feature is NaN
mask = (~train[c1].isnull()) & (~train[c2].isnull())
# NOTE(review): the original called `sc2`, which is not defined anywhere in the
# visible notebook; a plain matplotlib scatter coloured by target is assumed -- confirm
plt.scatter(train.loc[mask, c1].values, train.loc[mask, c2].values, c=train.loc[mask, 'target'].values)
plt.xlabel(c1)
plt.ylabel(c2)