In [20]:
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

# Custom modules
import const
import func

In [21]:
# Display floats with 3 decimals. Use the fully qualified option key: the short
# pattern 'precision' can match more than one option on newer pandas releases.
pd.set_option('display.precision', 3)

## Load data

In [22]:
# Read the feature look-up table and preview its first three rows
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.iloc[:3]

Unnamed: 0,line,station,feature_nr,feat_nr_dat,name_dat,name_cat,name_num,col_dat,col_num,col_cat,station_V2,line_V2
0,0,0,0,1.0,L0_S0_D1,,L0_S0_F0,0.0,0.0,,0.0,1.0
1,0,0,2,3.0,L0_S0_D3,,L0_S0_F2,1.0,1.0,,0.0,1.0
2,0,0,4,5.0,L0_S0_D5,,L0_S0_F4,2.0,2.0,,0.0,1.0


In [23]:
# Function-call form works on Python 2 and 3 and matches the other print calls
print(const.TRAIN_FILES)

['train_numeric', 'train_categorical_to_num', 'train_date']


In [24]:
# Numeric training file: keep the feature matrix, the response and the ids,
# then drop the container returned by the loader.
num_payload = func.load_data_file(const.TRAIN_FILES[0])['data']
X_num, y, ids = num_payload['features'], num_payload['y'], num_payload['ids']

del num_payload

Returning <open file '/Volumes/My Book/kaggle_bosch/train_numeric.pkl', mode 'rb' at 0x11b08ee40>.pkl


In [25]:
# Categorical training file: only the feature matrix is kept
X_cat = func.load_data_file(const.TRAIN_FILES[1])['data']['features']

Returning <open file '/Volumes/My Book/kaggle_bosch/train_categorical_to_num.pkl', mode 'rb' at 0x11b08ee40>.pkl


In [26]:
# Date training file: only the feature matrix is kept
X_dat = func.load_data_file(const.TRAIN_FILES[2])['data']['features']

Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x11b08ee40>.pkl


In [27]:
def _read_station_table(filename):
    '''Read a station feature CSV from const.DATA_PATH, indexed by its ID column.'''
    return pd.read_csv(os.path.join(const.DATA_PATH, filename), index_col='ID')

# Source and destination station tables
source_stations = _read_station_table('feat_set_source_station.csv')
destination_stations = _read_station_table('feat_set_destination_station.csv')

In [7]:
# Preview the first row of the source-station table (columns are station labels)
source_stations.head(1)

Unnamed: 0_level_0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,-1.0,0.0,1.0,,2.0,,,4.0,7.0,,...,,,,,,,,,,


In [13]:
# Store one-hot encoded source and destination stations for a handful of stations.
# One CSV per station and direction is written to the current working directory.
for s in [30, 32, 34, 35, 36]:
    print('Storing {:d}'.format(s))
    col = '{:d}.0'.format(s)  # station columns are labelled like '30.0'

    # `index_label` is the to_csv argument that names the index column;
    # `index_col` only exists on read_csv and is rejected by current pandas.
    (pd.get_dummies(source_stations[col], prefix='S{:d}_source'.format(s))
       .astype(int)
       .to_csv('feat_set_source_station_S{:d}.csv'.format(s), index_label='ID'))
    (pd.get_dummies(destination_stations[col], prefix='S{:d}_dest'.format(s))
       .astype(int)
       .to_csv('feat_set_destination_station_S{:d}.csv'.format(s), index_label='ID'))

Storing 30
Storing 32
Storing 34
Storing 35
Storing 36


In [19]:
# Store one-hot encoded source-destination combinations for selected stations.
# Combinations seen fewer than MIN_COMB_COUNT times are pooled into the label '1'.
MIN_COMB_COUNT = 740

for s in [24.311, 27.0, 29.0, 30.0, 32.0, 34.0, 35.0, 36.0, 37.0]:
    print('Storing {}'.format(s))
    col = '{}'.format(s)

    # Missing values are stringified too, so they end up in a 'nan-nan' category
    tmp = source_stations[col].astype(str) + '-' + destination_stations[col].astype(str)
    val_cnt = tmp.value_counts()
    tmp[tmp.isin(val_cnt[val_cnt < MIN_COMB_COUNT].index)] = '1'
    print(tmp.value_counts())

    # `index_label` is the to_csv argument that names the index column;
    # `index_col` only exists on read_csv and is rejected by current pandas.
    (pd.get_dummies(tmp, prefix='S{}_comb'.format(s))
       .astype(int)
       .to_csv('feat_set_comb_station_S{}.csv'.format(s), index_label='ID'))

Storing 24.311
nan-nan        2102717
24.31-26.0       93758
24.309-26.0      93520
24.31-27.0       25160
24.309-27.0      25103
24.309-29.0      10814
24.31-29.0       10749
24.309-28.0       2010
24.31-28.0        2007
1                 1657
Name: 24.311, dtype: int64
Storing 27.0
nan-nan        2127210
24.311-29.0      49395
25.11-29.0       30688
11.0-29.0        26962
9.0-29.0         26935
10.0-29.0        26804
24.111-29.0      25974
24.211-29.0      10411
25.23-29.0        9439
21.0-29.0         6885
22.0-29.0         6883
23.0-29.0         6818
1                 4942
25.22-29.0        2304
25.21-29.0        2182
24.211-39.0       1862
-1.0-29.0         1046
9.0-39.0           755
Name: 27.0, dtype: int64
Storing 29.0
26.0-30.0      441534
11.0-30.0      363976
9.0-30.0       363498
10.0-30.0      361767
27.0-30.0      233009
21.0-30.0      137115
22.0-30.0      135693
23.0-30.0      135496
nan-nan        128429
24.311-30.0     21537
28.0-30.0       18483
25.11-30.0      14100

## Commonly used functions

In [8]:
def most(x):
    ''' Returns most common element of series x.

    Returns None when x has no countable values (empty or all missing).
    '''
    # Count once instead of calling value_counts() for the check and the result
    counts = x.value_counts()
    return counts.idxmax() if counts.shape[0] > 0 else None

def least(x):
    ''' Returns least common element of series x.

    Returns None when x has no countable values (empty or all missing).
    '''
    # Count once instead of calling value_counts() for the check and the result
    counts = x.value_counts()
    return counts.idxmin() if counts.shape[0] > 0 else None

def nth_most_common(x, n):
    ''' Returns nth most common element of series x.

    n is zero-based: n=0 is the most common value, n=1 the runner-up, and so on.
    Returns None when x has n or fewer distinct countable values.
    '''
    # Count once instead of calling value_counts() for the check and the result
    counts = x.value_counts()
    return counts.index[n] if counts.shape[0] > n else None

def err_rate(x):
    ''' Mean of x expressed as a percentage (mean times 100)'''
    fraction = x.mean()
    return fraction * 100

## Analyze error rate per source/destination path

In [9]:
# Print, for every station column, the error statistics of each
# (source, destination) combination observed in the training ids.
for station in source_stations.columns:
    s = source_stations.loc[ids.Id, str(station)]
    d = destination_stations.loc[ids.Id, str(station)]

    # One row per part: source value, destination value and response
    df = pd.DataFrame({'s': s, 'd': d, 'r': y.Response})

    # Per (s, d) path: err_rate (mean response * 100), row count and response sum
    df_agg = df.groupby(['s', 'd']).agg([err_rate, 'count', 'sum'])
    df_agg.columns = df_agg.columns.droplevel()
    df_agg = df_agg.reset_index()

    # Skip stations for which no path was observed
    if df_agg.shape[0] > 0:
        print('Analyzing station {}'.format(station))
        print(df_agg)
        print('')

Analyzing station 0.0
     s    d  err_rate   count   sum
0 -1.0  1.0     0.535  673063  3604
1 -1.0  2.0     0.941     425     4
2 -1.0  3.0     0.000     372     0
3 -1.0  4.0     0.000       2     0

Analyzing station 1.0
     s     d  err_rate   count   sum
0 -1.0   2.0     0.245     408     1
1 -1.0   3.0     0.232     431     1
2 -1.0   4.0     0.000       2     0
3  0.0   2.0     0.535  338937  1812
4  0.0   3.0     0.536  333623  1787
5  0.0   4.0     0.862     232     2
6  0.0   5.0     1.128     266     3
7  0.0  29.0     0.000       5     0

Analyzing station 2.0
      s     d  err_rate   count  sum
0  -1.0   4.0     0.000       3    0
1  -1.0   5.0     0.000       1    0
2   0.0   4.0     1.015     197    2
3   0.0   5.0     0.877     228    2
4   1.0   3.0     1.423     281    4
5   1.0   4.0     0.536  168601  904
6   1.0   5.0     0.531  170446  905
7   1.0   6.0     0.000       2    0
8   1.0   7.0     0.000       3    0
9   1.0   8.0     0.000       1    0
10  1.0   9.

In [14]:
# Filter and group paths with low count.
# Every (source, destination) path of a station gets a label `val`:
#   fewer than MIN_FAILURES failures                -> '1'
#   enough failures, err rate <  LOW_ERR_RATE       -> '2'
#   enough failures, err rate <  HIGH_ERR_RATE      -> '3'
#   enough failures, err rate >= HIGH_ERR_RATE      -> keeps its own 's-d' label
MIN_FAILURES = 10
LOW_ERR_RATE = 0.45   # in percent, same unit as err_rate()
HIGH_ERR_RATE = 0.9   # in percent, same unit as err_rate()

for station in source_stations.columns:
    s = source_stations.loc[ids.Id, str(station)]
    d = destination_stations.loc[ids.Id, str(station)]

    # One row per part: source value, destination value and response
    df = pd.DataFrame({'s': s, 'd': d, 'r': y.Response}, index=ids.Id)

    # Per (s, d) path: err_rate (mean response * 100), row count and response sum
    df_agg = df.groupby(['s', 'd']).agg([err_rate, 'count', 'sum'])
    df_agg.columns = df_agg.columns.droplevel()
    df_agg = df_agg.reset_index()

    # Assignment order matters: the '3' rule also covers the low-rate rows,
    # which the '2' rule then overwrites.
    df_agg['val'] = df_agg['s'].astype(str) + '-' + df_agg['d'].astype(str)
    df_agg.loc[df_agg['sum'] < MIN_FAILURES, 'val'] = '1'
    df_agg.loc[(df_agg['sum'] >= MIN_FAILURES) & (df_agg['err_rate'] < HIGH_ERR_RATE), 'val'] = '3'
    df_agg.loc[(df_agg['sum'] >= MIN_FAILURES) & (df_agg['err_rate'] < LOW_ERR_RATE), 'val'] = '2'

    # NOTE(review): 'err_rate' here is the unweighted mean of the per-path rates,
    # not the pooled rate (100 * sum / count) of the bucket - confirm this is intended.
    df_agg_fin = df_agg.groupby(['val']).agg({'err_rate': 'mean', 'sum': 'sum', 'count': 'sum'})

    # Only report stations that end up with more than two buckets
    if df_agg_fin.shape[0] > 2:
        print('Analyzing station {}'.format(station))
        print(df_agg_fin)
        print('')

        # TODO: persist the bucket label per part as a feature (not implemented yet)

Analyzing station 9.0
      count   sum  err_rate
val                        
1      2337    12     1.934
2     11304    45     0.398
3    212038  1139     0.711

Analyzing station 10.0
      count   sum  err_rate
val                        
1      3636    21     3.434
2     11169    45     0.403
3    209735  1157     0.703

Analyzing station 21.0
     count  sum  err_rate
val                      
1      808    4     0.258
2     5498   23     0.418
3    75103  434     0.746

Analyzing station 22.0
           count  sum  err_rate
val                            
1            799    1     0.035
2           5367   24     0.447
20.0-27.0   3549   38     1.071
3          70886  365     0.512

Analyzing station 23.0
           count  sum  err_rate
val                            
1            860    5     1.504
20.0-27.0   3440   36     1.047
3          75990  393     0.656

Analyzing station 24.111
             count  sum  err_rate
val                              
1              662   10   

In [29]:
# Inspect the bucket labels ('val') of the df_agg left over from the bucketing
# loop - presumably its last iteration; depends on kernel state.
df_agg[['val']].head(20)

Unnamed: 0,val
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [27]:
# Inspect the leftover per-part frame `df`, keyed by (source, destination);
# which loop iteration produced `df` depends on kernel state.
df.set_index(['s','d']).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,r
s,d,Unnamed: 2_level_1
,,0
,,0
,,0
,,0
,,0
,,0
,,0
,,0
,,0
,,0


In [53]:
# Per (source, destination) path of the leftover `df`:
# err_rate (mean response * 100), row count and response sum
df.groupby(['s','d']).agg([err_rate,'count','sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,r,r,r
Unnamed: 0_level_1,Unnamed: 1_level_1,err_rate,count,sum
s,d,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
48.0,-1.0,0.0,4,0
49.0,-1.0,0.488,29512,144
50.0,-1.0,0.521,30337,158


In [56]:
# Aggregate the leftover `df` per (s, d) path and flatten the result
df_agg = df.groupby(['s', 'd']).agg([err_rate, 'count', 'sum'])
df_agg.columns = df_agg.columns.droplevel()
df_agg = df_agg.reset_index()

# Overwrite both s and d with a bucket code. The order matters: code 3 is
# applied before code 2, so low-rate rows end up as 2.
has_enough = df_agg['sum'] >= 10
for bucket, rows in ((1, df_agg['sum'] < 10),
                     (3, has_enough & (df_agg['err_rate'] < 0.9)),
                     (2, has_enough & (df_agg['err_rate'] < 0.45))):
    df_agg.loc[rows, ['s', 'd']] = bucket

In [59]:
# Collapse rows that share the same (s, d) values: mean of err_rate (unweighted),
# summed response sum and summed row count.
df_agg.groupby(['s','d']).agg({'err_rate':'mean','sum':'sum','count':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum,err_rate
s,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,1.0,4,0,0.0
3.0,3.0,59849,302,0.504


In [57]:
# Show the first rows of the current df_agg (depends on kernel state)
df_agg.head()

Unnamed: 0,s,d,err_rate,count,sum
0,1.0,1.0,0.0,4,0
1,3.0,3.0,0.488,29512,144
2,3.0,3.0,0.521,30337,158


## Analyze numeric values per station

In [51]:
station = 28.

# Column positions (in X_num) of the numeric features listed for this station in the look-up table
cols_num = [int(x) for x in lut[lut['station_V2']==station].col_num if not np.isnan(x)]

print('Analyzing station {} ({} features)'.format(station, len(cols_num)))
print('')

# Source/destination values depend only on the station, so look them up once
# instead of once per feature.
s = source_stations.loc[ids.Id, str(station)]
d = destination_stations.loc[ids.Id, str(station)]

for col_num in cols_num:

    print('Feature num: {}'.format(col_num))

    # Dense 1-d copy of this feature column; zeros are treated as missing
    f_num = X_num[:, col_num].todense().A1
    f_num[f_num==0] = np.nan

    # One row per part: feature value, source, destination and response
    df = pd.DataFrame({'num': f_num,'s':s, 'd':d, 'r':y.Response})

    # Feature distribution and error statistics per (source, destination) path
    print(df.groupby(['s','d']).agg({'num':['mean','std','min','max', 'skew'], 'r': [err_rate,'count','sum']}))
    print('')

Analyzing station 28.0 (14 features)

Feature num: 709
                    r                  num                            
             err_rate count sum       mean    std    min    max   skew
s       d                                                             
-1.000  29.0    0.000   150   0 -1.361e-01  0.028 -0.212 -0.064 -0.087
 9.000  29.0    0.907  1103  10  1.691e-02  0.216 -0.405  0.521 -0.346
        39.0    0.000     9   0  2.336e-01  0.017  0.200  0.256 -0.796
 10.000 29.0    0.369  1084   4  1.495e-02  0.222 -0.399  0.517 -0.283
        39.0    0.000    20   0  2.490e-01  0.022  0.199  0.291 -0.080
 11.000 29.0    0.547  1096   6 -9.954e-04  0.224 -0.399  0.504 -0.250
        39.0    0.000    19   0  2.422e-01  0.022  0.209  0.294  0.600
 21.000 29.0    0.664   301   2 -4.219e-03  0.201 -0.378  0.461  0.112
        39.0    0.000     4   0  2.592e-01  0.035  0.215  0.296 -0.534
 22.000 29.0    0.000   276   0 -9.384e-03  0.195 -0.374  0.481  0.263
        39.0    0.000 

## Analyze categorical values per station

In [31]:
for station in lut['station_V2'].unique():

    # Column positions (in X_cat) of the categorical features listed for this station
    cols_cat = [int(x) for x in lut[lut['station_V2']==station]['col_cat'] if not np.isnan(x)]

    print('Analyzing station {} ({} features)'.format(station, len(cols_cat)))
    print('')

    # Nothing to analyze; also avoids looking up station columns that may not exist
    if not cols_cat:
        continue

    # Source/destination values depend only on the station, so look them up once
    s = source_stations.loc[ids.Id, str(station)]
    d = destination_stations.loc[ids.Id, str(station)]

    for col_cat in cols_cat:

        # Print the feature name itself rather than the repr of a one-row Series
        feat_name = lut.loc[lut['col_cat'] == col_cat, 'name_cat'].iloc[0]
        print('Feature num: {} ({})'.format(col_cat, feat_name))

        # Dense 1-d copy of this feature column; zeros are treated as missing
        f_cat = X_cat[:, col_cat].todense().A1
        f_cat[f_cat==0] = np.nan

        # One row per part: feature value, source, destination and response
        df = pd.DataFrame({'cat': f_cat,'s':s, 'd':d, 'r':y.Response})
        df_agg = df.groupby(['s','d']).agg({'cat':['mean','std','min','max', most, least], 'r': [err_rate,'count','sum']})

        # Only show paths with more than 500 parts
        print(df_agg[df_agg['r']['count']>500])
        print('')

Analyzing station 0.0 (0 features)

Analyzing station 1.0 (4 features)

Feature num: 13    L0_S1_F25
Name: name_cat, dtype: object
               r                cat                          
        err_rate   count   sum mean  std  min  max most least
s   d                                                        
0.0 2.0    0.535  338937  1812  1.0  0.0  1.0  1.0  1.0   1.0
    3.0    0.536  333623  1787  1.0  0.0  1.0  1.0  1.0   1.0

Feature num: 14    L0_S1_F27
Name: name_cat, dtype: object
               r                cat                          
        err_rate   count   sum mean  std  min  max most least
s   d                                                        
0.0 2.0    0.535  338937  1812  9.0  0.0  9.0  9.0  9.0   9.0
    3.0    0.536  333623  1787  9.0  0.0  9.0  9.0  9.0   9.0

Feature num: 16    L0_S1_F29
Name: name_cat, dtype: object
               r                cat                          
        err_rate   count   sum mean  std  min  max most least
s   d

In [None]:
# Interesting: L2_S26_F3099, L2_S27_F3192, L2_S28_F3285, L3_S30_F3498, L3_S30_F3500, L3_S30_F3503, 
# L3_S30_F3505, L3_S30_F3508, L3_S30_F3510, L3_S30_F3513, L3_S30_F3518, L3_S30_F3523, L3_S32_F3854, L3_S36_F3939

## Analyze date feature per station

In [33]:
station = 24.311

# Column positions (in X_dat) of the date features listed for this station
station_rows = lut[lut['station_V2'] == station]
cols_dat = [int(v) for v in station_rows['col_dat'].dropna()]

print('Analyzing station {} ({} features)'.format(station, len(cols_dat)))
print('')

# For dates all features of a station are the same, so only the first one is analyzed
for col_dat in cols_dat[:1]:

    print('Feature num: {}'.format(col_dat))

    # Dense 1-d copy of this date column; zeros are treated as missing
    f_dat = X_dat[:, col_dat].todense().A1
    f_dat[f_dat == 0] = np.nan

    s = source_stations.loc[ids.Id, str(station)]
    d = destination_stations.loc[ids.Id, str(station)]

    # One row per part: date value, source, destination and response
    df = pd.DataFrame({'dat': f_dat, 's': s, 'd': d, 'r': y.Response})

    path_stats = df.groupby(['s', 'd']).agg({'dat': ['mean', 'std', 'min', 'max', 'skew'],
                                             'r': [err_rate, 'count', 'sum']})
    print(path_stats)
    print('')

Analyzing station 24.311 (14 features)

Feature num: 458
                     dat                                          r         \
                    mean      std      min      max   skew err_rate  count   
s      d                                                                     
9.000   29.000  1651.030    0.000  1651.03  1651.03    NaN    0.000      2   
10.000  26.000  1457.040      NaN  1457.04  1457.04    NaN    0.000      1   
        29.000  1651.032    0.005  1651.03  1651.04  2.073    0.000      4   
11.000  29.000  1651.010      NaN  1651.01  1651.01    NaN    0.000      1   
24.304  26.000   751.680      NaN   751.68   751.68    NaN    0.000      1   
24.309 -1.000   1161.665  481.137   821.45  1501.88    NaN   50.000      2   
        25.100   642.421  322.840   339.64  1055.69  0.363    8.333     24   
        25.211  1158.840      NaN  1158.84  1158.84    NaN  100.000      1   
        25.212   532.412  349.860   375.95  1158.26  2.236    0.000      5   
       