## Build a model for vectorizing the raw data (applied once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the model is based on the categories seen in the train data)
* handle time-series variables: reduce each of them to a few summary values using several hard-coded methods
* fill missing values with train data means, and normalize to z-scores with train data std


In [2]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [3]:
# Load the raw long-format records (one row per SubjectID / feature_name / feature_value).
# Every column is read as text (dtype='unicode'); lines that fail to parse are
# skipped instead of raising (error_bad_lines=False).
df = pd.read_csv(
    '../all_data.csv',
    sep='|',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
The `ts_funcs_to_features` and `dummy_funcs_to_features` arrays hold pairs of functions (either a single function or a list of functions) and the features for which those functions should be calculated. Overlapping is allowed.

There is one such list for time-series functions (as described above) and one for dummy functions. Both are inverted into feature-to-funcs maps.

In [4]:
# Build `all_feature_metadata`: first from the time-series func->features list
# (tagged "ts"), then merged with the dummy func->features list (tagged "dummy").
# NOTE(review): `ts_funcs_to_features` and `dummy_funcs_to_features` are not defined in
# this notebook; presumably they come from `from vectorizing_funcs import *` -- confirm.
# NOTE(review): the first line rebinds its own input, so re-running this cell feeds the
# previous result back into the helper; verify the helper is idempotent.
ts_funcs_to_features = add_frequent_lab_tests_to_ts_features(df, ts_funcs_to_features)    
all_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
all_feature_metadata.update(invert_func_to_features(dummy_funcs_to_features, "dummy"))

## Learn to_dummies model
Which kind of categories do we have available in our train data?

In [5]:
# Learn the dummy-variable model from the train data and keep the result in the metadata.
# NOTE(review): the helper is opaque here; presumably it records, per categorical
# feature, the category values seen in `df` -- confirm in vectorizing_funcs.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [6]:

# Turn the long-format train records into one feature vector per SubjectID.
# `vectorize` returns the vectorized frame together with the metadata, which is rebound here.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
vectorized.head()


family_ALS_hist
onset_site
weight
Sodium
hands
pulse
White Blood Cell (WBC)
Albumin
Chloride
ALSFRS_Total
Glucose
Alkaline Phosphatase
bp_systolic
fvc_percent
Potassium
BMI
bp_diastolic
Blood Urea Nitrogen (BUN)
Phosphorus
CK
Gender
Age
respiratory_rate
Calcium
Race
mouth
Gamma-glutamyltransferase
Total Cholesterol
onset_delta
Bilirubin (Total)
Creatinine
temperature
respiratory
AST(SGOT)
Hemoglobin
Platelets
if_use_Riluzole
Protein
ALT(SGPT)


Unnamed: 0_level_0,family_ALS_hist_last,Bulbar,Limb,Limb and Bulbar,weight_pct_diff,weight_last,weight_mean_slope,weight_mean,Sodium_pct_diff,Sodium_last,...,Platelets_mean,if_use_Riluzole_last,Protein_pct_diff,Protein_last,Protein_mean_slope,Protein_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,1,0,0,-0.000108,50.5,0.004337,51.0,0.000231,146,...,160.5,,-0.000157,69.0,-0.010989,69.5,0.003663,24,0.065934,21.0
649,,1,0,0,0.000156,74.4,0.035425,74.566667,0.0,136,...,259.0,1.0,0.000549,73.0,0.058039,71.0,0.007212,25,-0.072383,17.0
1234,,1,0,0,0.000227,96.0,0.012698,95.05,-0.0001,142,...,211.25,1.0,-0.001099,72.0,-0.18153,74.5,0.000621,24,-0.100577,22.75
2492,,0,1,0,0.0,90.1,0.0,90.1,0.000213,139,...,,,,,,,0.00098,62,0.058824,61.0
2956,,0,1,0,-5.9e-05,65.3,-0.032251,65.1,0.00028,142,...,283.75,0.0,0.000732,75.0,0.096681,73.5,0.001558,28,-0.050505,25.25


## Clean outliers
As our data is really messy, we must remove outliers from it, or else our models fail.
We cannot clean before vectorizing, because even when all raw values are sane, the slopes and pct_diffs can still take extreme values. 
We use the robust median and MAD (median absolute deviation) for location and spread, because they are less likely to be affected by the outliers.

In [7]:
# Per-column statistics of the (not yet cleaned) vectorized train data.
# The MAD is the median of the absolute deviations from the column median.
# NOTE: `train_data_std` is assigned again in a later cell, after outlier cleaning.
train_data_std = vectorized.std()
train_data_medians = vectorized.median()
train_data_mads = vectorized.sub(train_data_medians).abs().median()


In [8]:
# Clean outliers from the vectorized train data, passing the train medians, MADs and std.
# At this point `train_data_std` is the std of the *uncleaned* vectorized data.
# NOTE(review): how clean_outliers uses each statistic is not visible here -- see vectorizing_funcs.
cleaned = clean_outliers(vectorized, all_feature_metadata, 
                         train_data_medians, train_data_mads, train_data_std, debug=True)


In [9]:
# Summary statistics per feature (one row each), largest standard deviation first.
(cleaned
 .describe()
 .T
 .sort("std", ascending=False))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
onset_delta_last,1973,-589.931069,288.864317,-1307.000000,-788.000000,-540.000000,-359.000000,-16.000000
CK_last,1510,247.196026,147.383385,18.000000,131.000000,215.000000,342.000000,631.000000
CK_mean,1519,237.615180,137.078750,17.666667,128.500000,209.500000,324.125000,590.500000
Platelets_last,1803,230.867443,46.716369,125.000000,198.000000,230.000000,266.000000,339.000000
Platelets_mean,1802,233.121263,45.019104,131.000000,200.270833,231.125000,265.458333,336.250000
Alkaline Phosphatase_mean,1763,72.877198,16.560573,35.000000,61.000000,72.000000,84.100000,112.000000
Alkaline Phosphatase_last,1761,72.737081,16.537788,35.000000,61.000000,72.000000,84.000000,111.000000
fvc_percent_last,1934,81.415659,15.177614,46.771130,70.713489,81.898630,92.267878,116.643357
Creatinine_last,2068,67.182578,15.104497,34.000000,55.000000,66.000000,79.560000,101.700000
Creatinine_mean,2090,68.558420,14.625058,35.360000,58.240000,68.000000,79.560000,101.660000


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std (also when transforming the `test` data)

In [10]:
# Fit the fill / scale statistics on the cleaned train data and apply them.
# NOTE: this rebinds `train_data_std`; from here on it is the std of the *cleaned*
# data, no longer the std of the raw vectorized data computed earlier.
train_data_means, train_data_std = cleaned.mean(), cleaned.std()
normalized, all_feature_metadata = normalize(
    cleaned, all_feature_metadata, train_data_means, train_data_std
)
# Show the 20 features with the largest normalized maximum.
(normalized
 .describe()
 .transpose()
 .sort("max", ascending=False)
 .head(20))


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Other,2205,-4.475067e-16,1.0,-0.06033,-0.06033,-0.06033,-0.06033,16.568055
Hispanic,2205,-4.744504e-16,1.0,-0.067481,-0.067481,-0.067481,-0.067481,14.812172
Limb and Bulbar,2205,-1.305708e-16,1.0,-0.085475,-0.085475,-0.085475,-0.085475,11.694035
Asian,2205,3.104219e-16,1.0,-0.085475,-0.085475,-0.085475,-0.085475,11.694035
Black,2205,1.767042e-16,1.0,-0.113384,-0.113384,-0.113384,-0.113384,8.815596
Unknown,2205,-6.223291000000001e-17,1.0,-0.137615,-0.137615,-0.137615,-0.137615,7.263371
respiratory_pct_diff,2205,4.173658e-16,0.991342,-5.840336,0.083677,0.083677,0.083677,6.227097
mouth_pct_diff,2205,-1.248686e-17,0.990655,-5.435709,0.0,0.168922,0.168922,5.698106
respiratory_mean_slope,2205,1.755965e-17,0.987674,-5.255244,0.118722,0.118722,0.118722,5.492689
hands_pct_diff,2205,3.121464e-16,0.98353,-4.927891,-0.316207,0.243369,0.243369,5.414629


## Pickle all metadata we will need to use later when applying vectorizer

In [11]:
# Persist everything needed to apply the fitted vectorizer later.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the bare `open(...)` handles were never closed).
# NOTE: `train_data_std` is whatever was assigned last, i.e. the std of the cleaned data.
_pickle_targets = (
    ('../all_feature_metadata.pickle', all_feature_metadata),
    ('../all_data_means.pickle', train_data_means),
    ('../all_data_std.pickle', train_data_std),
    ('../all_data_medians.pickle', train_data_medians),
    ('../all_data_mads.pickle', train_data_mads),
)
for _pickle_path, _pickle_obj in _pickle_targets:
    with open(_pickle_path, 'wb') as _pickle_file:
        pickle.dump(_pickle_obj, _pickle_file)


## Apply model on `train` and `test`


In [12]:

# Apply the fitted pipeline (vectorize -> clean outliers -> normalize) to each data set
# and write the result next to the input as '<name>_data_vectorized.csv'.
# The metadata returned by vectorize/normalize is discarded here, so the fitted
# `all_feature_metadata` binding is left untouched.
# NOTE(review): `train_data_std` now holds the std of the *cleaned* train data, whereas
# the earlier clean_outliers call on train received the std of the uncleaned data.
# Confirm which one clean_outliers is meant to get.
# NOTE: the loop rebinds `df`, `vectorized`, `cleaned` and `normalized`; after it
# finishes they refer to the last data set ("test").
for t in ["all", "test"]:
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
    vectorized, _ = vectorize(df, all_feature_metadata)
    cleaned = clean_outliers(vectorized, all_feature_metadata, train_data_medians, train_data_mads, train_data_std)
    normalized, _ = normalize(cleaned, all_feature_metadata, train_data_means, train_data_std)
    # Formatted print: same output as the Python 2 statement `print t, normalized.shape`,
    # but also valid syntax on Python 3.
    print("%s %s" % (t, normalized.shape))
    normalized.to_csv('../' + t + '_data_vectorized.csv' ,sep='|')

normalized.head()

all (2205, 140)
test (600, 140)


Unnamed: 0_level_0,family_ALS_hist_last,Bulbar,Limb,Limb and Bulbar,weight_pct_diff,weight_last,weight_mean_slope,weight_mean,Sodium_pct_diff,Sodium_last,...,Platelets_mean,if_use_Riluzole_last,Protein_pct_diff,Protein_last,Protein_mean_slope,Protein_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,-0.501303,0.512608,-0.085475,0.499238,0.0,0.064018,0.0,-0.95374,-2.118857,...,1.207904,-1.403994,1.076114,-0.867275,2.446519,-0.977041,2.058521,1.123163,0.870715,0.060424
750094,0,-0.501303,0.512608,-0.085475,0.0,-0.572601,1.631264,-0.679164,1.087614,0.0715,...,-0.358098,-1.403994,-1.284074,-0.323053,-1.611819,0.11939,0.238501,-0.24083,0.689877,-0.315502
750148,0,1.993896,-1.949922,-0.085475,0.220268,-1.235264,0.0,-1.437431,-1.746896,-1.680785,...,0.537818,0.711884,0.481678,0.49328,0.612127,0.02541,-1.828715,-1.150158,-2.472274,0.177309
750195,0,1.993896,-1.949922,-0.085475,-1.076816,1.595458,-1.129737,1.681382,0.078032,0.0715,...,-1.453929,-1.403994,0.0,0.49328,0.0,-0.392278,0.0,-0.422696,0.0,0.480577
750406,0,-0.501303,0.512608,-0.085475,0.0,0.81755,1.511481,0.62641,0.078032,0.0715,...,-0.435843,0.711884,0.0,1.309613,0.0,0.0,0.111243,-1.059226,-0.173197,-1.155807
