## Builds a model for vectorizing the raw data (apply it once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the model is based on the categories seen in the train data)
* handle time-series variables: reduce each of them to summary values using several hard-coded methods
* fill missing values with train data means, and normalize to z-scores with train data std


In [10]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [11]:
# Load the raw long-format table (one row per subject/feature measurement).
df = pd.read_csv(
    '../all_data.csv',
    sep='|',
    error_bad_lines=False,  # skip malformed rows instead of raising
    index_col=False,        # do not use the first column as the index
    dtype='unicode',        # read every column as text
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
The `funcs_to_features` arrays define pairs of funcs (either a single function or a list of functions) and the features for which those functions should be calculated. Overlapping is allowed.

There is a list for time-series functions (as described before) and for dummy functions. Both are inverted to feature_to_funcs maps.

In [12]:
# Extend the time-series func->features list using the raw data
# (presumably adds the lab tests that occur frequently in df -- confirm in vectorizing_funcs).
ts_funcs_to_features = add_frequent_lab_tests_to_ts_features(df, ts_funcs_to_features)

# Invert both func->features lists into feature-keyed maps and merge them
# into one metadata dict; on overlapping keys the "dummy" entries win,
# because they are applied last via dict.update.
all_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
dummy_feature_metadata = invert_func_to_features(dummy_funcs_to_features, "dummy")
all_feature_metadata.update(dummy_feature_metadata)

## Learn to_dummies model
Which kind of categories do we have available in our train data?

In [13]:
# Learn the categorical -> dummy-variable mapping from the train data and
# store it back into the feature metadata (which is rebound to the result).
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [14]:

# Pivot the raw train rows into one feature vector per subject; the call also
# returns the (possibly updated) feature metadata.
# NOTE(review): debug=True presumably prints each feature name as it is processed -- confirm.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
vectorized.head()


family_ALS_hist
onset_site
weight
Sodium
hands
pulse
White Blood Cell (WBC)
Albumin
Chloride
ALSFRS_Total
Glucose
Alkaline Phosphatase
bp_systolic
fvc_percent
Potassium
BMI
bp_diastolic
Blood Urea Nitrogen (BUN)
Phosphorus
CK
Gender
Age
respiratory_rate
Calcium
Race
mouth
Gamma-glutamyltransferase
Total Cholesterol
onset_delta
Bilirubin (Total)
Creatinine
temperature
respiratory
AST(SGOT)
Hemoglobin
Platelets
if_use_Riluzole
Protein
ALT(SGPT)


Unnamed: 0_level_0,family_ALS_hist_last,Bulbar,Limb,Limb and Bulbar,weight_last,weight_mean_slope,weight_mean,weight_pct_diff,Sodium_last,Sodium_mean_slope,...,Platelets_pct_diff,if_use_Riluzole_last,Protein_last,Protein_mean_slope,Protein_mean,Protein_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,1,0,0,50.5,0.004337,51.0,-0.000108,146,0.032967,...,-0.001105,,69.0,-0.010989,69.5,-0.000157,24,0.065934,21.0,0.003663
649,,1,0,0,74.4,0.035425,74.566667,0.000156,136,0.092037,...,-0.00251,1.0,73.0,0.058039,71.0,0.000549,25,-0.072383,17.0,0.007212
1234,,1,0,0,96.0,0.012698,95.05,0.000227,142,-0.095671,...,-0.000532,1.0,72.0,-0.18153,74.5,-0.001099,24,-0.100577,22.75,0.000621
2492,,0,1,0,90.1,0.0,90.1,0.0,139,0.029412,...,,,,,,,62,0.058824,61.0,0.00098
2956,,0,1,0,65.3,-0.032251,65.1,-5.9e-05,142,0.012987,...,0.000281,0.0,75.0,0.096681,73.5,0.000732,28,-0.050505,25.25,0.001558


In [15]:
# Per-column summary statistics (one row per vectorized feature), ordered by
# non-null count so the sparsest features are listed first.
# NOTE(review): DataFrame.sort is the legacy pandas API (removed in pandas 0.20);
# switch to sort_values(by=...) when upgrading pandas.
feature_stats = vectorized.describe().T
feature_stats.sort("count", ascending=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
family_ALS_hist_last,346,0.208092,0.406531,0.000000,0.000000,0.000000,0.000000,1.000000
temperature_pct_diff,454,0.000047,0.001412,-0.001936,-0.000190,0.000000,0.000101,0.026853
temperature_mean_slope,454,0.000713,0.056535,-0.200714,-0.013833,-0.002532,0.007399,0.909242
temperature_mean,457,36.563308,0.755195,31.700000,36.333333,36.580000,36.800000,49.240000
temperature_last,457,36.654184,2.948809,34.500000,36.200000,36.600000,36.900000,98.600000
BMI_last,1280,0.002577,0.000436,0.001528,0.002267,0.002518,0.002826,0.004731
CK_pct_diff,1423,0.005245,0.012222,-0.039435,-0.000822,0.002551,0.007517,0.171150
CK_mean_slope,1423,1.293199,10.441481,-66.363194,-0.492879,0.577354,2.311193,249.280809
Gamma-glutamyltransferase_mean_slope,1539,0.017835,0.942537,-14.450537,-0.071429,0.030719,0.167995,12.385714
Gamma-glutamyltransferase_pct_diff,1539,0.001713,0.007553,-0.026243,-0.001616,0.000442,0.003206,0.155012


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [16]:
# Column-wise mean and standard deviation of the vectorized train data.
# These are kept in their own variables because they are reused later when
# normalizing other datasets and are pickled alongside the metadata.
train_data_means = vectorized.mean()
train_data_std = vectorized.std()

# Fill missing values and normalize using the train statistics above.
normalized, all_feature_metadata = normalize(
    vectorized,
    all_feature_metadata,
    train_data_means,
    train_data_std,
)

# Summary of the normalized features, lowest standard deviation first.
normalized_stats = normalized.describe().transpose()
normalized_stats.sort("std", ascending=True)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
family_ALS_hist_last,2205,-9.566548e-18,0.395643,-0.511873,0.000000,0.000000,0.000000,1.947962
temperature_mean_slope,2205,5.538528e-19,0.453360,-3.562848,0.000000,0.000000,0.000000,16.070094
temperature_pct_diff,2205,9.616898e-18,0.453360,-1.403764,0.000000,0.000000,0.000000,18.980697
temperature_last,2205,-7.835506e-16,0.454859,-0.730527,0.000000,0.000000,0.000000,21.007063
temperature_mean,2205,4.660721e-15,0.454859,-6.439803,0.000000,0.000000,0.000000,16.785980
BMI_last,2205,5.547087e-16,0.761780,-2.405262,-0.276883,0.000000,0.057740,4.940542
CK_mean_slope,2205,7.112524e-17,0.803238,-6.479578,-0.110630,0.000000,0.000000,23.750234
CK_pct_diff,2205,7.411557e-17,0.803238,-3.655582,-0.345987,0.000000,0.000000,13.574082
Gamma-glutamyltransferase_mean_slope,2205,4.330122e-18,0.835357,-15.350453,-0.037447,0.000000,0.075386,13.121901
Gamma-glutamyltransferase_pct_diff,2205,1.346869e-16,0.835357,-3.701246,-0.331812,0.000000,0.013115,20.296371


## Pickle all metadata we will need to use later when applying vectorizer

In [17]:
# Persist everything needed to re-apply the vectorizer later: the feature
# metadata plus the train-data means and stds used for filling/normalizing.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open('../all_feature_metadata.pickle', 'wb') as f:
    pickle.dump(all_feature_metadata, f)
with open('../all_data_means.pickle', 'wb') as f:
    pickle.dump(train_data_means, f)
with open('../all_data_std.pickle', 'wb') as f:
    pickle.dump(train_data_std, f)


## Apply the model to the `train` and `test` data


In [18]:

for t in ["all", "test"]:
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    print t, normalized.shape
    normalized.to_csv('../' + t + '_data_vectorized.csv' ,sep='|')

normalized.head()

all (2205, 140)
test (600, 140)


Unnamed: 0_level_0,family_ALS_hist_last,Bulbar,Limb,Limb and Bulbar,weight_last,weight_mean_slope,weight_mean,weight_pct_diff,Sodium_last,Sodium_mean_slope,...,Platelets_pct_diff,if_use_Riluzole_last,Protein_last,Protein_mean_slope,Protein_mean,Protein_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,-0.501303,0.512608,-0.085475,2.136302,0.015749,2.061448,0.267218,-1.797715,0.601348,...,-0.459575,-1.403994,-0.776848,0.589301,-0.901735,0.407644,0.225999,0.1793,-0.18317,0.648286
750094,0,-0.501303,0.512608,-0.085475,-0.577366,0.762399,-0.680211,1.355411,0.081373,0.396271,...,-0.443557,-1.403994,-0.308146,-0.325235,0.057526,-0.594551,-0.307925,0.148785,-0.380253,-0.073179
750148,0,1.993896,-1.949922,-0.085475,-1.161351,-1.722087,-1.340747,0.107964,-1.421897,0.133763,...,-0.013086,0.711884,0.394909,0.175925,-0.024696,0.155231,-0.663874,-0.384791,-0.121892,-0.892634
750195,0,1.993896,-1.949922,-0.085475,1.333279,-0.552968,1.376093,-0.632499,0.081373,0.139004,...,2.080883,-1.403994,0.394909,-0.999048,-0.390129,1.34332,-0.379115,2.170999,0.0371,-1.523329
750406,0,-0.501303,0.512608,-0.085475,0.647732,0.705334,0.457092,2.105849,0.081373,0.258216,...,-0.171343,0.711884,1.097963,-1.115379,2.039998,-1.366227,-0.628279,0.003152,-0.820793,-0.123625
