## Build a model for vectorizing the raw data (apply it once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the set of categories is learned from the train data)
* handle time-series variables: reduce each series to a few summary values using several hard-coded methods (e.g. mean, last value, percent difference, mean slope)
* fill missing values with train data means, and normalize to z-scores with train data std


In [14]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [15]:
# Read the raw pipe-delimited table. Every column is loaded as text
# (dtype='unicode'), no column is used as the index (index_col=False), and
# malformed rows are skipped instead of raising (error_bad_lines=False).
df = pd.read_csv(
    '../all_data.csv',
    sep='|',
    dtype='unicode',
    index_col=False,
    error_bad_lines=False
)
# Preview the first rows to sanity-check the layout.
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
The `funcs_to_features` arrays define pairs of funcs (either a single function or a list of functions) and the features those functions should be calculated for. Overlapping is allowed.

There is a list for time-series functions (as described before) and for dummy functions. Both are inverted to feature_to_funcs maps.

In [16]:
# Extend the time-series (funcs, features) pairs based on the loaded data.
# NOTE(review): this rebinds the name imported from vectorizing_funcs, so
# re-running the cell feeds the already-extended value back into the helper
# -- confirm the helper tolerates that.
ts_funcs_to_features = add_frequent_lab_tests_to_ts_features(df, ts_funcs_to_features)

# Invert both funcs->features lists into feature->funcs metadata, tagged
# "ts" and "dummy" respectively, then merge the dummy entries into the
# time-series ones.
ts_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
dummy_feature_metadata = invert_func_to_features(dummy_funcs_to_features, "dummy")
all_feature_metadata = ts_feature_metadata
all_feature_metadata.update(dummy_feature_metadata)

## Learn to_dummies model
Which kind of categories do we have available in our train data?

In [17]:
# Pass the raw data and the current feature metadata through
# learn_to_dummies_model and keep the metadata it returns.
# Presumably this records, per categorical feature, the categories seen in
# df -- confirm in vectorizing_funcs.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [18]:

# Turn the raw feature_name/feature_value rows into a feature table.
# vectorize returns that table together with the metadata, which replaces
# the previous all_feature_metadata binding.
# debug=True presumably enables the per-feature progress output -- confirm
# in vectorizing_funcs.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
# Preview the first rows of the vectorized table.
vectorized.head()


family_ALS_hist
Hematocrit
weight
Sodium
hands
pulse
White Blood Cell (WBC)
Albumin
Chloride
ALSFRS_Total
Glucose
Alkaline Phosphatase
bp_systolic
Bicarbonate
Potassium
BMI
bp_diastolic
Red Blood Cells (RBC)
Blood Urea Nitrogen (BUN)
Phosphorus
CK
fvc_percent
Gender
Age
respiratory_rate
Calcium
Urine Ph
mouth
Gamma-glutamyltransferase
Total Cholesterol
onset_delta
Bilirubin (Total)
Creatinine
temperature
respiratory
AST(SGOT)
Hemoglobin
Platelets
Race
if_use_Riluzole
Absolute Basophil Count
Protein
ALT(SGPT)


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_mean,Hematocrit_pct_diff,Hematocrit_last,Hematocrit_mean_slope,weight_mean,weight_pct_diff,weight_last,weight_mean_slope,Sodium_mean,...,Absolute Basophil Count_last,Absolute Basophil Count_mean_slope,Protein_mean,Protein_pct_diff,Protein_last,Protein_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,42.15,0.000783,43.6,0.031868,51.0,-0.000108,50.5,0.004337,144.5,...,,,69.5,-0.000157,69.0,-0.010989,21.0,0.003663,24,0.065934
649,,39.25,0.000599,40.3,0.032502,74.566667,0.000156,74.4,0.035425,137.5,...,0.02,-0.002105,71.0,0.000549,73.0,0.058039,17.0,0.007212,25,-0.072383
1234,,46.925,-0.00096,44.4,-0.033636,95.05,0.000227,96.0,0.012698,142.0,...,0.06,-0.000398,74.5,-0.001099,72.0,-0.18153,22.75,0.000621,24,-0.100577
2492,,42.35,-0.000889,41.7,-0.038235,90.1,0.0,90.1,0.0,138.5,...,,,,,,,61.0,0.00098,62,0.058824
2956,,43.3,-0.000299,42.5,0.005195,65.1,-5.9e-05,65.3,-0.032251,139.75,...,0.04,-0.000996,73.5,0.000732,75.0,0.096681,25.25,0.001558,28,-0.050505


In [19]:
# Summary statistics with one row per feature column, ordered so the
# sparsest columns (lowest non-null count) come first.
# NOTE(review): DataFrame.sort is the legacy pandas API; later pandas
# releases replace it with sort_values -- confirm the pinned pandas version
# before upgrading.
raw_stats = vectorized.describe().T
raw_stats.sort("count", ascending=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
family_ALS_hist_last,375,0.210667,0.408327,0.000000,0.000000,0.000000,0.000000,1.000000
temperature_pct_diff,549,0.000038,0.001288,-0.001936,-0.000180,0.000000,0.000101,0.026853
temperature_mean_slope,549,-0.000431,0.052765,-0.200714,-0.014698,-0.002313,0.008401,0.909242
temperature_mean,552,36.562313,0.705195,31.700000,36.333333,36.600000,36.800000,49.240000
temperature_last,552,36.639847,2.690229,34.500000,36.200000,36.600000,36.900000,98.600000
Absolute Basophil Count_pct_diff,1223,0.003549,0.022276,-0.035714,-0.007442,-0.000343,0.006757,0.250000
BMI_last,1363,0.002577,0.000435,0.001528,0.002272,0.002520,0.002825,0.004731
CK_pct_diff,1606,0.004682,0.012068,-0.039435,-0.001244,0.002109,0.007088,0.171150
CK_mean_slope,1606,1.149725,9.984158,-66.363194,-0.621280,0.449471,2.174980,249.280809
Absolute Basophil Count_mean_slope,1607,-0.000066,0.004796,-0.047328,-0.000507,0.000000,0.000463,0.159286


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [20]:
# Column-wise statistics of the vectorized data. They are kept in their own
# variables because the same values are pickled and reused further down.
train_data_std = vectorized.std()
train_data_means = vectorized.mean()

# normalize takes the data, the metadata and the statistics above, and
# returns the normalized table plus the metadata.
# NOTE(review): a column with zero std would be a problem if normalize
# divides by it -- confirm how vectorizing_funcs handles that.
normalize_result = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
normalized, all_feature_metadata = normalize_result

# Summary per feature, lowest standard deviation first.
normalized_stats = normalized.describe().transpose()
normalized_stats.sort("std", ascending=True)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
family_ALS_hist_last,2424,-1.099231e-17,0.392879,-0.515927,0.000000,0.000000,0.000000,1.933092
temperature_pct_diff,2424,-8.885448e-18,0.475569,-1.532341,0.000000,0.000000,0.000000,20.819543
temperature_mean_slope,2424,2.152660e-18,0.475569,-3.795729,0.000000,0.000000,0.000000,17.239985
temperature_last,2424,2.038775e-15,0.476869,-0.795415,0.000000,0.000000,0.000000,23.031556
temperature_mean,2424,1.604968e-15,0.476869,-6.894990,0.000000,0.000000,0.000000,17.977563
Absolute Basophil Count_pct_diff,2424,-9.045753e-18,0.710164,-1.762579,-0.212634,0.000000,0.000000,11.063385
BMI_last,2424,6.651033e-16,0.749742,-2.409737,-0.248838,0.000000,0.026371,4.946826
CK_pct_diff,2424,1.106559e-16,0.813881,-3.655655,-0.353794,0.000000,0.000000,13.794001
CK_mean_slope,2424,1.209741e-16,0.813881,-6.762005,-0.114615,0.000000,0.000000,24.852481
Absolute Basophil Count_mean_slope,2424,2.719451e-17,0.814134,-9.854126,-0.014655,0.000000,0.053421,33.224835


## Pickle all metadata we will need to use later when applying vectorizer

In [21]:
# Persist the three objects needed to re-apply the vectorizer later: the
# feature metadata and the column means / standard deviations computed above.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left to garbage collection.
with open('../all_feature_metadata.pickle', 'wb') as metadata_file:
    pickle.dump(all_feature_metadata, metadata_file)
with open('../all_data_means.pickle', 'wb') as means_file:
    pickle.dump(train_data_means, means_file)
with open('../all_data_std.pickle', 'wb') as std_file:
    pickle.dump(train_data_std, std_file)


## Apply model on `train`,  `test` 


In [13]:

for t in ["all", "test"]:
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    print t, normalized.shape
    normalized.to_csv('../' + t + '_data_vectorized.csv' ,sep='|')

normalized.head()

all (2424, 157)
test (600, 157)


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_mean,Hematocrit_pct_diff,Hematocrit_last,Hematocrit_mean_slope,weight_mean,weight_pct_diff,weight_last,weight_mean_slope,Sodium_mean,...,Absolute Basophil Count_last,Absolute Basophil Count_mean_slope,Protein_mean,Protein_pct_diff,Protein_last,Protein_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,0.308109,0.774371,0.421352,1.164553,2.075785,0.306885,2.15478,0.036625,-0.78974,...,-0.356671,0.013707,-0.903013,0.415514,-0.772484,0.593836,-0.174544,0.664004,0.241958,0.183859
750094,0,0.176395,-0.700553,0.127252,-1.523341,-0.662893,1.36309,-0.555051,0.764636,-0.284365,...,-0.342738,-0.619241,0.054248,-0.610013,-0.306977,-0.345604,-0.375178,-0.069324,-0.303835,0.152007
750148,0,-0.358693,-0.678796,-0.607998,0.222221,-1.322711,0.152312,-1.138209,-1.657827,0.959634,...,-0.356671,0.114143,-0.027803,0.157225,0.391284,0.169204,-0.112162,-0.902253,-0.667697,-0.404947
750195,0,0.017241,0.684259,0.086405,-0.537463,1.391175,-0.566384,1.352893,-0.517893,0.026635,...,-0.335771,1.123467,-0.392474,1.372974,0.391284,-1.037765,0.049693,-1.543317,-0.376607,2.262823
750406,0,-0.313416,0.075976,-0.354745,0.39358,0.473174,2.091468,0.668315,0.708994,0.259885,...,-0.335771,0.065073,2.032587,-1.399655,1.089545,-1.157264,-0.823653,-0.120599,-0.631311,-7e-06
