## Builds a model for vectorizing the raw data (apply it once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the model is based on the categories found in the train data)
* handle time-series variables: reduce them using several hard-coded methods
* fill missing values with train data means, and normalize to z-scores with train data std


In [1]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [2]:
# Load the raw, pipe-separated training records. dtype='unicode' keeps every
# column as text, and error_bad_lines=False makes pandas drop malformed rows
# instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
# (on_bad_lines='skip' replaces it).
df = pd.read_csv(
    '../train_data.csv',
    sep='|',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
The `ts_funcs_to_features` and `dummy_funcs_to_features` arrays define pairs of functions (either a single function or a list of functions) and the features those functions should be calculated for. Overlapping is allowed.

There is a list for time-series functions (as described before) and for dummy functions. Both are inverted to feature_to_funcs maps.

In [3]:
    
# Build the combined feature metadata: invert the time-series list first, then
# invert the dummy-variable list and merge its entries into the same mapping.
all_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
dummy_feature_metadata = invert_func_to_features(dummy_funcs_to_features, "dummy")
all_feature_metadata.update(dummy_feature_metadata)

## Learn to_dummies model
Which kind of categories do we have available in our train data?

In [4]:
# Learn the to_dummies model from the raw train frame. The returned value
# replaces all_feature_metadata for the cells that follow.
# NOTE(review): presumably this records, per dummy feature, the categories seen
# in the train data -- confirm in vectorizing_funcs.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [7]:

# Vectorize the raw train frame. vectorize returns a pair: the vectorized frame
# and a metadata object, which replaces all_feature_metadata.
# NOTE(review): debug=True presumably produces the feature-name listing printed
# in this cell's output -- confirm in vectorizing_funcs.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
vectorized.head()


family_ALS_hist
Hematocrit
Urine Specific Gravity
weight
Sodium
if_use_Riluzole
Absolute Band Neutrophil Count
Total Cholesterol
pulse
White Blood Cell (WBC)
Lactate Dehydrogenase
Monocytes
onset_delta
Lymphocytes
Albumin
Chloride
Bilirubin (Total)
ALSFRS_Total
Glucose
Alkaline Phosphatase
bp_systolic
Bicarbonate
Potassium
Prothrombin Time (clotting)
bp_diastolic
Absolute Eosinophil Count
Red Blood Cells (RBC)
Blood Urea Nitrogen (BUN)
Phosphorus
CK
Eosinophils
Absolute Neutrophil Count
Gender
Age
respiratory_rate
HbA1c (Glycated Hemoglobin)
Absolute Monocyte Count
Calcium
Urine Ph
mouth
Gamma-glutamyltransferase
hands
Basophils
Free T4
Neutrophils
Protein
BMI
temperature
respiratory
AST(SGOT)
Hemoglobin
Absolute Lymphocyte Count
Platelets
Uric Acid
Race
fvc_percent
Absolute Basophil Count
Creatinine
Triglycerides
ALT(SGPT)


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_last,Hematocrit_mean_slope,Hematocrit_mean,Hematocrit_median,Hematocrit_std,Hematocrit_pct_diff,Urine Specific Gravity_last,Urine Specific Gravity_mean_slope,Urine Specific Gravity_mean,...,Triglycerides_mean,Triglycerides_median,Triglycerides_std,Triglycerides_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_median,ALT(SGPT)_std,ALT(SGPT)_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,43.6,0.031868,42.15,42.15,2.05061,0.000783,,,,...,,,,,24,0.065934,21.0,21.0,4.242641,0.003663
649,,40.3,0.032502,39.25,39.15,0.946925,0.000599,,,,...,1.935125,1.8193,0.470852,-0.004744,25,-0.072383,17.0,16.5,6.164414,0.007212
1234,,44.4,-0.033636,46.925,47.1,1.972097,-0.00096,,,,...,1.45205,1.42945,0.156169,0.000576,24,-0.100577,22.75,23.5,2.629956,0.000621
2492,,41.7,-0.038235,42.35,42.35,0.919239,-0.000889,,,,...,,,,,62,0.058824,61.0,61.0,1.414214,0.00098
2956,,42.5,0.005195,43.3,43.35,0.627163,-0.000299,,,,...,3.008625,2.9606,0.53109,0.00274,28,-0.050505,25.25,25.5,2.5,0.001558


In [8]:
# Inspect the column labels of the vectorized train frame.
vectorized.columns

Index([u'family_ALS_hist_last', u'Hematocrit_last', u'Hematocrit_mean_slope',
       u'Hematocrit_mean', u'Hematocrit_median', u'Hematocrit_std',
       u'Hematocrit_pct_diff', u'Urine Specific Gravity_last',
       u'Urine Specific Gravity_mean_slope', u'Urine Specific Gravity_mean', 
       ...
       u'Triglycerides_mean', u'Triglycerides_median', u'Triglycerides_std',
       u'Triglycerides_pct_diff', u'ALT(SGPT)_last', u'ALT(SGPT)_mean_slope',
       u'ALT(SGPT)_mean', u'ALT(SGPT)_median', u'ALT(SGPT)_std',
       u'ALT(SGPT)_pct_diff'],
      dtype='object', length=331)

## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [10]:
# Column-wise statistics of the vectorized train frame. These same values are
# passed to normalize for both the train and the test data.
train_data_means = vectorized.mean()
train_data_std = vectorized.std()

# normalize returns the normalized frame and a metadata object, which replaces
# all_feature_metadata.
normalized, all_feature_metadata = normalize(
    vectorized,
    all_feature_metadata,
    train_data_means,
    train_data_std,
)
normalized.head()


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_last,Hematocrit_mean_slope,Hematocrit_mean,Hematocrit_median,Hematocrit_std,Hematocrit_pct_diff,Urine Specific Gravity_last,Urine Specific Gravity_mean_slope,Urine Specific Gravity_mean,...,Triglycerides_mean,Triglycerides_median,Triglycerides_std,Triglycerides_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean,ALT(SGPT)_median,ALT(SGPT)_std,ALT(SGPT)_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,0,0.341017,0.328455,0.229323,0.231154,0.87538,0.458821,0,0,0,...,0.0,0.0,0.0,0.0,-0.527906,0.050588,-0.738027,-0.696299,-0.28309,0.218664
649,0,0.070538,0.333508,-0.010159,-0.016626,-0.48784,0.345206,0,0,0,...,-0.12405,-0.183709,-0.044,-0.201068,-0.480243,-0.07737,-0.953654,-0.936033,-0.089644,0.63652
1234,0,0.406587,-0.19348,0.623643,0.639992,0.778405,-0.620018,0,0,0,...,-0.470018,-0.46667,-0.660942,-0.075723,-0.527906,-0.103453,-0.64369,-0.563114,-0.445424,-0.139532
2492,0,0.185287,-0.230124,0.245839,0.247673,-0.522037,-0.575963,0,0,0,...,0.0,0.0,0.0,0.0,1.283284,0.04401,1.418247,1.434666,-0.567801,-0.097226
2956,0,0.250857,0.115924,0.32429,0.330266,-0.882795,-0.210464,0,0,0,...,0.64477,0.644668,0.074097,-0.024743,-0.337255,-0.057131,-0.508923,-0.456566,-0.458505,-0.029158


## Pickle all metadata we will need to use later when applying vectorizer

In [11]:
# Persist everything needed to apply the vectorizer later: the feature
# metadata plus the train means and stds passed to normalize.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('../all_feature_metadata.pickle', 'wb') as pickle_file:
    pickle.dump(all_feature_metadata, pickle_file)
with open('../train_data_means.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_means, pickle_file)
with open('../train_data_std.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_std, pickle_file)


## Apply the model to `train` and `test`


In [None]:

# Re-read each raw data set, vectorize it with the learned metadata, normalize
# it with the train means/stds (never the set's own statistics), and write the
# result next to the input as '<name>_data_vectorized.csv'.
# NOTE: the loop rebinds df, vectorized and normalized on every pass, so after
# it finishes they hold the test data -- the head() below shows the test set.
for t in ["train", "test"]:
    # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
    # 2.0 (on_bad_lines='skip' replaces it).
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    # A single pre-formatted string prints identically under Python 2 and 3;
    # the bare `print t, ...` statement is a SyntaxError on Python 3.
    print("%s %s" % (t, normalized.shape))
    normalized.to_csv('../' + t + '_data_vectorized.csv', sep='|')

normalized.head()