## Build a model for vectorizing the raw data (it is applied once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the model is based on the categories found in the train data)
* handle time-series variables: reduce each series to scalar features using several hard-coded methods (e.g. `last`, `mean`, `mean_slope`, `pct_diff`)
* fill missing values with the train data means, and normalize to z-scores using the train data means and std


In [1]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [2]:
# Read the pipe-delimited raw training records; every column is loaded as text
# (dtype='unicode') and no column is used as the index.
df = pd.read_csv(
    '../train_data.csv',
    sep='|',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
The `*_funcs_to_features` arrays define pairs of funcs (either a single function or a list of functions) and the features these functions should be calculated for. Overlap between pairs is allowed.

There is one list for time-series functions (as described above) and one for dummy functions. Both are inverted into feature_to_funcs maps.

In [3]:
    
# Invert each funcs->features list into a per-feature map, tagging the entries
# as "ts" (time-series) or "dummy", then merge both into one metadata mapping.
ts_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
dummy_feature_metadata = invert_func_to_features(dummy_funcs_to_features, "dummy")

# The time-series map is the base object; dummy entries are merged into it in place.
all_feature_metadata = ts_feature_metadata
all_feature_metadata.update(dummy_feature_metadata)

## Learn to_dummies model
Which categories are available in our train data?

In [4]:
# Fit the dummy-variable model on the train frame and rebind the metadata to the result.
# NOTE(review): presumably this records, per categorical feature, the categories seen in
# `df` -- the helper lives in vectorizing_funcs and is not visible here; confirm there.
# The cell feeds `all_feature_metadata` back into itself, so re-running it applies the
# helper to already-updated metadata.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [5]:

# Vectorize the raw train frame; the call returns the vectorized frame together with
# the (possibly updated) metadata, which is rebound here.
# NOTE(review): debug=True appears to be what prints the feature names listed in the
# cell output -- confirm in vectorizing_funcs.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
vectorized.head()


family_ALS_hist
Hematocrit
Prothrombin Time (clotting)
weight
Sodium
hands
Total Cholesterol
pulse
White Blood Cell (WBC)
Monocytes
Lymphocytes
Albumin
Chloride
Neutrophils
ALSFRS_Total
Glucose
Alkaline Phosphatase
bp_systolic
Bicarbonate
Potassium
Urine Specific Gravity
bp_diastolic
Absolute Eosinophil Count
Red Blood Cells (RBC)
Blood Urea Nitrogen (BUN)
Phosphorus
CK
Eosinophils
fvc_percent
Absolute Neutrophil Count
Gender
Age
respiratory_rate
HbA1c (Glycated Hemoglobin)
Absolute Monocyte Count
Calcium
Urine Ph
mouth
Gamma-glutamyltransferase
Lactate Dehydrogenase
Basophils
onset_delta
Bilirubin (Total)
Creatinine
BMI
temperature
respiratory
AST(SGOT)
Hemoglobin
Absolute Lymphocyte Count
Platelets
Triglycerides
Race
if_use_Riluzole
Absolute Basophil Count
Protein
Uric Acid
ALT(SGPT)


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_pct_diff,Hematocrit_last,Hematocrit_mean_slope,Hematocrit_mean,Prothrombin Time (clotting)_pct_diff,Prothrombin Time (clotting)_last,Prothrombin Time (clotting)_mean_slope,Prothrombin Time (clotting)_mean,weight_pct_diff,...,Protein_mean_slope,Protein_mean,Uric Acid_pct_diff,Uric Acid_last,Uric Acid_mean_slope,Uric Acid_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,0.000783,43.6,0.031868,42.15,,10.9,,10.9,-0.000108,...,-0.010989,69.5,,,,,0.003663,24,0.065934,21.0
649,,0.000599,40.3,0.032502,39.25,,,,,0.000156,...,0.058039,71.0,,,,,0.007212,25,-0.072383,17.0
1234,,-0.00096,44.4,-0.033636,46.925,,,,,0.000227,...,-0.18153,74.5,,,,,0.000621,24,-0.100577,22.75
2492,,-0.000889,41.7,-0.038235,42.35,,,,,0.0,...,,,,,,,0.00098,62,0.058824,61.0
2956,,-0.000299,42.5,0.005195,43.3,,,,,-5.9e-05,...,0.096681,73.5,,,,,0.001558,28,-0.050505,25.25


In [11]:
# Summary statistics with one row per vectorized column, sparsest columns
# (lowest non-null count) first.
(vectorized
    .describe()
    .T
    .sort("count", ascending=True))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Uric Acid_mean_slope,14,-0.120236,1.235131,-1.835064,-1.122492,-0.218676,1.017733,1.878167
Uric Acid_pct_diff,14,0.000231,0.003514,-0.004902,-0.002327,0.000000,0.003350,0.004938
Urine Specific Gravity_pct_diff,16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Urine Specific Gravity_mean_slope,16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Prothrombin Time (clotting)_pct_diff,16,-0.002309,0.010642,-0.042078,0.000000,0.000000,0.001118,0.002361
Prothrombin Time (clotting)_mean_slope,16,-0.057315,0.247498,-0.984615,0.000000,0.000000,0.010577,0.025487
Lactate Dehydrogenase_pct_diff,23,-0.001245,0.003315,-0.005741,-0.003181,-0.002107,-0.000413,0.008750
Lactate Dehydrogenase_mean_slope,23,-0.345452,0.679580,-1.470029,-0.734821,-0.396724,-0.130013,1.400000
Urine Specific Gravity_last,38,1.012526,0.010631,1.000000,1.000000,1.015000,1.020750,1.032000
Urine Specific Gravity_mean,38,1.012526,0.010631,1.000000,1.000000,1.015000,1.020750,1.032000


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [10]:
# Column-wise mean and standard deviation of the current `vectorized` frame.
# These must come from the train split, so `vectorized` has to hold train data
# when this cell runs.
train_data_means, train_data_std = vectorized.mean(), vectorized.std()

# Fill/normalize using those statistics; the metadata is rebound to what normalize() returns.
normalized, all_feature_metadata = normalize(
    vectorized, all_feature_metadata, train_data_means, train_data_std
)

# Per-column summary, widest spread first.
normalized.describe().transpose().sort("std", ascending=False)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
White,600,-1.427562e-16,1.000000,-4.280683,0.233218,0.233218,0.233218,0.233218
Other,600,-1.540897e-16,1.000000,-0.100420,-0.100420,-0.100420,-0.100420,9.941579
Black,600,1.699104e-16,1.000000,-0.081855,-0.081855,-0.081855,-0.081855,12.196379
Asian,600,-1.544598e-16,1.000000,-0.108557,-0.108557,-0.108557,-0.108557,9.196363
M,600,-8.067621e-17,1.000000,-1.371668,-1.371668,0.727824,0.727824,0.727824
F,600,8.067621e-17,1.000000,-0.727824,-0.727824,-0.727824,1.371668,1.371668
weight_mean,600,-4.464854e-15,1.000000,-2.245890,-0.691216,-0.100135,0.592302,3.800597
Age_last,600,1.887379e-16,1.000000,-3.133269,-0.682762,0.077740,0.753742,2.021246
weight_last,600,-5.516976e-15,1.000000,-2.304977,-0.687073,-0.087849,0.575791,3.777144
Hispanic,600,3.745384e-16,1.000000,-0.057783,-0.057783,-0.057783,-0.057783,17.277201


## Pickle all metadata we will need to use later when applying vectorizer

In [8]:
# Persist everything needed to apply the vectorizer later: the feature metadata
# plus the train-data means and standard deviations used for filling/normalizing.
# Each file is opened in a `with` block so it is flushed and closed deterministically
# instead of relying on garbage collection of the anonymous file handle.
with open('../all_feature_metadata.pickle', 'wb') as metadata_file:
    pickle.dump(all_feature_metadata, metadata_file)

with open('../train_data_means.pickle', 'wb') as means_file:
    pickle.dump(train_data_means, means_file)

with open('../train_data_std.pickle', 'wb') as std_file:
    pickle.dump(train_data_std, std_file)


## Apply model on `train` and `test`


In [9]:

for t in ["train", "test"]:
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    print t, normalized.shape
    normalized.to_csv('../' + t + '_data_vectorized.csv' ,sep='|')

normalized.head()

train (1777, 217)
test (600, 217)


Unnamed: 0_level_0,family_ALS_hist_last,Hematocrit_pct_diff,Hematocrit_last,Hematocrit_mean_slope,Hematocrit_mean,Prothrombin Time (clotting)_pct_diff,Prothrombin Time (clotting)_last,Prothrombin Time (clotting)_mean_slope,Prothrombin Time (clotting)_mean,weight_pct_diff,...,Protein_mean_slope,Protein_mean,Uric Acid_pct_diff,Uric Acid_last,Uric Acid_mean_slope,Uric Acid_mean,ALT(SGPT)_pct_diff,ALT(SGPT)_last,ALT(SGPT)_mean_slope,ALT(SGPT)_mean
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,0.718587,0.42298,1.239825,0.307774,0,0,0,0,0.300212,...,0.601381,-0.896485,0,0,0,0,0.667253,0.330026,0.169942,-0.185482
750094,0,-0.62846,0.127913,-1.594938,0.175646,0,0,0,0,1.333537,...,-0.371369,0.06854,0,0,0,0,-0.078095,-0.384918,0.140473,-0.414586
750148,0,-0.608589,-0.609756,0.246003,-0.361125,0,0,0,0,0.148987,...,0.161693,-0.014176,0,0,0,0,-0.924677,-0.861547,-0.37484,-0.114248
750195,0,0.636287,0.086931,-0.555191,0.015991,0,0,0,0,-0.554141,...,-1.088073,-0.381805,0,0,0,0,-1.576249,-0.480243,2.093469,0.070576
750406,0,0.080744,-0.35567,0.426726,-0.315706,0,0,0,0,2.046137,...,-1.211808,2.062926,0,0,0,0,-0.130211,-0.813884,-0.000176,-0.926701
