## Build a model for vectorizing the raw data (apply it once on train and once on test):
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into N-1 dummy variables (the model is based on the categories found in the train data)
* handle time-series variables: reduce them using several hard-coded methods
* fill missing values with train data means, and normalize to z-scores with train data std


In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from vectorizing_funcs import *

In [2]:
# Load the raw train data: a pipe-delimited file in long form
# (one row per SubjectID / feature_name / feature_value observation).
# Every column is read as text (dtype='unicode'); conversion is left to later steps.
df = pd.read_csv(
    '../train_data.csv',
    sep='|',
    dtype='unicode',
    index_col=False,
    error_bad_lines=False,
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Build metadata: assign features to vectorizing functions
Each `*_funcs_to_features` list holds pairs of `funcs` (a single function or a list of functions) and the `features` those functions should be computed for. Overlapping is allowed: the same feature may appear in more than one pair.

There is one list for time-series functions (as described above) and one for dummy functions. Both are inverted into feature-to-funcs maps and merged into a single metadata dictionary.

In [3]:
# Time-series reducers and the features each one is applied to.
# Every entry pairs "funcs" (a single function or a list of functions) with the
# "features" that should get those functions calculated. A feature may appear in
# several entries; invert_func_to_features() unions its functions.
# The ts_* functions come from the star-import of vectorizing_funcs.
ts_funcs_to_features = [ 
    { 
        # Three reducers applied to each of the features below.
        "funcs": [ ts_stats, ts_mean_slope, ts_pct_diff ],
        "features": [
            'ALSFRS_Total', 'weight', 'Albumin', 'Creatinine',
            'bp_diastolic', 'bp_systolic', 'pulse', 'respiratory_rate', 'temperature',
        ]
    },
    {
        # A single function (not wrapped in a list) is allowed here.
        # Note the overlap with the first entry (ALSFRS_Total, Albumin, Creatinine).
        "funcs": ts_last_value,
        "features": [
            'ALSFRS_Total', 'BMI', 'height', 'Age', 'onset_delta', 'Albumin', 'Creatinine',
        ]
    },
    { 
        # fvc_percent only gets ts_pct_diff.
        "funcs": ts_pct_diff,
        "features": [ 
            'fvc_percent',
        ]
    },
    {
        # family_ALS_hist only gets ts_last_boolean.
        "funcs": ts_last_boolean,
        "features": [
            'family_ALS_hist',
        ]
    }
]

# Dummy-variable functions and the features they are applied to.
# Same entry layout as ts_funcs_to_features: "funcs" (one function or a list)
# paired with the "features" that should get them.
dummy_funcs_to_features = [ 
    { 
        "funcs": apply_scalar_feature_to_dummies,
        "features": [ 'Gender', 'Race' ]
    }   
]

def invert_func_to_features(ftf, feature_type):
    """Invert a funcs -> features specification into a per-feature metadata map.

    Args:
        ftf: iterable of dicts, each holding
            "funcs": a single function, or a list/tuple/set of functions, and
            "features": the feature names those functions apply to.
        feature_type: label stored under "feature_type" on every entry created
            here (the notebook passes "ts" or "dummy").

    Returns:
        dict mapping feature name to a metadata dict with the keys
        "feature_name", "funcs" (set of every function assigned to the feature
        across all entries), "feature_type" and "derived_features" (an empty
        set, to be filled in by later steps).
    """
    res = {}
    for ff in ftf:
        funcs = ff['funcs']
        features = ff['features']
        # Use isinstance rather than an exact type check so that tuples, sets
        # and list subclasses of functions are treated as collections instead
        # of being stored whole as if they were one function.
        if not isinstance(funcs, (list, tuple, set, frozenset)):
            funcs = [funcs]  # a single function
        if not funcs:
            continue  # an entry without functions registers nothing
        for feature in features:
            entry = res.setdefault(feature, {
                "feature_name": feature,
                "funcs": set(),
                "feature_type": feature_type,
                "derived_features": set(),
            })
            entry["funcs"].update(funcs)
    return res
    
# Single feature -> metadata map: time-series entries first, then the dummy
# entries layered on top (a feature present in both specs keeps its dummy entry).
all_feature_metadata = {}
all_feature_metadata.update(invert_func_to_features(ts_funcs_to_features, "ts"))
all_feature_metadata.update(invert_func_to_features(dummy_funcs_to_features, "dummy"))


## Learn to_dummies model
Which categories are available for each dummy feature in our train data?

In [4]:
def learn_to_dummies_model(df, all_feature_metadata):
    """Fill in "derived_features" for every dummy-type feature.

    Args:
        df: raw train data, passed through unchanged to
            learn_scalar_feature_to_dummies.
        all_feature_metadata: dict mapping feature name to its metadata dict
            (as built by invert_func_to_features).

    Returns:
        A new dict with a copy of every metadata entry. For entries whose
        "feature_type" is "dummy" and that have at least one function,
        "derived_features" is replaced by the value returned from
        learn_scalar_feature_to_dummies(df, entry). The input mapping and its
        entries are not reassigned.
    """
    # dict.copy() is shallow: the per-feature dicts would be shared with the
    # caller and assigning "derived_features" below would rewrite them too.
    # Copy each entry one level down instead. .items() works on Python 2 and 3.
    new_metadata = {feature: dict(fv) for feature, fv in all_feature_metadata.items()}
    for fv in new_metadata.values():
        # The learner does not depend on the individual function, so call it
        # once per feature (and not at all when no function is registered).
        if fv["feature_type"] == "dummy" and fv["funcs"]:
            fv["derived_features"] = learn_scalar_feature_to_dummies(df, fv)
    return new_metadata

# Learn the dummy categories from the train frame and rebind the metadata name to the result.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [5]:

# Pivot the long-form train frame into one vector per subject.
# vectorize() hands back the vectorized frame together with updated metadata,
# which replaces all_feature_metadata from here on.
vectorized, all_feature_metadata = vectorize(
    df,
    all_feature_metadata,
    debug=True,
)
vectorized.head()


family_ALS_hist
weight
Gender
Age
respiratory_rate
pulse
height
Race
onset_delta
Albumin
ALSFRS_Total
temperature
bp_systolic
BMI
bp_diastolic
Creatinine
fvc_percent


Unnamed: 0_level_0,family_ALS_hist_last,weight_mean,weight_median,weight_std,weight_pct_diff,weight_mean_slope,F,M,Age_last,respiratory_rate_mean,...,bp_diastolic_std,bp_diastolic_pct_diff,bp_diastolic_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,Creatinine_pct_diff,Creatinine_last,Creatinine_mean_slope,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,51.0,51.0,0.547723,-0.000108,0.004337,1,0,65,17.333333,...,7.402702,0.000265,-0.055548,79.56,79.56,0.0,0.0,79.56,0.0,-0.00722
649,,74.566667,74.4,0.960902,0.000156,0.035425,1,0,48,14.666667,...,2.309401,0.0,-0.0625,50.83,53.04,4.42,0.002564,53.04,0.271247,-0.000863
1234,,95.05,94.85,0.714143,0.000227,0.012698,0,1,38,22.0,...,8.062258,0.001071,-0.398557,88.4,88.4,0.0,0.0,88.4,0.0,0.002728
2492,,90.1,90.1,0.0,0.0,0.0,0,1,63,16.0,...,7.071068,-0.003268,-0.294118,66.3,66.3,6.250824,0.004202,70.72,0.26,-0.001989
2956,,65.1,65.15,0.469042,-5.9e-05,-0.032251,1,0,63,17.5,...,6.608076,-0.000371,0.340548,55.25,53.04,4.42,0.002165,61.88,0.038268,0.001571


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [6]:
# Column-wise statistics of the vectorized train data; these same values are
# reused later for every data set that is normalized.
train_data_means, train_data_std = vectorized.mean(), vectorized.std()
normalized, all_feature_metadata = normalize(
    vectorized,
    all_feature_metadata,
    train_data_means,
    train_data_std,
)
normalized.head()


Unnamed: 0_level_0,family_ALS_hist_last,weight_mean,weight_median,weight_std,weight_pct_diff,weight_mean_slope,F,M,Age_last,respiratory_rate_mean,...,bp_diastolic_std,bp_diastolic_pct_diff,bp_diastolic_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,Creatinine_pct_diff,Creatinine_last,Creatinine_mean_slope,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,0,-1.620202,-1.617145,-0.435513,-0.232289,-0.002066,1.322796,-1.322796,0.941297,-0.058702,...,0.345707,0.099808,-0.109689,0.547815,0.558684,-1.276692,-0.0252,0.563661,0.175924,-1.864018
649,0,-0.086445,-0.100773,-0.163334,0.218932,0.466682,1.322796,-1.322796,-0.558473,-1.039658,...,-1.053767,-0.019317,-0.127591,-1.117773,-0.958224,-0.357095,-0.02328,-0.863703,0.449074,-0.03798
1234,0,1.246644,1.224432,-0.325885,0.340632,0.124007,-0.755549,0.755549,-1.440691,1.657972,...,0.526932,0.462695,-0.993015,1.060304,1.064321,-1.276692,-0.0252,1.039449,0.175924,0.993322
2492,0,0.92449,0.916621,-0.796321,-0.047759,-0.06746,-0.755549,0.755549,0.764854,-0.54918,...,0.254585,-1.489507,-0.72406,-0.220918,-0.19977,0.023815,-0.022054,0.087873,0.437748,-0.361454
2956,0,-0.702551,-0.700194,-0.487343,-0.149486,-0.553742,1.322796,-1.322796,0.764854,0.002608,...,0.127369,-0.186248,0.910351,-0.861529,-0.958224,-0.357095,-0.023579,-0.387915,0.21446,0.661195


## Pickle all metadata we will need to use later when applying vectorizer

In [7]:
# Persist everything needed to apply the vectorizer later: the feature
# metadata and the train means / stds used for filling and normalizing.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises (the bare open(...) calls were never closed).
with open('../all_feature_metadata.pickle', 'wb') as pickle_file:
    pickle.dump(all_feature_metadata, pickle_file)
with open('../train_data_means.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_means, pickle_file)
with open('../train_data_std.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_std, pickle_file)


## Apply model on `train` and `test`


In [8]:

# Vectorize and normalize both data sets with the metadata and the train
# means / stds computed above, and write each result next to its input as
# '../<name>_data_vectorized.csv' (pipe-delimited).
for t in ["train", "test"]:
    df = pd.read_csv('../' + t + '_data.csv', sep='|', error_bad_lines=False,
                     index_col=False, dtype='unicode')
    # The metadata returned by vectorize/normalize is discarded here so the
    # learned all_feature_metadata stays untouched.
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    # Formatted call instead of the Python-2-only print statement: the output
    # ("train (1777, 62)") is identical on Python 2 and Python 3.
    print("%s %s" % (t, normalized.shape))
    normalized.to_csv('../' + t + '_data_vectorized.csv', sep='|')

# After the loop `df`, `vectorized` and `normalized` hold the last data set
# ("test"), so this preview shows test rows.
normalized.head()

train (1777, 62)
test (600, 62)


Unnamed: 0_level_0,family_ALS_hist_last,weight_mean,weight_median,weight_std,weight_pct_diff,weight_mean_slope,F,M,Age_last,respiratory_rate_mean,...,bp_diastolic_std,bp_diastolic_pct_diff,bp_diastolic_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,Creatinine_pct_diff,Creatinine_last,Creatinine_mean_slope,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,2.120464,2.078067,-0.146931,0.300212,0.033383,-0.755549,0.755549,-0.029142,0.0,...,-0.109896,0.845833,0.986804,-0.078882,0.069064,-0.340451,-0.026243,-0.381457,0.142725,0.0
750094,0,-0.672179,-0.677513,0.258353,1.333537,0.740484,1.322796,-1.322796,0.853076,-1.530136,...,-0.101944,1.017268,0.717935,-0.513686,-0.445724,0.010049,-0.026143,-0.381457,-0.248462,0.061534
750148,0,-1.344999,-1.390337,0.949332,0.148987,-1.612401,1.322796,-1.322796,1.117741,-0.181321,...,0.955335,-0.490147,-1.242494,0.879956,0.927044,0.201766,-0.024907,0.53352,0.692215,0.08929
750195,0,1.422364,1.412358,-0.3771,-0.554141,-0.505208,-0.755549,0.755549,1.117741,-0.54918,...,0.254585,-1.631784,-0.797358,2.426941,2.581229,-0.214833,-0.0252,2.466813,-4.275089,0.209846
750406,0,0.486274,0.378763,0.991207,2.046137,0.686441,1.322796,-1.322796,0.500188,-0.917039,...,-0.38016,0.392659,0.150429,1.700915,1.569957,-0.357095,-0.0252,1.515237,0.332099,-0.146617
