## Builds a model for vectorizing the raw data (apply it once on train and once on test) :
* pivot from the initial feature_name:feature_value form to a vector
* handle dummy variables: translate categorical variables into dummy (one-hot) variables, one per category (the model is based on the categories seen in the train data)
* handle time-series variables: reduce them to scalars using several hard-coded methods
* fill missing values with train data means, and normalize to z-scores with train data std


In [199]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

In [200]:
# Load the raw train records: pipe-separated, every column read as text
# (dtype='unicode'), malformed lines skipped instead of aborting the load.
df = pd.read_csv(
    '../train_data.csv',
    sep='|',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
df.head()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0
2,533,Demographic,Race,White,,0.0
3,533,ALSHX,onset_delta,-1023,,0.0
4,533,ALSHX,diag_delta,-44,,0.0


# Define all kinds of vectorization and aggregation functions

## Global functions
Should receive (df, feature_metadata) and return a DataFrame with SubjectID as the index and one column per derived feature

### Scalar -> Dummies

In [201]:
def scalar_feature_to_dummies_core(df, feature_metadata):
    """Build the dummy (one-hot) columns of a single categorical feature.

    Args:
        df: long-format DataFrame with at least the columns ``SubjectID``,
            ``feature_name`` and ``feature_value``.
        feature_metadata: dict whose ``"feature_name"`` entry names the
            feature to encode.

    Returns:
        DataFrame indexed by ``SubjectID`` with one indicator column per
        distinct value of the feature found in ``df``.
    """
    feature_rows = df[df.feature_name == feature_metadata["feature_name"]]
    # Reduce explicitly to one value per subject (the last non-null row).
    # The previous identity aggregation (``aggfunc=lambda x: x``) is not a
    # reduction and fails as soon as a subject has the feature recorded twice.
    per_subject = feature_rows.groupby('SubjectID')['feature_value'].last()
    return pd.get_dummies(per_subject)

def learn_scalar_feature_to_dummies(df, feature_metadata):
    """Return the dummy column labels that the feature yields on ``df``.

    These labels are the "learned model" of a dummy feature: the set of
    categories that later encodings are aligned to.
    """
    return scalar_feature_to_dummies_core(df, feature_metadata).columns

def apply_scalar_feature_to_dummies(df, feature_metadata):
    """Encode the feature on ``df`` using the previously learned categories.

    Columns are aligned to ``feature_metadata["derived_features"]``:
    categories missing from ``df`` become all-zero columns, and categories
    that were not learned are dropped.
    """
    encoded = scalar_feature_to_dummies_core(df, feature_metadata)
    learned_columns = feature_metadata["derived_features"]
    return encoded.reindex(columns=learned_columns, fill_value=0)

## Time Series functions
Are invoked per SubjectID and with the valid timeframe data only (feature_delta < 92 days). Each should receive (ts_data, feature_metadata), where ts_data is a DataFrame with 'feature_value' and 'feature_delta' columns, and return a dict mapping a column suffix (e.g. "last", "mean", ...) to the value

NOTE: here there is no learned model - we apply the same hard-coded treatment

### Timeseries -> Slope, %diff, stats

In [202]:
def ts_pct_diff(ts_data, feature_metadata):
    """Relative change between the first and last measurement per time unit.

    Computes ``(last - first) / (first * (t_last - t_first))`` after ordering
    the rows by ``feature_delta``.

    Args:
        ts_data: DataFrame with ``feature_value`` (convertible to float) and
            ``feature_delta`` columns for one subject and one feature.
        feature_metadata: unused; kept for the common reducer signature.

    Returns:
        ``{"pct_diff": value}``. The value is ``None`` when fewer than two
        rows are given or when the ratio is not a finite number (zero first
        value, zero elapsed time, or missing inputs).
    """
    if len(ts_data) < 2:
        return {"pct_diff": None}

    # sort_values replaces DataFrame.sort, which was removed from pandas.
    ordered = ts_data.sort_values('feature_delta', kind='mergesort')
    values = ordered.feature_value.astype('float')
    times = ordered.feature_delta.astype('float')

    elapsed = times.iloc[-1] - times.iloc[0]
    with np.errstate(divide='ignore', invalid='ignore'):
        val = (values.iloc[-1] - values.iloc[0]) / (values.iloc[0] * elapsed)

    # Reject +inf, -inf and NaN alike; comparing against float('inf') only
    # let -inf through, which would poison the train means and stds.
    if not np.isfinite(val):
        return {"pct_diff": None}

    return {"pct_diff": val}
    
def ts_stats(ts_data, feature_metadata):
    """Summarize a subject's measurements by mean, std and median.

    ``ts_data.feature_value`` is converted to float before aggregating.
    All three statistics are ``None`` when ``ts_data`` has no rows.
    ``feature_metadata`` is not used.
    """
    if len(ts_data) >= 1:
        series = ts_data.feature_value.astype('float')
        mean, std, median = series.mean(), series.std(), series.median()
    else:
        mean = std = median = None
    return {"mean": mean, "std": std, "median": median}
    
def ts_mean_slope(ts_data, feature_metadata):
    """Mean slope of every later measurement relative to the first one.

    Rows are ordered by ``feature_delta``; for each row after the first the
    slope ``(value - first_value) / (delta - first_delta)`` is computed and
    the finite slopes are averaged.

    Args:
        ts_data: DataFrame with ``feature_value`` and ``feature_delta``
            columns (both convertible to float).
        feature_metadata: unused; kept for the common reducer signature.

    Returns:
        ``{"mean_slope": value}``. The value is ``None`` when fewer than two
        rows are given or when no finite slope can be computed.
    """
    if len(ts_data) < 2:
        return {"mean_slope": None}

    # sort_values replaces DataFrame.sort, which was removed from pandas.
    # mergesort is stable, so rows with equal deltas keep their input order.
    ordered = ts_data.sort_values('feature_delta', kind='mergesort')
    values = ordered.feature_value.astype('float').values
    times = ordered.feature_delta.astype('float').values

    with np.errstate(divide='ignore', invalid='ignore'):
        slopes = (values[1:] - values[0]) / (times[1:] - times[0])

    # Drop +inf, -inf and NaN (measurements sharing the first timestamp).
    slopes = slopes[np.isfinite(slopes)]
    if len(slopes) == 0:
        return {"mean_slope": None}
    return {"mean_slope": slopes.mean()}


## Timeseries -> last value

In [203]:
def ts_last_value(ts_data, feature_metadata):
    """Return the chronologically last measurement as a float.

    Args:
        ts_data: DataFrame with ``feature_value`` (convertible to float) and
            ``feature_delta`` columns.
        feature_metadata: unused; kept for the common reducer signature.

    Returns:
        ``{"last": value}``; the value is ``None`` when ``ts_data`` is empty.
    """
    if len(ts_data) < 1:
        return {"last": None}

    # sort_values replaces DataFrame.sort, which was removed from pandas.
    # mergesort is stable, so rows with equal deltas keep their input order.
    ordered = ts_data.sort_values('feature_delta', kind='mergesort')
    return {"last": ordered.feature_value.astype('float').iloc[-1]}

In [204]:
def ts_last_boolean(ts_data, feature_metadata):
    """Return the chronologically last yes/no measurement as 1 or 0.

    The latest ``feature_value`` (by ``feature_delta``) is compared
    case-insensitively with ``'y'`` and ``'true'``; any other value maps
    to 0.

    Args:
        ts_data: DataFrame with ``feature_value`` and ``feature_delta``
            columns.
        feature_metadata: unused; kept for the common reducer signature.

    Returns:
        ``{"last": 1}`` or ``{"last": 0}``, or ``{"last": None}`` when
        ``ts_data`` is empty.
    """
    if len(ts_data) < 1:
        return {"last": None}

    # Order by time first, as ts_last_value does; previously the last row in
    # file order was used, which is not necessarily the latest measurement.
    ordered = ts_data.sort_values('feature_delta', kind='mergesort')
    latest = str(ordered.feature_value.iloc[-1]).lower()
    return {"last": 1 if latest in ('y', 'true') else 0}
    

# Build metadata: assign features to vectorizing functions
funcs_to_features arrays define pairs of funcs (can be a list of functions or a single one) and features that should get these functions calculated. Overlapping is allowed.

There is a list for time-series functions (as described before) and for dummy functions. Both are inverted to feature_to_funcs maps.

In [206]:
# Time-series reducers and the raw features each one is applied to.
# "funcs" is either a single function or a list of functions; a feature may be
# listed in several entries (e.g. 'ALSFRS_Total'), in which case it gets the
# union of the listed functions.
ts_funcs_to_features = [ 
    { 
        # Descriptive statistics plus two trend measures.
        "funcs": [ ts_stats, ts_mean_slope, ts_pct_diff ],
        "features": [
            'ALSFRS_Total', 'weight', 'Albumin', 'Creatinine',
            'bp_diastolic', 'bp_systolic', 'pulse', 'respiratory_rate', 'temperature',
        ]
    },
    {
        # Latest numeric value only.
        "funcs": ts_last_value,
        "features": [
            'ALSFRS_Total', 'BMI', 'height', 'Age', 'onset_delta', 'Albumin', 'Creatinine',
        ]
    },
    { 
        # Relative change only.
        "funcs": ts_pct_diff,
        "features": [ 
            'fvc_percent',
        ]
    },
    {
        # Latest value interpreted as a yes/no flag.
        "funcs": ts_last_boolean,
        "features": [
            'family_ALS_hist',
        ]
    }
]

# Categorical features that are expanded into dummy columns; same
# {"funcs", "features"} layout as ts_funcs_to_features.
dummy_funcs_to_features = [ 
    { 
        "funcs": apply_scalar_feature_to_dummies,
        "features": [ 'Gender', 'Race' ]
    }   
]

def invert_func_to_features(ftf, feature_type):
    """Invert a funcs -> features listing into a feature -> metadata map.

    Args:
        ftf: iterable of dicts with a ``"funcs"`` entry (a single function or
            a list/tuple/set of functions) and a ``"features"`` entry (an
            iterable of feature names). A feature may appear in several dicts.
        feature_type: label stored as ``"feature_type"`` on every entry.

    Returns:
        dict mapping each feature name to a dict with the keys
        ``"feature_name"``, ``"funcs"`` (set of functions),
        ``"feature_type"`` and ``"derived_features"`` (initially an empty
        set).
    """
    res = {}
    for ff in ftf:
        funcs = ff['funcs']
        # isinstance instead of an exact type check, so a tuple or set of
        # functions is not mistaken for one single function.
        if not isinstance(funcs, (list, tuple, set, frozenset)):
            funcs = [funcs]  # a single function
        if not funcs:
            # No functions listed: nothing to register for these features.
            continue
        for feature in ff['features']:
            if feature not in res:
                res[feature] = {"feature_name": feature, "funcs": set(),
                                "feature_type": feature_type, "derived_features": set()}
            res[feature]["funcs"].update(funcs)
    return res
    
# Build the per-feature metadata: time-series features first, then the dummy
# features are merged into the same dict (update() overwrites the entry of a
# feature name that appears in both listings).
all_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
all_feature_metadata.update(invert_func_to_features(dummy_funcs_to_features, "dummy"))


## Learn to_dummies model
Which kind of categories do we have available in our train data?

In [207]:
def learn_to_dummies_model(df, all_feature_metadata):
    """Learn, for every dummy feature, the categories present in ``df``.

    Args:
        df: long-format data to learn the categories from.
        all_feature_metadata: dict mapping feature name to its metadata dict
            (with at least ``"feature_type"``).

    Returns:
        A new metadata dict. Entries whose ``"feature_type"`` is ``"dummy"``
        get ``"derived_features"`` set to the dummy column labels found in
        ``df``; all other entries are carried over unchanged.
    """
    new_metadata = {}
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for feature, fv in all_feature_metadata.items():
        # Copy each per-feature dict: a shallow copy of the outer dict alone
        # would write the learned columns into the caller's metadata.
        learned = dict(fv)
        if fv["feature_type"] == "dummy":
            # Learned once per feature; the result does not depend on which
            # of the feature's funcs is considered.
            learned["derived_features"] = learn_scalar_feature_to_dummies(df, fv)
        new_metadata[feature] = learned
    return new_metadata

# Learn the dummy categories from the train data loaded above.
all_feature_metadata = learn_to_dummies_model(df, all_feature_metadata)

## Vectorize `train` data

In [208]:
def to_series(f):
    """Adapt a dict-returning reducer so that it returns a ``pd.Series``.

    The returned callable takes ``(x, args)``, forwards both to ``f`` and
    wraps the result in a Series.
    """
    return lambda x, args: pd.Series(f(x, args))

def parse_feature_delta(fd):
    """Parse a feature_delta, which can come in odd forms such as '54;59'.

    Args:
        fd: a number, or a string holding one or more ';'-separated numbers.

    Returns:
        Floats (including NaN) are returned unchanged and integers are
        converted to float. For strings, the first ';'-separated component is
        parsed as a float. ``None`` is returned when the value cannot be
        parsed.
    """
    # isinstance covers every float/int flavour (e.g. np.float32), not just
    # the two exact types that were checked before.
    if isinstance(fd, (float, np.floating)):
        return fd
    if isinstance(fd, (int, np.integer)):
        return float(fd)
    try:
        first_value = fd.split(';')[0]
    except AttributeError:
        # Not a string-like value (e.g. None).
        return None
    try:
        return float(first_value)
    except ValueError:
        return None


In [211]:

def vectorize(df, all_feature_metadata, debug=False, max_delta=92):
    """Pivot long-format records into one feature vector per subject.

    Args:
        df: long-format DataFrame with the columns ``SubjectID``,
            ``feature_name``, ``feature_value`` and ``feature_delta``.
            It is not modified.
        all_feature_metadata: dict mapping feature name to its metadata
            (``"feature_type"``, ``"funcs"``, ``"derived_features"``).
            It is not modified.
        debug: when true, print each feature name once it is processed.
        max_delta: only rows with ``feature_delta < max_delta`` are fed to
            the time-series functions.

    Returns:
        ``(vectorized, new_metadata)``: a DataFrame indexed by ``SubjectID``
        with the derived feature columns, and a copy of the metadata in which
        the ``"derived_features"`` of every ``"ts"`` feature also contain the
        column names produced here.

    Raises:
        ValueError: if a feature has a type other than "dummy" or "ts".
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger one)
    # is neither mutated nor the cause of a SettingWithCopy warning.
    df = df.copy()
    df['feature_delta'] = df.feature_delta.apply(parse_feature_delta).astype('float')

    vectorized = pd.DataFrame(index=df.SubjectID.unique())
    pointintime_data = df[df.feature_delta < max_delta]
    # keep='last' replaces take_last=True, which was removed from pandas.
    pointintime_data = pointintime_data.drop_duplicates(
        subset=['SubjectID', 'feature_name', 'feature_delta'], keep='last')

    # Copy the per-feature dicts (and the sets that get extended below) so
    # the input metadata stays untouched.
    new_metadata = {}
    for feature, fm in all_feature_metadata.items():
        fm_copy = dict(fm)
        if fm["feature_type"] == "ts":
            fm_copy["derived_features"] = set(fm["derived_features"])
        new_metadata[feature] = fm_copy

    for feature, fm in all_feature_metadata.items():
        feature_type = fm["feature_type"]
        if feature_type not in ("dummy", "ts"):
            raise ValueError("unknown feature type: " + str(feature_type))

        feature_ts_data = pointintime_data[pointintime_data.feature_name == feature]
        for func in fm["funcs"]:
            if feature_type == "dummy":
                # Dummy functions get the whole frame, not the time-filtered one.
                res = func(df, fm)
            else:
                res = pd.DataFrame(
                    feature_ts_data.groupby('SubjectID').apply(to_series(func), args=fm))
                res.columns = [feature + "_" + str(col_suffix) for col_suffix in res.columns]
                new_metadata[feature]["derived_features"].update(res.columns)
            vectorized = pd.merge(vectorized, res, how='left', right_index=True, left_index=True)
        if debug:
            print(feature)

    vectorized.index.name = 'SubjectID'
    return vectorized, new_metadata

In [212]:

# Vectorize the train data; debug=True prints each feature name as it is
# processed. The returned metadata replaces the previous one.
vectorized, all_feature_metadata = vectorize(df, all_feature_metadata, debug=True)
vectorized.head()


family_ALS_hist
weight
Gender
Age
respiratory_rate
pulse
height
Race
onset_delta
Albumin
ALSFRS_Total
temperature
BMI
bp_diastolic
bp_systolic
Creatinine
fvc_percent


Unnamed: 0_level_0,family_ALS_hist_last,weight_pct_diff,weight_mean_slope,weight_mean,weight_median,weight_std,F,M,Age_last,respiratory_rate_pct_diff,...,bp_systolic_mean,bp_systolic_median,bp_systolic_std,Creatinine_last,Creatinine_pct_diff,Creatinine_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,,-0.000108,0.004337,51.0,51.0,0.547723,1,0,65,0.0,...,141.333333,139.5,10.1915,79.56,0.0,0.0,79.56,79.56,0.0,-0.00722
649,,0.000156,0.035425,74.566667,74.4,0.960902,1,0,48,0.001171,...,138.666667,140.0,2.309401,53.04,0.002564,0.271247,50.83,53.04,4.42,-0.000863
1234,,0.000227,0.012698,95.05,94.85,0.714143,0,1,38,0.002857,...,108.25,108.0,1.258306,88.4,0.0,0.0,88.4,88.4,0.0,0.002728
2492,,0.0,0.0,90.1,90.1,0.0,0,1,63,0.0,...,140.0,140.0,14.142136,70.72,0.004202,0.26,66.3,66.3,6.250824,-0.001989
2956,,-5.9e-05,-0.032251,65.1,65.15,0.469042,1,0,63,0.0,...,131.0,130.0,22.891046,61.88,0.002165,0.038268,55.25,53.04,4.42,0.001571


## Filling empty values with means and normalizing
- NOTE that we have to use the `train` data means and std

In [213]:
# Per-column mean and standard deviation of the vectorized train data; these
# are the statistics used to fill missing values and to scale below.
train_data_means = vectorized.mean()
train_data_std = vectorized.std()

def normalize(vectorized, all_feature_metadata, train_data_means, train_data_std):
    """Fill missing values with the train means and scale to z-scores.

    Args:
        vectorized: DataFrame of feature vectors (one row per subject).
        all_feature_metadata: dict mapping feature name to metadata; the
            columns listed under ``"derived_features"`` are the ones scaled.
        train_data_means: Series of per-column means of the train data.
        train_data_std: Series of per-column standard deviations of the
            train data.

    Returns:
        ``(normalized, all_feature_metadata)``. ``normalized`` has exactly
        the columns of ``train_data_means``; the metadata is returned
        unchanged.
    """
    # Align to the train columns: extra columns are dropped, missing ones are
    # created empty and then filled with the train mean.
    vectorized = vectorized.reindex(columns=train_data_means.keys())
    normalized = vectorized.fillna(train_data_means)
    for fm in all_feature_metadata.values():
        for col in fm["derived_features"]:
            scale = train_data_std[col]
            # A column that is constant in the train data has std 0 (or NaN
            # with a single row); dividing by it would turn the whole column
            # into NaN/inf, so such columns are only centered.
            if not np.isfinite(scale) or scale == 0:
                scale = 1.0
            centered = normalized[col].astype('float') - train_data_means[col]
            normalized[col] = centered / scale
    return normalized, all_feature_metadata
            
# Normalize the train vectors with the train statistics computed above.
normalized, all_feature_metadata = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
normalized.head()


Unnamed: 0_level_0,family_ALS_hist_last,weight_pct_diff,weight_mean_slope,weight_mean,weight_median,weight_std,F,M,Age_last,respiratory_rate_pct_diff,...,bp_systolic_mean,bp_systolic_median,bp_systolic_std,Creatinine_last,Creatinine_pct_diff,Creatinine_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,0,-0.232289,-0.002066,-1.620202,-1.617145,-0.435513,1.322796,-1.322796,0.941297,-0.16934,...,0.725671,0.594934,0.235795,0.563661,-0.0252,0.175924,0.547815,0.558684,-1.276692,-1.864018
649,0,0.218932,0.466682,-0.086445,-0.100773,-0.163334,1.322796,-1.322796,-0.558473,0.169753,...,0.53552,0.629859,-1.313073,-0.863703,-0.02328,0.449074,-1.117773,-0.958224,-0.357095,-0.03798
1234,0,0.340632,0.124007,1.246644,1.224432,-0.325885,-0.755549,0.755549,-1.440691,0.658048,...,-1.633381,-1.60535,-1.519618,1.039449,-0.0252,0.175924,1.060304,1.064321,-1.276692,0.993322
2492,0,-0.047759,-0.06746,0.92449,0.916621,-0.796321,-0.755549,0.755549,0.764854,-0.16934,...,0.630595,0.629859,1.012112,0.087873,-0.022054,0.437748,-0.220918,-0.19977,0.023815,-0.361454
2956,0,-0.149486,-0.553742,-0.702551,-0.700194,-0.487343,1.322796,-1.322796,0.764854,-0.16934,...,-0.011162,-0.068644,2.731312,-0.387915,-0.023579,0.21446,-0.861529,-0.958224,-0.357095,0.661195


## Pickle all metadata we will need to use later when applying vectorizer

In [214]:
# Persist everything needed to apply the vectorizer later. Each file is
# opened in a `with` block so the handle is flushed and closed right away
# (the bare open() calls never closed their files).
# NOTE: the metadata holds references to the vectorizing functions; pickle
# stores functions by name, so they must be defined/importable under the same
# names wherever these files are loaded.
with open('../all_feature_metadata.pickle', 'wb') as pickle_file:
    pickle.dump(all_feature_metadata, pickle_file)
with open('../train_data_means.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_means, pickle_file)
with open('../train_data_std.pickle', 'wb') as pickle_file:
    pickle.dump(train_data_std, pickle_file)


## Apply model on `train`,  `test` 


In [215]:

# Vectorize and normalize both data sets with the train-based model and write
# each result next to its input file.
for t in ("train", "test"):
    df = pd.read_csv(
        '../%s_data.csv' % t,
        sep='|',
        error_bad_lines=False,
        index_col=False,
        dtype='unicode',
    )
    vectorized, _ = vectorize(df, all_feature_metadata, debug=False)
    normalized, _ = normalize(vectorized, all_feature_metadata,
                              train_data_means, train_data_std)
    # Single pre-formatted string: prints e.g. "train (1777, 62)".
    print("%s %s" % (t, normalized.shape))
    normalized.to_csv('../%s_data_vectorized.csv' % t, sep='|')

normalized.head()

train (1777, 62)
test (600, 62)


Unnamed: 0_level_0,family_ALS_hist_last,weight_pct_diff,weight_mean_slope,weight_mean,weight_median,weight_std,F,M,Age_last,respiratory_rate_pct_diff,...,bp_systolic_mean,bp_systolic_median,bp_systolic_std,Creatinine_last,Creatinine_pct_diff,Creatinine_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,0.300212,0.033383,2.120464,2.078067,-0.146931,-0.755549,0.755549,-0.029142,0.0,...,-0.296387,-0.417895,-0.463418,-0.381457,-0.026243,0.142725,-0.078882,0.069064,-0.340451,0.0
750094,0,1.333537,0.740484,-0.672179,-0.677513,0.258353,1.322796,-1.322796,0.853076,1.387573,...,0.392908,-0.068644,2.323686,-0.381457,-0.026143,-0.248462,-0.513686,-0.445724,0.010049,0.061534
750148,0,0.148987,-1.612401,-1.344999,-1.390337,0.949332,1.322796,-1.322796,1.117741,-0.87651,...,0.793581,0.629859,-1.025403,0.53352,-0.024907,0.692215,0.879956,0.927044,0.201766,0.08929
750195,0,-0.554141,-0.505208,1.422364,1.412358,-0.3771,-0.755549,0.755549,1.117741,-0.16934,...,-0.439,-0.417895,-0.377384,2.466813,-0.0252,-4.275089,2.426941,2.581229,-0.214833,0.209846
750406,0,2.046137,0.686441,0.486274,0.378763,0.991207,1.322796,-1.322796,0.500188,-1.031203,...,-0.545959,-0.487745,-0.593325,1.515237,-0.0252,0.332099,1.700915,1.569957,-0.357095,-0.146617


Test subject by subject, as that's the required mode of operation in production

In [216]:
# Run the pipeline one subject at a time (first five test subjects) and stack
# the resulting rows.
t = "test"
df = pd.read_csv('../' + t + '_data.csv', sep='|', error_bad_lines=False, index_col=False, dtype='unicode')
stack = []
for subj in df.SubjectID.unique()[:5]:
    df_subj = df[df.SubjectID == subj]
    vectorized, _ = vectorize(df_subj, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    stack.append(normalized)
# Concatenate once at the end: appending DataFrame to DataFrame inside the
# loop copies all accumulated rows on every iteration.
stack = pd.concat(stack)

# Single pre-formatted string: prints e.g. "test (5, 62)".
print("%s %s" % (t, stack.shape))
stack.head()

test (5, 62)


Unnamed: 0_level_0,family_ALS_hist_last,weight_pct_diff,weight_mean_slope,weight_mean,weight_median,weight_std,F,M,Age_last,respiratory_rate_pct_diff,...,bp_systolic_mean,bp_systolic_median,bp_systolic_std,Creatinine_last,Creatinine_pct_diff,Creatinine_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750059,0,0.300212,0.033383,2.120464,2.078067,-0.146931,-0.755549,0.755549,-0.029142,0.0,...,-0.296387,-0.417895,-0.463418,-0.381457,-0.026243,0.142725,-0.078882,0.069064,-0.340451,0.0
750094,0,1.333537,0.740484,-0.672179,-0.677513,0.258353,1.322796,-1.322796,0.853076,1.387573,...,0.392908,-0.068644,2.323686,-0.381457,-0.026143,-0.248462,-0.513686,-0.445724,0.010049,0.061534
750148,0,0.148987,-1.612401,-1.344999,-1.390337,0.949332,1.322796,-1.322796,1.117741,-0.87651,...,0.793581,0.629859,-1.025403,0.53352,-0.024907,0.692215,0.879956,0.927044,0.201766,0.08929
750195,0,-0.554141,-0.505208,1.422364,1.412358,-0.3771,-0.755549,0.755549,1.117741,-0.16934,...,-0.439,-0.417895,-0.377384,2.466813,-0.0252,-4.275089,2.426941,2.581229,-0.214833,0.209846
750406,0,2.046137,0.686441,0.486274,0.378763,0.991207,1.322796,-1.322796,0.500188,-1.031203,...,-0.545959,-0.487745,-0.593325,1.515237,-0.0252,0.332099,1.700915,1.569957,-0.357095,-0.146617
