In [23]:
## Vectorizes the raw data (run it once on train and once on test):
## Pivots it from the initial long feature_name:feature_value form into one feature vector per subject
## scalar_feature_to_dummies - translates a categorical variable into 0/1 dummy variables, one column per category
## timeseries_feature_slope_reduced - mean and std of the slopes of time series variables (variables measured multiple times, at different times)
## timeseries_feature_last_value - takes the last value in the time series
## Filling empty values with means - NOTE that these have to be the train data means

In [24]:
import pandas as pd
import numpy as np

In [25]:
# Load the pipe-delimited training table. Every column is read as text
# (dtype='unicode'), so numeric fields have to be converted before use.
df = pd.read_csv(
    '../train_data.csv',
    sep='|',
    dtype='unicode',
    index_col=False,
    error_bad_lines=False,  # do not raise on malformed lines
)
df.describe()

Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
count,1138647,1138647,1138647,1138635.0,727028,1136734.0
unique,1777,12,1337,36409.0,48,12952.0
top,25028,Lab Test,Q2_Salivation,4.0,mmol/L,0.0
freq,1069,615019,18879,83967.0,169840,74413.0


In [26]:
# Look only at the demographic and vital-sign forms and list the feature
# names they contain.
interesting = df[df.form_name.isin(['Demographic', 'Vitals'])]
print(interesting['feature_name'].unique())

# Registry: feature name -> function that turns that feature into columns.
# The cells below fill it in; the final cell replays it on train and test.
func_per_feature = {}

# One (still column-less) row per subject; feature columns are merged onto it.
vectorized = pd.DataFrame(index=df['SubjectID'].unique())
print(vectorized.shape)

['Gender' 'Age' 'Race' 'bp_diastolic' 'bp_systolic' 'pulse'
 'respiratory_rate' 'temperature' 'weight' 'height' 'BMI']
(1777, 0)


In [27]:
def scalar_feature_to_dummies(df, feature_name):
    """One-hot encode a categorical feature, producing one row per subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least the columns 'SubjectID',
        'feature_name' and 'feature_value'.
    feature_name : str
        Value of 'feature_name' selecting the rows to encode.

    Returns
    -------
    pandas.DataFrame
        Indexed by SubjectID, with one indicator column per distinct
        'feature_value' (columns are named after the values themselves).
    """
    my_slice = df[df.feature_name == feature_name]
    # Reduce explicitly to a single value per subject. The previous identity
    # aggregation (`lambda x: x`) is not a reducer: it breaks as soon as a
    # subject has the record more than once and is rejected by newer pandas.
    # `first()` keeps the first non-null value recorded for each subject.
    value_per_subject = my_slice.groupby('SubjectID')['feature_value'].first()
    return pd.get_dummies(value_per_subject)

# Categorical features: remember which encoder handles each one, then
# left-join its dummy columns onto the per-subject frame so that every
# subject row is kept.
for feature_name in ('Gender', 'Race'):
    func_per_feature[feature_name] = scalar_feature_to_dummies
    vectorized = vectorized.merge(
        scalar_feature_to_dummies(df, feature_name),
        how='left', left_index=True, right_index=True)

vectorized.head()


Unnamed: 0,F,M,Asian,Black,Hispanic,Other,Unknown,White
533,1,0,0,0,0,0,0,1
649,1,0,0,0,0,0,0,1
1234,0,1,0,0,0,0,0,1
2492,0,1,0,0,0,0,0,1
2956,1,0,0,0,0,0,0,1


In [28]:
### Slope of one measurement relative to the first measurement (day 0):
### change in value divided by the time elapsed between the two.
def calc_slope(row):
    """Return the slope between the `_x` and `_y` measurements of a joined row.

    `row` must provide 'feature_delta_int_x' / 'feature_delta_int_y' (times)
    and 'feature_value_float_x' / 'feature_value_float_y' (values).
    """
    elapsed = float(row['feature_delta_int_y']) - float(row['feature_delta_int_x'])
    change = row['feature_value_float_y'] - row['feature_value_float_x']
    return change / elapsed

def timeseries_feature_to_slope(df, feature_name):
    """Pair every follow-up measurement of a feature with the day-0 measurement.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least 'SubjectID', 'feature_name',
        'feature_value' and 'feature_delta' (text or numeric).
    feature_name : str
        Time-series feature to process.

    Returns
    -------
    pandas.DataFrame
        One row per (subject, follow-up measurement). Columns of the day-0
        row carry the suffix `_x`, columns of the follow-up row the suffix
        `_y`, and 'feature_value_slope' holds
        (value_y - value_x) / (delta_y - delta_x) -- the same quantity that
        `calc_slope` computes for a single row. The frame is empty (but still
        has the slope column) when no subject has both kinds of measurement.
    """
    my_slice = df[df.feature_name == feature_name]
    # There were duplicate measurements of timeseries features with the same
    # feature_delta :( -- keep the LAST one of each. Dropping duplicates on the
    # reversed frame (default: keep first) and reversing back does exactly
    # that without the `take_last=` / `keep=` keyword, whose spelling depends
    # on the pandas version.
    my_slice = my_slice.iloc[::-1].drop_duplicates(subset=['SubjectID', 'feature_delta']).iloc[::-1]
    # Work on an explicit copy so the new columns are not written to a view
    # of `df`.
    my_slice = my_slice.copy()
    my_slice['feature_value_float'] = my_slice['feature_value'].astype(float)
    # Go through float first so that both '12' and '12.0' (and already-parsed
    # float deltas) convert cleanly.
    my_slice['feature_delta_int'] = my_slice['feature_delta'].astype(float).astype(int)

    # NOTE(review): 92 presumably bounds the observation window in days --
    # confirm against the data description.
    my_slice_other_visits = my_slice[(my_slice.feature_delta_int > 0) & (my_slice.feature_delta_int < 92)]
    my_slice_first_visit = my_slice[my_slice.feature_delta_int == 0]
    my_slice_j = pd.merge(my_slice_first_visit, my_slice_other_visits, on=['SubjectID', 'feature_name'])

    # Vectorised slope; unlike a row-wise apply this also works on an empty
    # join. The time difference is always > 0 because of the filters above.
    time_delta = (my_slice_j['feature_delta_int_y'].astype(float)
                  - my_slice_j['feature_delta_int_x'].astype(float))
    value_delta = my_slice_j['feature_value_float_y'] - my_slice_j['feature_value_float_x']
    my_slice_j['feature_value_slope'] = value_delta / time_delta
    return my_slice_j

def timeseries_feature_slope_reduced(df, feature_name):
    """Summarise a time-series feature by the mean and std of its slopes.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table accepted by `timeseries_feature_to_slope`.
    feature_name : str
        Time-series feature to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by subject, with the columns '<feature_name>_slope_mean' and
        '<feature_name>_slope_std'. The merges below use pandas' default
        inner join, so only subjects that have at least one slope appear.
    """
    res = pd.DataFrame(index=df['SubjectID'].unique())
    # The slope table is the same for every aggregation, so build it once
    # instead of recomputing it on each loop iteration.
    slope_series = timeseries_feature_to_slope(df, feature_name)
    for func in ['mean', 'std']:
        slope_pivot = pd.pivot_table(slope_series, values = ['feature_value_slope'], index = ['SubjectID'],
                                     columns = ['feature_name'], aggfunc = func)
        # Drop the outer 'feature_value_slope' column level, then give the
        # single remaining column its final name.
        slope_pivot = slope_pivot['feature_value_slope']
        slope_pivot.columns = [feature_name + "_slope_" + func]
        res = pd.merge(res, slope_pivot, right_index=True, left_index=True)
    return res

# Repeatedly measured vitals: register the slope summariser for each one and
# left-join its mean/std slope columns onto the per-subject frame.
for feature_name in ('bp_diastolic', 'bp_systolic', 'pulse', 'respiratory_rate', 'temperature', 'weight'):
    func_per_feature[feature_name] = timeseries_feature_slope_reduced
    vectorized = vectorized.merge(
        timeseries_feature_slope_reduced(df, feature_name),
        how='left', left_index=True, right_index=True)

vectorized.head()

Unnamed: 0,F,M,Asian,Black,Hispanic,Other,Unknown,White,bp_diastolic_slope_mean,bp_diastolic_slope_std,bp_systolic_slope_mean,bp_systolic_slope_std,pulse_slope_mean,pulse_slope_std,respiratory_rate_slope_mean,respiratory_rate_slope_std,temperature_slope_mean,temperature_slope_std,weight_slope_mean,weight_slope_std
533,1,0,0,0,0,0,0,1,-0.055548,0.288329,-0.078031,0.360149,0.036824,0.141342,-0.023547,0.035537,0.007123,0.010011,0.004337,0.020619
649,1,0,0,0,0,0,0,1,-0.0625,0.088388,-0.0625,0.088388,0.223361,0.037674,0.023822,0.010505,,,0.035425,0.03387
1234,0,1,0,0,0,0,0,1,-0.398557,0.615292,-0.116306,0.135799,0.337229,0.128436,0.050794,0.047935,,,0.012698,0.011252
2492,0,1,0,0,0,0,0,1,-0.294118,,-0.588235,,-0.117647,,0.0,,-0.017647,,0.0,
2956,1,0,0,0,0,0,0,1,0.340548,0.460288,-1.194805,0.821063,0.005772,0.376457,0.047619,0.082479,,,-0.032251,0.04045


In [29]:
def timeseries_feature_last_value(df, feature_name):
    """Return each subject's last recorded value of `feature_name` as a float.

    "Last" follows the row order of `df`. The result is a one-column frame
    indexed by SubjectID whose column is named '<feature_name>_last'.
    """
    feature_rows = df.loc[df['feature_name'] == feature_name]
    last_per_subject = feature_rows.groupby('SubjectID').last()
    last_value = last_per_subject[['feature_value']].astype(float)
    return last_value.rename(columns={'feature_value': feature_name + "_last"})

# Features summarised by their last recorded value: register the extractor
# and left-join the resulting '<feature>_last' column.
for feature_name in ('BMI', 'height'):
    func_per_feature[feature_name] = timeseries_feature_last_value
    vectorized = vectorized.merge(
        timeseries_feature_last_value(df, feature_name),
        how='left', left_index=True, right_index=True)
vectorized.head()

Unnamed: 0,F,M,Asian,Black,Hispanic,Other,Unknown,White,bp_diastolic_slope_mean,bp_diastolic_slope_std,...,pulse_slope_mean,pulse_slope_std,respiratory_rate_slope_mean,respiratory_rate_slope_std,temperature_slope_mean,temperature_slope_std,weight_slope_mean,weight_slope_std,BMI_last,height_last
533,1,0,0,0,0,0,0,1,-0.055548,0.288329,...,0.036824,0.141342,-0.023547,0.035537,0.007123,0.010011,0.004337,0.020619,,
649,1,0,0,0,0,0,0,1,-0.0625,0.088388,...,0.223361,0.037674,0.023822,0.010505,,,0.035425,0.03387,0.00299,157.0
1234,0,1,0,0,0,0,0,1,-0.398557,0.615292,...,0.337229,0.128436,0.050794,0.047935,,,0.012698,0.011252,0.003086,175.0
2492,0,1,0,0,0,0,0,1,-0.294118,,...,-0.117647,,0.0,,-0.017647,,0.0,,0.002976,174.0
2956,1,0,0,0,0,0,0,1,0.340548,0.460288,...,0.005772,0.376457,0.047619,0.082479,,,-0.032251,0.04045,,165.0


In [30]:
## Impute missing values with the per-column means. NOTE: these have to be the
## TRAIN data means, so they are kept in `train_data_means` for reuse when the
## test set is vectorized.
train_data_means = vectorized.mean(axis=0)
vectorized = vectorized.fillna(value=train_data_means)
vectorized.head()


Unnamed: 0,F,M,Asian,Black,Hispanic,Other,Unknown,White,bp_diastolic_slope_mean,bp_diastolic_slope_std,...,pulse_slope_mean,pulse_slope_std,respiratory_rate_slope_mean,respiratory_rate_slope_std,temperature_slope_mean,temperature_slope_std,weight_slope_mean,weight_slope_std,BMI_last,height_last
533,1,0,0,0,0,0,0,1,-0.055548,0.288329,...,0.036824,0.141342,-0.023547,0.035537,0.007123,0.010011,0.004337,0.020619,0.002579,170.578946
649,1,0,0,0,0,0,0,1,-0.0625,0.088388,...,0.223361,0.037674,0.023822,0.010505,-0.000389,0.026839,0.035425,0.03387,0.00299,157.0
1234,0,1,0,0,0,0,0,1,-0.398557,0.615292,...,0.337229,0.128436,0.050794,0.047935,-0.000389,0.026839,0.012698,0.011252,0.003086,175.0
2492,0,1,0,0,0,0,0,1,-0.294118,0.291941,...,-0.117647,0.314618,0.0,0.08056,-0.017647,0.026839,0.0,0.036875,0.002976,174.0
2956,1,0,0,0,0,0,0,1,0.340548,0.460288,...,0.005772,0.376457,0.047619,0.082479,-0.000389,0.026839,-0.032251,0.04045,0.002579,165.0


In [42]:
# Calculate the z-score of every column
def calc_all_zscore(vectorized):
    """Add a '<col>_zscore' column for every existing column, in place.

    The z-score is (value - column mean) / column std, using the population
    standard deviation (ddof=0) of the frame that is passed in. A column
    whose std is 0 (all values equal) gets a z-score of 0 instead of the NaN
    that 0/0 would produce; missing values stay missing.

    Parameters
    ----------
    vectorized : pandas.DataFrame
        Frame of numeric columns; it is modified in place.

    Returns
    -------
    None
    """
    # Snapshot the column list: the loop adds columns to the frame.
    for col in list(vectorized.columns):
        col_zscore = col + '_zscore'
        centered = vectorized[col] - vectorized[col].mean()
        spread = vectorized[col].std(ddof=0)
        if spread == 0:
            # Constant column: every value sits exactly at the mean.
            vectorized[col_zscore] = centered * 0.0
        else:
            vectorized[col_zscore] = centered / spread


In [31]:
def parse_feature_delta(fd):
    """Convert a raw 'feature_delta' cell to a float.

    Parameters
    ----------
    fd : str, float, int or None
        Raw cell value. Text may hold several ';'-separated values, in which
        case only the first one is used.

    Returns
    -------
    float or None
        Floats (including NaN) are returned unchanged, other numeric scalars
        are converted with float(), and text is parsed from its first
        ';'-separated field. None is returned when the value cannot be
        parsed.
    """
    if isinstance(fd, float):
        return fd
    if fd is None:
        return None
    if not hasattr(fd, 'split'):
        # Non-text scalar (e.g. an int): convert directly.
        try:
            return float(fd)
        except (TypeError, ValueError):
            return None
    first_value = fd.split(';')[0]
    try:
        return float(first_value)
    except ValueError:
        # Not a number (empty field, free text, ...).
        return None

## Run everything on `test` and `train`

In [45]:
# Replay the registered vectorizers on both data sets and write the results.
# NOTE(review): this cell depends on `func_per_feature` and `train_data_means`
# having been built by the cells above (on the train data).
for t in ("train", "test"):
    df = pd.read_csv('../' + t + '_data.csv', sep='|', dtype='unicode',
                     index_col=False, error_bad_lines=False)
    # Deltas arrive as text; parse them, then keep only measurements taken
    # before day 92 (rows whose delta could not be parsed are dropped too).
    df.loc[:, 'feature_delta'] = df.feature_delta.apply(parse_feature_delta)
    df = df[df.feature_delta < 92]

    # One row per subject, then one left-join per registered feature.
    vectorized = pd.DataFrame(index=df['SubjectID'].unique())
    for feature_name, func in func_per_feature.items():
        vectorized = vectorized.merge(func(df, feature_name), how='left',
                                      left_index=True, right_index=True)

    # Missing values are filled with the TRAIN means for both data sets.
    final_data = vectorized.fillna(train_data_means)
    # NOTE(review): the z-scores are computed from the mean/std of the frame
    # passed in, so the test set is standardised with its own statistics, not
    # the train ones -- confirm this is intended.
    calc_all_zscore(final_data)

    final_data.index.name = 'SubjectID'
    print('%s %s' % (t, final_data.shape))
    final_data.to_csv('../' + t + '_data_vectorized.csv', sep='|')

train (1777, 44)
test (600, 44)


In [37]:
# Show which vectorizer function is registered for each feature name.
func_per_feature

{'BMI': <function __main__.timeseries_feature_last_value>,
 'Gender': <function __main__.scalar_feature_to_dummies>,
 'Race': <function __main__.scalar_feature_to_dummies>,
 'bp_diastolic': <function __main__.timeseries_feature_slope_reduced>,
 'bp_systolic': <function __main__.timeseries_feature_slope_reduced>,
 'height': <function __main__.timeseries_feature_last_value>,
 'pulse': <function __main__.timeseries_feature_slope_reduced>,
 'respiratory_rate': <function __main__.timeseries_feature_slope_reduced>,
 'temperature': <function __main__.timeseries_feature_slope_reduced>,
 'weight': <function __main__.timeseries_feature_slope_reduced>}