## Used for selecting the 6 best features per cluster
* We're using mean squared error of each variable vs. the ALSFRS_score, and take the best 6. 

In [69]:
%matplotlib inline

import pandas as pd
import numpy as np
import pickle
from sklearn import linear_model
from IPython.display import display

In [17]:
vectorized_data = pd.read_csv('../train_data_vectorized.csv', sep='|', index_col=0)
slope = pd.read_csv('../train_slope.csv', sep = '|', index_col=0)
clusters = pd.read_csv('../train_kmeans_clusters.csv', sep = '|', index_col=0)
all_feature_metadata = pickle.load( open('../all_feature_metadata.pickle', 'rb') )

X = clusters.join(vectorized_data)
Y = clusters.join(slope)
X.head()


Unnamed: 0_level_0,cluster,family_ALS_hist_last,weight_mean,weight_median,weight_std,weight_pct_diff,weight_mean_slope,F,M,Age_last,...,bp_diastolic_std,bp_diastolic_pct_diff,bp_diastolic_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,Creatinine_pct_diff,Creatinine_last,Creatinine_mean_slope,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533,2,0,-1.620202,-1.617145,-0.435513,-0.232289,-0.002066,1.322796,-1.322796,0.941297,...,0.345707,0.099808,-0.109689,0.547815,0.558684,-1.276692,-0.0252,0.563661,0.175924,-1.864018
649,0,0,-0.086445,-0.100773,-0.163334,0.218932,0.466682,1.322796,-1.322796,-0.558473,...,-1.053767,-0.019317,-0.127591,-1.117773,-0.958224,-0.357095,-0.02328,-0.863703,0.449074,-0.03798
1234,1,0,1.246644,1.224432,-0.325885,0.340632,0.124007,-0.755549,0.755549,-1.440691,...,0.526932,0.462695,-0.993015,1.060304,1.064321,-1.276692,-0.0252,1.039449,0.175924,0.993322
2492,0,0,0.92449,0.916621,-0.796321,-0.047759,-0.06746,-0.755549,0.755549,0.764854,...,0.254585,-1.489507,-0.72406,-0.220918,-0.19977,0.023815,-0.022054,0.087873,0.437748,-0.361454
2956,2,0,-0.702551,-0.700194,-0.487343,-0.149486,-0.553742,1.322796,-1.322796,0.764854,...,0.127369,-0.186248,0.910351,-0.861529,-0.958224,-0.357095,-0.023579,-0.387915,0.21446,0.661195


In [18]:
best_features_per_cluster = {}

for c in clusters['cluster'].unique():
    seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c]
    seg_Y = seg_Y.fillna(seg_Y.mean())
    
    score_per_feature = {}
    
    for feature, fm in all_feature_metadata.iteritems():
        regr = linear_model.LinearRegression()
        X_feature_fam = seg_X[list(fm["derived_features"])]
        regr.fit(X_feature_fam, seg_Y)
        score_per_feature[feature] = regr.score(X_feature_fam, seg_Y)
    
    best_features_per_cluster[c] = sorted(sorted(score_per_feature, key=score_per_feature.get)[:6])
    
best_features_per_cluster

{0: ['Age', 'BMI', 'Gender', 'family_ALS_hist', 'fvc_percent', 'height'],
 1: ['Age', 'BMI', 'Gender', 'Race', 'height', 'pulse'],
 2: ['Age', 'BMI', 'Gender', 'Race', 'family_ALS_hist', 'height']}

In [19]:
with open("../best_features_per_cluster.pickle", "wb") as output_file:
    pickle.dump(best_features_per_cluster, output_file)


#Apply the selector 
leave only the best features per cluster

In [98]:
for t in ["train", "test"]:
    print t
    df = pd.read_csv('../' + t + '_data.csv', sep = '|', index_col="SubjectID", dtype='unicode')
    print "df", df.shape
    clusters = pd.read_csv('../' + t + '_kmeans_clusters.csv', sep = '|', index_col="SubjectID")
    print "clusters", clusters.shape
    j = df.join(clusters)
    buf, is_first = "", True
    for c, features in best_features_per_cluster.iteritems():
        slice = j[j.cluster == c]
        selected = slice[slice.feature_name.isin(features)]
        print c, slice.shape, selected.shape
        buf += selected.to_csv(sep='|', header = is_first, columns=df.columns)
        is_first = False
    with open('../' + t + '_data_selected.csv','w') as f:
        f.write(buf)


train
df (1138647, 5)
clusters (1777, 1)
0 (391149, 6) (8260, 6)
1 (341999, 6) (7368, 6)
2 (405499, 6) (3051, 6)
test
df (126664, 5)
clusters (600, 1)
0 (43544, 6) (1567, 6)
1 (33947, 6) (1312, 6)
2 (49173, 6) (1099, 6)


## Run selector.sh
As specified in the challenge - we must run our selector logic subject by subject.

The output_file_path must have the following format:
* First line: the cluster identifier for that patient
* Following lines: the selected features selected for that specific single patient, using the same format as the input data. A maximum of 6 features are allowed.

In [10]:
import pickle
import pandas as pd
from vectorizing_funcs import *

all_feature_metadata = pickle.load( open('../all_feature_metadata.pickle', 'rb') )
train_data_means = pickle.load( open('../train_data_means.pickle', 'rb') )
train_data_std = pickle.load( open('../train_data_std.pickle', 'rb') )
clustering_model = pickle.load( open('../clustering_model.pickle', 'rb') )
best_features_per_cluster = pickle.load( open('../best_features_per_cluster.pickle', 'rb') )


t = "test"
df = pd.read_csv('../' + t + '_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
for subj in df.SubjectID.unique()[:3]:
    df_subj = df[df.SubjectID == subj]
    vectorized, _ = vectorize(df_subj, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    cluster_data = normalized[clustering_model["columns"]]
    c = clustering_model["model"].predict(cluster_data)[0]
    buf = "cluster: %d\n" % c
    selected = df_subj[df_subj.feature_name.isin(best_features_per_cluster[c])]
    buf += selected.to_csv(sep='|', index = False, header = False)
    print buf
    with open('../selected_' + subj + ".txt", "wb") as f:
        f.write(buf)


cluster: 0
750059|Demographic|Gender|M||0.0
750059|Demographic|Age|54||0.0
750059|Vitals|height|195.58|cm|0.0
750059|Vitals|BMI|0.00282223686053039||0.0

cluster: 2
750094|Demographic|Gender|F||0.0
750094|Demographic|Age|64||0.0
750094|Demographic|Race|White||0.0
750094|Vitals|height|158.0|cm|0.0
750094|Vitals|height|158.0|cm|8.0

cluster: 2
750148|Demographic|Gender|F||0.0
750148|Demographic|Age|67||0.0
750148|Demographic|Race|White||0.0
750148|Vitals|height|160.0|cm|0.0
750148|Vitals|BMI|0.0022421875||0.0

