## Run selector.sh
As specified in the challenge, we must run our selector logic subject by subject.

The output_file_path must have the following format:
* First line: the cluster identifier for that patient
* Following lines: the features selected for that specific patient, using the same format as the input data. A maximum of 6 features is allowed.

In [1]:
import pickle
import cPickle
import pandas as pd
import sys
from vectorizing_funcs import *

def _load_pickle(folder, filename, loader=pickle):
    """Unpickle ``folder + '/' + filename`` and close the file afterwards.

    ``loader`` is the module whose ``load`` function is called: ``pickle``
    by default, ``cPickle`` for the clustering model.

    NOTE(security): unpickling can execute arbitrary code, so ``folder``
    must only ever point at trusted model files.
    """
    with open(folder + '/' + filename, 'rb') as handle:
        return loader.load(handle)


# Outside IPython the three paths come from the command line; inside a
# notebook session fall back to the sample subject shipped next to the code.
if "IPython" not in sys.argv[0]:
    if len(sys.argv) < 4:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: %s models_folder input_file output_file" % sys.argv[0])
    models_folder, input_file, output_file = sys.argv[1], sys.argv[2], sys.argv[3]
else:
    models_folder, input_file, output_file = "../", "../19871.txt", "../selected_19871.txt"

# Artifacts read from the models folder: feature metadata, the statistics
# passed to clean_outliers/normalize below, the clustering model and the
# per-cluster feature lists.
all_feature_metadata = _load_pickle(models_folder, 'all_feature_metadata.pickle')
train_data_means = _load_pickle(models_folder, 'all_data_means.pickle')
train_data_std = _load_pickle(models_folder, 'all_data_std.pickle')
train_data_medians = _load_pickle(models_folder, 'all_data_medians.pickle')
train_data_mads = _load_pickle(models_folder, 'all_data_mads.pickle')
clustering_model = _load_pickle(models_folder, 'forest_clustering_model.pickle', loader=cPickle)
best_features_per_cluster = _load_pickle(models_folder, 'best_features_per_cluster.pickle')
   
# Read the pipe-separated input with every column kept as text, so that the
# selected rows are echoed back in the same textual form they arrived in.
# Malformed lines are skipped rather than aborting the run.
df = pd.read_csv(input_file, sep='|', error_bad_lines=False, index_col=False, dtype='unicode')

# Open the output exactly once. Reopening it in "wb" mode for every subject
# (as was done before) truncated the file each time, so only the last
# subject's selection survived. For the single-subject input the challenge
# uses, the resulting file is unchanged.
with open(output_file, "wb") as f:
    # Process every subject present in the input, not just the first three.
    for subj in df.SubjectID.unique():
        df_subj = df[df.SubjectID == subj]

        # Same preprocessing chain as used on the training data: vectorize,
        # clean outliers, then normalize with the training statistics.
        vectorized, _ = vectorize(df_subj, all_feature_metadata)
        cleaned = clean_outliers(vectorized, all_feature_metadata, train_data_medians, train_data_mads, train_data_std)
        normalized, _ = normalize(cleaned, all_feature_metadata, train_data_means, train_data_std)

        # The model's prediction is bucketed against the stored bin edges;
        # the bucket index serves as the cluster id for this subject.
        # NOTE(review): `np` is expected to be provided by the star-import of
        # vectorizing_funcs - confirm, or import numpy explicitly.
        prediction = clustering_model["model"].predict(normalized)
        c = np.digitize(prediction, clustering_model["bins"])[0]

        # First line: the cluster id. Following lines: the subject's raw
        # rows whose feature_name is among the features chosen for that cluster.
        buf = "cluster: %d\n" % c
        selected = df_subj[df_subj.feature_name.isin(best_features_per_cluster[c])]
        buf += selected.to_csv(sep='|', index=False, header=False)

        # Single-argument form prints identically under the Python 2 print
        # statement and the print function.
        print(buf)
        f.write(buf)


cluster: 0
19871|Lab Test|Blood Urea Nitrogen (BUN)|5.1114|mmol/L|55.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|4.7463|mmol/L|0.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|6.2067|mmol/L|198.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|6.2067|mmol/L|329.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|4.7463|mmol/L|378.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|5.8416|mmol/L|247.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|6.2067|mmol/L|70.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|3.87006|mmol/L|107.0
19871|Lab Test|Blood Urea Nitrogen (BUN)|6.5718|mmol/L|135.0
19871|Lab Test|Creatinine|61.88|umol/L|247.0
19871|Lab Test|Creatinine|70.72|umol/L|107.0
19871|Lab Test|Creatinine|70.72|umol/L|55.0
19871|Lab Test|Creatinine|70.72|umol/L|198.0
19871|Lab Test|Creatinine|79.56|umol/L|70.0
19871|Lab Test|Creatinine|79.56|umol/L|0.0
19871|Lab Test|Creatinine|70.72|umol/L|135.0
19871|Lab Test|Creatinine|61.88|umol/L|329.0
19871|Lab Test|Creatinine|61.88|umol/L|378.0
19871|FVC|fvc_percent|23.3874712467199|%|3