In [7]:
# Used for selecting the 6 best features per cluster. 
# We're using mean squared error of each variable vs. the ALSFRS_score, and take the best 6. 

In [8]:
%matplotlib inline

import pandas as pd
import numpy as np
import vectorizer_beta
from sklearn import linear_model

In [9]:
proact_data = pd.read_csv('../train_data.csv', sep = '|', index_col=False)
slope = pd.read_csv('../train_slope.csv', sep = '|', index_col=False)
clusters = pd.read_csv('../train_kmeans_clusters.csv', sep = '|', index_col=False)
X = pd.merge(clusters, proact_data, on = "SubjectID")
Y = pd.merge(clusters, slope, on = "SubjectID")
print Y.shape, X.shape, clusters.shape
X.head()

(1777, 3) (1138647, 7) (1777, 2)


Unnamed: 0,SubjectID,cluster,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,1,Demographic,Gender,F,,0.0
1,533,1,Demographic,Age,65,,0.0
2,533,1,Demographic,Race,White,,0.0
3,533,1,ALSHX,onset_delta,-1023,,0.0
4,533,1,ALSHX,diag_delta,-44,,0.0


In [11]:
from vectorizer_beta import * 
best_features_per_cluster = {}

for c in clusters['cluster'].unique():
    seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c]
    score_per_feature = {}
    for feature_name, func in func_per_feature.iteritems():
        seg_X_fam = func_per_feature[feature_name](seg_X, feature_name)
        seg_Y_fam = pd.merge(seg_Y, seg_X_fam, left_on = 'SubjectID', right_index = True, how='left')
        seg_Y_fam = seg_Y_fam.fillna(seg_Y_fam.mean())
        regr = linear_model.LinearRegression()
        seg_X_fam = seg_Y_fam.drop('ALSFRS_slope', 1)
        regr.fit(seg_X_fam, seg_Y_fam['ALSFRS_slope'])
        score_per_feature[feature_name] = np.mean((regr.predict(seg_X_fam) - seg_Y_fam['ALSFRS_slope']) ** 2)
    print c, score_per_feature
    best_features_per_cluster[c] = sorted(score_per_feature, key=score_per_feature.get)[:6]
best_features_per_cluster

1 {'temperature': 0.3845418955600234, 'bp_systolic': 0.38509927855070814, 'Gender': 0.3850766884486517, 'BMI': 0.3851839536533719, 'respiratory_rate': 0.3838858018882788, 'weight': 0.38225320629934706, 'bp_diastolic': 0.38433298606781463, 'Race': 0.3845406072167956, 'pulse': 0.38341813333131586, 'height': 0.385095625022134}
0 {'temperature': 0.4927732103602252, 'bp_systolic': 0.4734700221636955, 'Gender': 0.5018484286161546, 'BMI': 0.4979560772498027, 'respiratory_rate': 0.4989394493864685, 'weight': 0.4847309042983656, 'bp_diastolic': 0.4964759339194421, 'Race': 0.5028043041938676, 'pulse': 0.4788657280383959, 'height': 0.49402871258917425}
2 {'temperature': 0.09498297073102396, 'bp_systolic': 0.05005047157357084, 'Gender': 0.0676956102366475, 'BMI': 0.03177325441433329, 'respiratory_rate': 0.06799568367792125, 'weight': 0.06434255332019974, 'bp_diastolic': 0.04698032867012381, 'Race': 0.08727686060330674, 'pulse': 0.0564198763696585, 'height': 0.008560059758735887}


{0: ['bp_systolic',
  'pulse',
  'weight',
  'temperature',
  'height',
  'bp_diastolic'],
 1: ['weight',
  'pulse',
  'respiratory_rate',
  'bp_diastolic',
  'Race',
  'temperature'],
 2: ['height', 'BMI', 'bp_diastolic', 'bp_systolic', 'pulse', 'weight']}

In [12]:
import pickle 
with open("best_features_per_cluster.pickle", "wb") as output_file:
    pickle.dump(best_features_per_cluster, output_file)
