In [1]:
# Selects the 6 best features per cluster.
# Assumes the data has already been vectorized and clustered.
# Each variable is scored against ALSFRS_slope with a simple f_regression test, and the 6 highest-scoring variables are kept.

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression


In [3]:
# Load the three training tables. Each file is pipe-delimited and is indexed
# by its SubjectID column.
csv_options = dict(sep='|', index_col='SubjectID')
proact_data = pd.read_csv('../train_data_vectorized.csv', **csv_options)
slope = pd.read_csv('../train_slope.csv', **csv_options)
clusters = pd.read_csv('../train_kmeans_clusters.csv', **csv_options)

# X = features plus the cluster label; Y = X plus the columns of the slope
# table. Both merges align rows on the SubjectID index.
X = proact_data.merge(clusters, left_index=True, right_index=True)
Y = X.merge(slope, left_index=True, right_index=True)

# Displayed as the cell output: per-cluster mean of every column.
Y.groupby('cluster').mean()

Unnamed: 0_level_0,bp_diastolic,bp_systolic,height,pulse,respiratory_rate,temperature,weight,F,Asian,Black,Hispanic,Other,White,ALSFRS_slope
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-0.254267,-0.585448,0.000185,-0.000556,0.007145,-0.000288,0.001296,0.359268,0.002288,0.011442,0.002288,0.0,0.95881,-0.721877
1,0.074213,0.142662,0.000274,0.025401,0.003991,-0.000342,0.005415,0.361506,0.009205,0.017573,0.005858,0.00251,0.949791,-0.72876
2,-0.344796,-6.433254,0.000125,-2.271834,0.275022,-0.017081,0.0593,0.333333,0.0,0.166667,0.0,0.0,0.833333,-0.787179


In [4]:
# Fit one SelectKBest(f_regression, k=6) per cluster and keep the fitted
# selectors, keyed by cluster label, so later cells can reuse them.
selector_per_cluster = {}
for c in clusters['cluster'].unique():
    # Take the features and the target from the SAME rows of Y (Y is X merged
    # with the slope table), so both always have the same number of samples
    # even if some subjects in X have no slope entry.
    seg_Y = Y[Y['cluster'] == c]
    # NOTE: X.columns still contains 'cluster' (constant within a segment).
    # It is kept on purpose: calc() passes rows with this exact column layout
    # to selector.transform, so the fitted width must not change here.
    seg_X = seg_Y[X.columns]
    # Single pre-formatted string: prints the same text on Python 2 and 3.
    print("%s %s %s" % (c, seg_X.shape, seg_Y.shape))
    selector = SelectKBest(f_regression, k=6)
    # Only the fitted state is needed; the transformed matrix was unused.
    selector.fit(seg_X, seg_Y['ALSFRS_slope'])
    print(seg_X.columns[selector.get_support()])
    selector_per_cluster[c] = selector

0 (437, 14) (437, 15)
Index([u'height', u'temperature', u'F', u'Asian', u'Hispanic', u'White'], dtype='object')
1 (1195, 14) (1195, 15)
Index([u'bp_diastolic', u'pulse', u'weight', u'F', u'Asian', u'White'], dtype='object')
2 (6, 14) (6, 15)
Index([u'height', u'temperature', u'weight', u'F', u'Black', u'White'], dtype='object')


In [5]:
def calc(x):
    """Reduce one subject row to the features selected for its cluster.

    Parameters
    ----------
    x : pandas.Series
        One row of a features-plus-cluster frame (as passed by
        ``DataFrame.apply(calc, axis=1)``). Its index holds the column names
        and it must contain a ``'cluster'`` entry.

    Returns
    -------
    pandas.Series
        ``'feature_ <i>'`` entries holding the selected values in selector
        order, ``'features_list'`` with the ';'-joined names of the selected
        columns, and ``'cluster'`` as an int.

    Raises
    ------
    KeyError
        If no selector was fitted for the row's cluster.
    """
    # Rows produced by apply(axis=1) are usually upcast to float, so convert
    # the cluster id once and use it for both the lookup and the output.
    cluster_id = int(x['cluster'])
    selector = selector_per_cluster[cluster_id]
    support = selector.get_support()
    # The selector expects a 2-D input: turn the row into a one-row frame
    # (this also keeps the column names attached to the values).
    row_frame = x.to_frame().T
    selected_values = np.asarray(selector.transform(row_frame))[0]
    # The key prefix (including its space) is kept unchanged because it names
    # the columns of the exported CSV.
    d = {"feature_ " + str(i): v for i, v in enumerate(selected_values)}
    # Column names come from the row itself rather than from a global frame,
    # so the function does not depend on which cell ran last.
    d['features_list'] = ';'.join(x.index[support])
    d['cluster'] = cluster_id
    return pd.Series(d)

# For each split, rebuild the features-plus-cluster frame, run calc on every
# row and write the result to '../<split>_selected_features.csv'.
for split_name in ['train', 'test']:
    path_prefix = '../' + split_name
    split_features = pd.read_csv(path_prefix + '_data_vectorized.csv', sep='|', index_col='SubjectID')
    split_clusters = pd.read_csv(path_prefix + '_kmeans_clusters.csv', sep='|', index_col='SubjectID')
    # Rows are aligned on the SubjectID index of both tables.
    cur_X = split_features.merge(split_clusters, left_index=True, right_index=True)
    # axis=1: calc receives one subject's row at a time as a Series.
    selected = cur_X.apply(calc, axis=1)
    selected.to_csv(path_prefix + '_selected_features.csv', sep='|')
    