In [None]:
# Used for selecting the 6 best features per cluster. 
# Assumes data is vectorized + clustered.
# We use a simple f_regression score of each variable vs. ALSFRS_slope and keep the 6 best.

In [75]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression


In [90]:
# Load the three training inputs. Every file is '|'-separated and indexed by
# SubjectID, so the read options are shared.
read_opts = dict(sep='|', index_col='SubjectID')
proact_data = pd.read_csv('../train_data_vectorized.csv', **read_opts)
slope = pd.read_csv('../train_slope.csv', **read_opts)
clusters = pd.read_csv('../train_kmeans_clusters.csv', **read_opts)

# X = vectorized features + cluster label; Y = X + the slope column(s).
# Both are index-on-index merges (inner join, the merge default).
X = proact_data.merge(clusters, left_index=True, right_index=True)
Y = X.merge(slope, left_index=True, right_index=True)

# Cell output: per-cluster mean of every column.
Y.groupby('cluster').mean()

Unnamed: 0_level_0,bp_diastolic,bp_systolic,height,pulse,respiratory_rate,temperature,weight,F,Asian,Black,Hispanic,Other,White,ALSFRS_slope
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.021828,0.033884,0.000267,0.014863,0.003425,-0.000153,-0.002826,0.355047,0.009306,0.015748,0.005011,0.002147,0.950608,-0.724901
1,-0.114042,-0.255303,0.000186,0.003287,0.003497,-7.7e-05,-0.004754,0.391781,0.00274,0.010959,0.00274,0.0,0.958904,-0.764811
2,-0.062423,-1.828897,0.00015,-0.479034,0.091067,0.000267,0.016666,0.4,0.0,0.1,0.0,0.0,0.9,-0.829612


In [98]:
# Fit one SelectKBest(f_regression, k=6) per cluster against ALSFRS_slope and
# keep the fitted selectors, keyed by cluster label, for the cells below.
selector_per_cluster = {}
for c in clusters['cluster'].unique():
    # Slice features and target out of the same frame (Y) so their rows are
    # aligned by construction, even if merging in `slope` dropped or duplicated
    # subjects relative to X. Assumes Y carries X's columns under their
    # original names (i.e. the merge did not have to add suffixes).
    seg_Y = Y[Y['cluster'] == c]
    seg_X = seg_Y[X.columns]
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('%s %s %s' % (c, seg_X.shape, seg_Y.shape))
    selector = SelectKBest(f_regression, k=6)
    # NOTE(review): seg_X still contains the 'cluster' column, which is constant
    # inside a segment and so carries no information for the regression score.
    # It is left in because calc() hands full rows (including 'cluster') to
    # selector.transform, so the fitted feature width has to match.
    X_new = selector.fit_transform(seg_X, seg_Y['ALSFRS_slope'])
    # Names of the columns this cluster's selector kept.
    print(seg_X.columns[selector.get_support()])
    selector_per_cluster[c] = selector

1 (365, 14) (365, 15)
Index([u'height', u'respiratory_rate', u'temperature', u'F', u'Asian',
       u'White'],
      dtype='object')
0 (1397, 14) (1397, 15)
Index([u'bp_diastolic', u'pulse', u'respiratory_rate', u'F', u'Asian',
       u'Black'],
      dtype='object')
2 (10, 14) (10, 15)
Index([u'bp_systolic', u'height', u'weight', u'F', u'Black', u'White'], dtype='object')


In [141]:
def calc(x):
    """Reduce one subject row to the features selected for its cluster.

    Parameters
    ----------
    x : pandas.Series
        One row of a merged features+cluster frame, as passed by
        ``DataFrame.apply(calc, axis=1)``. Must contain a ``'cluster'`` entry
        and the same columns, in the same order, the selectors were fitted on.

    Returns
    -------
    pandas.Series
        ``'feature_ <i>'`` entries with the selected values, ``'features'``
        with the ';'-joined names of the selected columns, and ``'cluster'``
        as an int.

    Raises
    ------
    KeyError
        If the row's cluster has no entry in ``selector_per_cluster``.
    """
    selector = selector_per_cluster[x['cluster']]
    # transform() works on 2-D (n_samples, n_features) input, so pass the row
    # as a one-row frame rather than a bare 1-D Series, then take that row back.
    selected = selector.transform(x.to_frame().T)[0]
    # NOTE(review): the key really is 'feature_ ' followed by a space
    # ('feature_ 0', ...). Looks like a typo, but it is part of the written CSV
    # header, so it is kept byte-for-byte.
    d = {"feature_ " + str(i): v for i, v in enumerate(selected)}
    # The row's index holds the column labels, so no notebook-level global
    # (previously `cur_X`) is needed to name the selected features.
    d['features'] = ';'.join(x.index[selector.get_support()])
    d['cluster'] = int(x['cluster'])
    return pd.Series(d)

# Run calc() over every row of the train and test splits and write one
# '|'-separated CSV of selected features per split.
for t in ('train', 'test'):
    prefix = '../' + t
    cur_data = pd.read_csv(prefix + '_data_vectorized.csv', sep='|', index_col='SubjectID')
    cur_clusters = pd.read_csv(prefix + '_kmeans_clusters.csv', sep='|', index_col='SubjectID')
    # Features + cluster label, joined on the SubjectID index.
    cur_X = cur_data.merge(cur_clusters, left_index=True, right_index=True)
    res = cur_X.apply(calc, axis=1)
    res.to_csv(prefix + '_selected_features.csv', sep='|')
    

In [94]:
# Whoa! Unexpected behaviour: regressing ALSFRS_slope on itself reported a
# different p-value depending on how the frame had been sliced.
# http://stackoverflow.com/questions/31762453/after-slicing-or-filtering-a-pandas-dataframe-f-regression-behaves-not-as-expec
# Probed in turn on: the full frame, cluster 0 only, and the first 1000 rows.
for seg in (Y, Y[Y['cluster'] == 0], Y[:1000]):
    # [1] picks the p-value array out of f_regression's (F, p-value) result.
    print('%s %s' % (seg.shape, f_regression(seg['ALSFRS_slope'], seg['ALSFRS_slope'])[1]))

(1772, 15) [ 1.]
(1397, 15) [ 0.]
(1000, 15) [ 0.]
