In [1]:
# Used for selecting the 6 best features per cluster. 
# Assumes data is vectorized + clustered.
# We're using simple f_regression score of each variable vs. the ALSFRS_score, and take the best 6. 

In [29]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import linear_model


In [6]:
proact_data = pd.read_csv('../train_data_vectorized.csv', sep = '|', index_col='SubjectID')
slope = pd.read_csv('../train_slope.csv', sep = '|', index_col='SubjectID')
clusters = pd.read_csv('../train_kmeans_clusters.csv', sep = '|', index_col='SubjectID')
X = pd.merge(proact_data, clusters, left_index = True, right_index = True)
Y = pd.merge(X, slope, left_index = True, right_index = True)

Y.groupby('cluster').mean()

Unnamed: 0_level_0,mean_bp_diastolic::bp_diastolic,mean_bp_systolic::bp_systolic,mean_pulse::pulse,mean_respiratory_rate::respiratory_rate,mean_temperature::temperature,mean_weight::weight,F::Gender,Asian::Race,Black::Race,Hispanic::Race,Other::Race,White::Race,std_bp_diastolic::bp_diastolic,std_bp_systolic::bp_systolic,std_pulse::pulse,std_respiratory_rate::respiratory_rate,std_temperature::temperature,std_weight::weight,ALSFRS_slope
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,-0.00352,-0.023442,0.015051,0.00727,-0.000151,0.003354,0.362667,0.007333,0.013333,0.005333,0.002,0.954,0.22372,0.307269,0.24574,0.07283,0.026855,0.032651,-0.720537
1,-0.344796,-6.433254,-2.267751,0.277452,-0.017011,0.0593,0.333333,0.0,0.166667,0.0,0.0,0.833333,2.16247,10.379332,3.750531,1.221659,0.048229,0.23036,-0.787179
2,-0.145575,-0.400318,0.08065,0.000151,-0.001343,0.01448,0.340909,0.007576,0.045455,0.0,0.0,0.931818,0.973415,1.655631,0.942842,0.143823,0.035025,0.074933,-0.79942


In [34]:
selector_per_cluster = {}

for c in clusters['cluster'].unique():
    seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c]
    families = np.unique([c.split('::')[1] for c in seg_X.columns if len(c.split('::')) > 1])
    score_per_family = {}
    for family in families:
        mask = [family in col for col in seg_X.columns]
        seg_X_fam = seg_X.loc[:, mask]
        regr = linear_model.LinearRegression()
        # Train the model using the training sets
        regr.fit(seg_X_fam, seg_Y['ALSFRS_slope'])
        score_per_family[family] = np.mean((regr.predict(seg_X_fam) - seg_Y['ALSFRS_slope']) ** 2)
    print c, score_per_family 
    selector = SelectKBest(f_regression_for_families, k=6)
    X_new = selector.fit_transform(seg_X, seg_Y['ALSFRS_slope'])
    print seg_X.columns[selector.get_support()]
    selector_per_cluster[c] = selector

cluster {'weight': 0.3897324857659086, 'temperature': 0.3918421916618879, 'bp_systolic': 0.3905492205009711, 'Gender': 0.3918307031455457, 'respiratory_rate': 0.39156142245485337, 'pulse': 0.39134164946700245, 'bp_diastolic': 0.3912357487618163, 'Race': 0.3903296836596553}
['Gender' 'Race' 'bp_diastolic' 'bp_systolic' 'pulse' 'respiratory_rate'
 'temperature' 'weight']
(array([ 1.08995888,  3.50089175,  0.6487022 ,  0.57029641,  0.15793072,
        1.99946743,  0.20959013,  2.60519321,  1.33789284,  0.59719787,
        1.30327603,  0.58348693,  1.72388759,  1.95479402,  1.50717001,
        0.56228427,  0.14278518,  7.36803441,         nan]), array([ 0.29664876,  0.0615306 ,  0.42070488,  0.45025987,  0.69112546,
        0.15756228,  0.64715524,  0.10672478,  0.24759033,  0.43977108,
        0.25379814,  0.44506908,  0.18939449,  0.16227815,  0.2197643 ,
        0.45345908,  0.70558125,  0.00671553,         nan]))
Index([u'mean_bp_systolic::bp_systolic', u'mean_weight::weight',
       u

In [5]:
def calc(x):
    selector = selector_per_cluster[x['cluster']]
    d = {"feature_ " + str(i): v for i, v in enumerate(selector.transform(x)[0])}
    d['features_list'] = ';'.join(cur_X.columns[selector.get_support()])
    d['cluster'] = int(x['cluster'])
    return pd.Series(d)

for t in ['train', 'test']:
    cur_data = pd.read_csv('../' + t + '_data_vectorized.csv', sep = '|', index_col='SubjectID')
    cur_clusters = pd.read_csv('../' + t + '_kmeans_clusters.csv', sep = '|', index_col='SubjectID')
    cur_X = pd.merge(cur_data, cur_clusters, left_index = True, right_index = True)
    res = cur_X.apply(calc, axis = 1)
    res.to_csv('../' + t + '_selected_features.csv',sep='|')
    