# Used for predicting ALSFRS_slope
See <https://www.synapse.org/#!Synapse:syn2873386/wiki/>.
We assume the data has already been vectorized and clustered, and that 6 features have been selected per cluster.

In [63]:
import pandas as pd
import numpy as np
import pickle
from sklearn import linear_model
from vectorizing_funcs import *

## Revectorize the selected data
We now reload the metadata and the 6 attributes selected per cluster

In [64]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # presumably produced by earlier steps of this pipeline -- do not point
    # this at files from an untrusted source.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Artefacts saved by earlier steps: feature metadata, the training means/std
# handed to normalize(), and the features chosen for each cluster.
all_feature_metadata = _load_pickle('../all_feature_metadata.pickle')
train_data_means = _load_pickle('../train_data_means.pickle')
train_data_std = _load_pickle('../train_data_std.pickle')
best_features_per_cluster = _load_pickle('../best_features_per_cluster.pickle')


# vectorize() and normalize() come from vectorizing_funcs; each returns a pair
# whose second element is not needed here.
df = pd.read_csv('../train_data_selected.csv', sep='|', index_col=False)
vectorized, _ = vectorize(df, all_feature_metadata)
normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(normalized.shape)
normalized.head()

(1777, 62)


Unnamed: 0_level_0,family_ALS_hist_last,weight_mean,weight_median,weight_std,weight_pct_diff,weight_mean_slope,F,M,Age_last,respiratory_rate_mean,...,bp_diastolic_std,bp_diastolic_pct_diff,bp_diastolic_mean_slope,Creatinine_mean,Creatinine_median,Creatinine_std,Creatinine_pct_diff,Creatinine_last,Creatinine_mean_slope,fvc_percent_pct_diff
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
649,0,0,0,0,0,0,1.322796,-1.322796,-0.558473,0,...,0,0,0,0,0,0,0,0,0,-0.03798
2492,0,0,0,0,0,0,-0.755549,0.755549,0.764854,0,...,0,0,0,0,0,0,0,0,0,-0.361454
3085,0,0,0,0,0,0,-0.755549,0.755549,-1.617135,0,...,0,0,0,0,0,0,0,0,0,0.185905
3551,0,0,0,0,0,0,-0.755549,0.755549,0.059079,0,...,0,0,0,0,0,0,0,0,0,0.230714
4390,0,0,0,0,0,0,-0.755549,0.755549,-1.528913,0,...,0,0,0,0,0,0,0,0,0,0.253


In [65]:
# Regression target and cluster assignment, both indexed by SubjectID.
slope = pd.read_csv('../train_slope.csv', sep='|', index_col="SubjectID")
clusters = pd.read_csv('../train_kmeans_clusters.csv', sep='|', index_col="SubjectID")

# Attach the cluster label to the features and to the target.
# DataFrame.join defaults to a left join on the index, so the row order of
# `normalized` / `slope` is kept.
X = normalized.join(clusters)
Y = slope.join(clusters)

# A format string prints the same text as the old `print a, b, c` statement
# while remaining valid syntax on Python 3.
print('%s %s %s' % (Y.shape, X.shape, clusters.shape))

(1777, 2) (1777, 63) (1777, 1)


## Train a prediction model per cluster

In [66]:
# Maps cluster label -> {"train_data_means": per-column means of that cluster's
# training rows, "model": the LinearRegression fitted on those rows}.
model_per_cluster = {}

for c in clusters.cluster.unique():
    # NOTE: X_cluster still carries the `cluster` column (constant within the
    # cluster). It is kept so the fitted model sees the same columns that
    # calc() later passes to predict().
    X_cluster = X[X.cluster == c]
    # scikit-learn pairs rows of X and y by position, not by index. Pick the
    # targets by X_cluster's SubjectID index so every feature row is matched
    # with its own slope even if train_slope.csv is ordered differently from
    # the feature table.
    Y_cluster = Y.loc[X_cluster.index, 'ALSFRS_slope']
    regr = linear_model.LinearRegression()
    regr.fit(X_cluster, Y_cluster)

    # In-sample diagnostics only: both numbers are computed on the training rows.
    train_pred = regr.predict(X_cluster)
    print('cluster: %d size: %s' % (c, Y_cluster.shape))
    print("Mean square error (0 is perfect): %.2f" % np.mean(
        (train_pred - Y_cluster) ** 2))
    # LinearRegression.score returns the coefficient of determination (R^2).
    print('Explained variance score (1 is perfect): %.2f' % regr.score(X_cluster, Y_cluster))
    print("")
    model_per_cluster[c] = {"train_data_means": X_cluster.mean(), "model": regr}
    
    

cluster: 2 size: (622L,)
Mean square error (0 is perfect): 0.40
Explained variance score (1 is perfect): 0.01

cluster: 0 size: (617L,)
Mean square error (0 is perfect): 0.42
Explained variance score (1 is perfect): 0.01

cluster: 1 size: (538L,)
Mean square error (0 is perfect): 0.33
Explained variance score (1 is perfect): 0.03



In [67]:
# Persist the per-cluster dictionary (fitted model plus the per-cluster feature
# means stored next to it) so it can be reloaded without retraining.
with open("../model_per_cluster.pickle", "wb") as model_file:
    pickle.dump(model_per_cluster, model_file)


## Apply the model to both `train` and `test`

In [68]:
def calc(x):
    """Predict the slope for one subject with the model of that subject's cluster.

    Parameters
    ----------
    x : pandas.Series
        One row of the feature table, including its ``cluster`` entry; the
        row's ``name`` is the SubjectID.

    Returns
    -------
    pandas.Series
        Entries ``SubjectID``, ``prediction``, ``cluster`` and
        ``features_list`` (the cluster's selected features joined with ``;``).

    Raises
    ------
    KeyError
        If the cluster label has no entry in ``model_per_cluster`` or
        ``best_features_per_cluster``.
    """
    # A row taken out of a mostly-float frame carries the label as a float
    # (e.g. 2.0); convert once so both dictionary look-ups use a plain int.
    c = int(x['cluster'])
    model = model_per_cluster[c]['model']
    # Estimators expect 2-D input (n_samples, n_features); a bare 1-D row is
    # rejected by current scikit-learn. A one-row frame also keeps the column
    # labels. The `cluster` column stays in because the models were fitted
    # on rows that included it.
    features = x.to_frame().T
    # predict() returns one value per input row; take that single element
    # rather than calling float() on the whole array.
    pred = float(model.predict(features)[0])
    return pd.Series({'SubjectID': int(x.name), 'prediction': pred, 'cluster': c, 'features_list': ";".join(best_features_per_cluster[c])})

# Run the same pipeline for each split: re-vectorize, normalize with the
# training means/std, attach the cluster labels, predict row by row with
# calc(), and write the result to '../<split>_prediction.csv'.
for t in ['train', 'test']:
    path_prefix = '../' + t

    df = pd.read_csv(path_prefix + '_data_selected.csv', sep='|', index_col=False)
    vectorized, _ = vectorize(df, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)

    # First column of the clusters file becomes the index used for the join.
    clusters = pd.read_csv(path_prefix + '_kmeans_clusters.csv', sep='|', index_col=0)
    X = normalized.join(clusters)

    pred = X.apply(calc, axis=1).set_index('SubjectID')
    pred.to_csv(path_prefix + '_prediction.csv', sep='|')

# `pred` holds the result of the last iteration, i.e. the 'test' split.
pred.head()

Unnamed: 0_level_0,cluster,features_list,prediction
SubjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
750059,0,Age;BMI;Gender;family_ALS_hist;fvc_percent;height,-0.855838
750195,0,Age;BMI;Gender;family_ALS_hist;fvc_percent;height,-0.756017
750406,0,Age;BMI;Gender;family_ALS_hist;fvc_percent;height,-0.571112
755652,0,Age;BMI;Gender;family_ALS_hist;fvc_percent;height,-0.628712
756895,0,Age;BMI;Gender;family_ALS_hist;fvc_percent;height,-0.728397
