In [1]:
# Used for predicting ALSFRS_slope (see https://www.synapse.org/#!Synapse:syn2873386/wiki/)
# Assumes the data has already been vectorized and clustered, and that 6 features were selected

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [3]:
# Load the selected training features, one row per subject, indexed by SubjectID.
X = pd.read_csv('../train_selected_features.csv', sep = '|', index_col='SubjectID')
# Drop 'features_list' so X holds only the regressors (the cluster id plus the
# selected feature columns). `axis` is passed by keyword: the positional form
# drop(label, 1) was deprecated and is rejected by recent pandas releases.
X = X.drop('features_list', axis=1)
# Regression target (ALSFRS_slope) per subject, also indexed by SubjectID.
slope = pd.read_csv('../train_slope.csv', sep = '|', index_col='SubjectID')
# Join on the SubjectID index. pd.merge defaults to an inner join, so subjects
# missing from either file are dropped from Y (but not from X).
Y = pd.merge(X, slope, left_index = True, right_index = True)

# Summary statistics of the regressors and the target, displayed as the cell output.
Y.describe()

Unnamed: 0,cluster,feature_ 0,feature_ 1,feature_ 2,feature_ 3,feature_ 4,feature_ 5,ALSFRS_slope
count,1638.0,1638.0,1638.0,1638.0,1638.0,1638.0,1638.0,1638.0
mean,0.736874,0.054192,0.018392,0.100017,0.265568,0.007937,0.95177,-0.727138
std,0.448709,0.256597,0.290147,0.297493,0.44177,0.08876,0.214316,0.630951
min,0.0,-1.761806,-1.733494,-0.428571,0.0,0.0,0.0,-4.055556
25%,0.0,-0.014956,-0.045438,-0.003771,0.0,0.0,1.0,-1.08631
50%,1.0,0.00025,-0.000389,0.0,0.0,0.0,1.0,-0.620748
75%,1.0,0.111111,0.073664,0.033991,1.0,0.0,1.0,-0.250859
max,2.0,3.027778,3.714286,1.0,1.0,1.0,1.0,1.207011


In [4]:
# Fit one ordinary-least-squares model per cluster; models are keyed by cluster id.
model_per_cluster = {}
# Regressor columns are exactly the columns of X (cluster id + selected features).
# NOTE(review): 'cluster' is constant inside each segment, so it carries no signal;
# it is kept because the prediction step feeds the same columns to the models.
feature_columns = list(X.columns)
for c in Y['cluster'].unique():
    # Take features and target from the same merged frame so their rows are
    # guaranteed to line up even if the merge dropped subjects that are in X.
    seg_Y = Y[Y['cluster'] == c]
    seg_X = seg_Y[feature_columns]
    target = seg_Y['ALSFRS_slope']
    print('%s %s' % (c, seg_Y.shape))
    if len(seg_Y) <= len(feature_columns):
        # With no more rows than regressors the fit is (near-)exact, so the
        # training error and score below say nothing about predictive quality.
        print('Warning: cluster %s has only %d rows for %d regressors; fit is under-determined'
              % (c, len(seg_Y), len(feature_columns)))
    # Create linear regression object
    regr = linear_model.LinearRegression()
    # Train the model using the training sets
    regr.fit(seg_X, target)

    # In-sample predictions, reused for the error metric below.
    fitted = regr.predict(seg_X)

    # The coefficients (single formatted string so the output is identical on
    # Python 2 and 3 instead of printing a tuple repr).
    print('Coefficients: \n%s' % (regr.coef_,))
    # The mean square error (computed on the training rows)
    print("Mean square error: %.2f" % np.mean((fitted - target) ** 2))
    # Explained variance score: 1 is perfect prediction (R^2 on the training rows)
    print('Explained variance score: %.2f' % regr.score(seg_X, target))

    model_per_cluster[c] = regr

# Sanity check: first predictions of the LAST cluster fitted above
# (regr and seg_X intentionally refer to the final loop iteration).
print(regr.predict(seg_X)[:6])
print(model_per_cluster)

1 (1195, 8)
('Coefficients: \n', array([ 0.        , -0.05231892, -0.02653737,  0.13616179, -0.01730646,
        0.25493997, -0.02443039]))
Mean square error: 0.41
Explained variance score: 0.00
0 (437, 8)
('Coefficients: \n', array([  0.00000000e+00,   8.83265557e+02,  -1.57861954e+00,
         1.17047755e-01,  -3.84324697e-01,  -7.86331952e-02,
         1.11785330e-01]))
Mean square error: 0.35
Explained variance score: 0.04
2 (6, 8)
('Coefficients: \n', array([  4.34552786e-11,   7.41210184e+02,   1.15438563e+01,
         9.37355322e-01,   6.13154719e-01,  -1.60010563e-01,
         1.60010563e-01]))
Mean square error: 0.00
Explained variance score: 1.00
[-1.08630952 -1.00274725 -0.91067864 -0.09907709 -0.73293173 -0.89133089]
{0: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False), 1: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False), 2: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)}


In [5]:
def calc(x):
    """Predict the target for one subject row using its cluster's model.

    Parameters
    ----------
    x : pandas.Series
        A single row containing a 'cluster' entry, a 'features_list' entry
        (dropped before prediction) and the regressor values. Every entry
        other than 'features_list' is passed to the model, so they must be
        numeric and match the columns the model was fitted on.

    Returns
    -------
    float
        The prediction of ``model_per_cluster[x['cluster']]`` for this row.

    Raises
    ------
    KeyError
        If no model exists for the row's cluster, or 'cluster' /
        'features_list' is missing from the row.
    """
    model = model_per_cluster[x['cluster']]
    # A row taken from a frame with a text column has object dtype, and
    # scikit-learn estimators expect a 2-D (n_samples, n_features) numeric
    # array: cast to float and reshape the row into a one-sample matrix.
    features = x.drop('features_list').astype(float).values.reshape(1, -1)
    # predict returns one value per sample; unwrap the single prediction.
    pred = float(model.predict(features)[0])
    return pred

# Score both splits with the per-cluster models and write the predictions to disk.
for t in ['train', 'test']:
    # Use a dedicated name instead of rebinding X: X is the training feature
    # matrix used by the model-fitting cell, and overwriting it here would make
    # that cell fail (or silently fit on the wrong frame) when re-run.
    split_features = pd.read_csv('../' + t + '_selected_features.csv', sep = '|', index_col='SubjectID')
    # Row-wise prediction; calc selects the model from each row's 'cluster' value.
    split_features['prediction'] = split_features.apply(calc, axis = 1)
    # Persist only the prediction plus the identifying columns (SubjectID is the index).
    split_features.to_csv('../' + t + '_prediction.csv',sep='|', columns=['prediction', 'cluster', 'features_list'])
