## Build all our models with cross-validation


In [1]:
import pickle, cPickle
from StringIO import StringIO

import numpy as np
import pandas as pd
import scipy.stats
from IPython.display import display
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cross_validation import KFold

from vectorizing_funcs import *
from modeling_funcs import *


In [2]:
df = pd.read_csv('../all_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
slope = pd.read_csv('../all_slope.csv', sep = '|', index_col="SubjectID")
slope.index = slope.index.astype(str)

print "df: ", df.shape, df.SubjectID.unique().size
print "slope: ", slope.shape, slope.index.unique().size
display(df.head(2))
display(slope.head(2))

df:  (1514606, 6) 2205
slope:  (2205, 1) 2205


Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0


Unnamed: 0_level_0,ALSFRS_slope
SubjectID,Unnamed: 1_level_1
533,-0.965608
649,-0.921717


In [3]:
from datetime import datetime

def train_and_test(df, slope, my_n_clusters=2):
    kf = KFold(df.SubjectID.unique().size, n_folds=3)
    fold, test_rmse, train_rmse, fold_test_rmse, fold_train_rmse = 0, 0.0, 0.0, 0.0, 0.0

    for train, test in kf:
        train_data = df[df.SubjectID.isin(df.SubjectID.unique()[train])]
        test_data = df[df.SubjectID.isin(df.SubjectID.unique()[test])]
        print
        print "*"*30
        print "fold: %d" % fold
        tick = datetime.now()
        
        all_feature_metadata, \
                    train_data_means, train_data_std, train_data_medians, train_data_mads, \
                    bins, forest, best_features_per_cluster, model_per_cluster = train_it(train_data, slope, my_n_clusters)

        input_for_model, pred = apply_on_test(train_data, all_feature_metadata, 
                    train_data_means, train_data_std, train_data_medians, train_data_mads,
                    clustering_columns, bins, forest, best_features_per_cluster, model_per_cluster)
        res = pred.join(slope)
        fold_train_rmse = np.sqrt(np.mean((res.prediction - res.ALSFRS_slope) ** 2))

        input_for_model, pred = apply_on_test(test_data, all_feature_metadata, 
                    train_data_means, train_data_std, train_data_medians, train_data_mads, 
                    clustering_columns, bins, forest, best_features_per_cluster, model_per_cluster)
        res = pred.join(slope)
        fold_test_rmse = np.sqrt(np.mean((res.prediction - res.ALSFRS_slope) ** 2))

        input_for_model.to_csv('../x_results/test_%d_input_for_model.csv' % fold,sep='|')
        res.to_csv('../x_results/test_%d_prediction.csv' % fold,sep='|')

        fold += 1
        print "fold RMS Error train, test: ", fold_train_rmse, fold_test_rmse
        print 'pearson correlation r = %.2f ' % scipy.stats.pearsonr(res.prediction, res.ALSFRS_slope)[0]
        train_rmse += fold_train_rmse
        test_rmse += fold_test_rmse

        tock = datetime.now()   
        diff = tock - tick 
        print "minutes for fold: ", diff.seconds / 60

            
    print "X-validated RMS Error train, test: ", train_rmse / kf.n_folds, test_rmse / kf.n_folds



In [4]:
for n_clusters in range(5, 0, -1):
    print "*"*60
    print "*"*60
    train_and_test(df, slope, n_clusters)

************************************************************
************************************************************

******************************
fold: 0
train_data:  (1470, 140) (1470, 1)
train cluster cnt:  [294 294 294 294 294]
cluster: 0 with size: (294, 141) with mean target: -1.07988703458 std: 0.660761490094
best we can do with all features: 0.636340434777
using model: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=60,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)
adding best family: ('fvc_percent', 0.64615075469100747) time: 15.125
adding best family: ('pulse', 0.64264245522858809) time: 17.510999918
adding best family: ('mouth', 0.64045881568729512) time: 18.0130000114
adding best family: ('Creatinine', 0.63901230645177232) time: 18.5739998817
adding

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s



	 RMS error (0 is perfect): 0.63
	 coefficient of determination R^2 = 0.07 
	 pearson correlation r = 0.29 
3 sample predictions:  [-1.04694262 -1.22071783 -1.31534189]
applying on:  (1470, 140)
applied cluster cnt:  [293 294 295 294 294]
applying on:  (735, 140)
applied cluster cnt:  [161 144 153 131 146]
fold RMS Error train, test:  0.514543464529 0.530640884792
pearson correlation r = 0.39 
minutes for fold:  17

******************************
fold: 1
train_data:  (1470, 140) (1470, 1)
train cluster cnt:  [294 294 294 294 294]
cluster: 0 with size: (294, 141) with mean target: -1.1373770988 std: 0.634231076975
best we can do with all features: 0.611119611614
using model: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=60,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=Fal

