## Build and cross-validate all our models


In [1]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from StringIO import StringIO
from sklearn import metrics
from sklearn.cross_validation import KFold

from vectorizing_funcs import *
from modeling_funcs import *


In [2]:
df = pd.read_csv('../all_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
slope = pd.read_csv('../all_slope.csv', sep = '|', index_col="SubjectID")
slope.index = slope.index.astype(str)

print "df: ", df.shape, df.SubjectID.unique().size
print "slope: ", slope.shape, slope.index.unique().size
display(df.head(2))
display(slope.head(2))

df:  (1642927, 6) 2424
slope:  (2424, 1) 2424


Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0


Unnamed: 0_level_0,ALSFRS_slope
SubjectID,Unnamed: 1_level_1
533,-0.965608
649,-0.921717


In [3]:
# Feature metadata later passed to train_and_test as `all_feature_metadata`.
# Start from the inversion of the time-series function table, tagged "ts" ...
metadata = invert_func_to_features(ts_funcs_to_features, "ts")
# ... then merge in the inversion of the dummy function table, tagged "dummy".
# NOTE(review): assuming a dict-like result, update() silently overwrites any
# key present in both tables -- presumably they are disjoint; confirm.
metadata.update(invert_func_to_features(dummy_funcs_to_features, "dummy"))

In [4]:
# Columns of the normalized feature frame that are fed to k-means.
# Order matters: the same list is used at fit time and at predict time.
clustering_columns = [
    # race indicator columns
    u'Asian', u'Black', u'Hispanic', u'Other', u'Unknown', u'White',
    # mouth / hands scores
    u'mouth_last', u'mouth_mean_slope',
    u'hands_last', u'hands_mean_slope',
    # onset and overall ALSFRS
    u'onset_delta_last',
    u'ALSFRS_Total_last', u'ALSFRS_Total_mean_slope',
    # body mass and breathing
    u'BMI_last',
    u'fvc_percent_mean_slope',
    u'respiratory_last', u'respiratory_mean_slope',
]

In [5]:
def apply_on_test(test_data, all_feature_metadata, train_data_means, train_data_std, 
                 clustering_columns, kmeans, best_features_per_cluster, model_per_cluster):
    
    # Vectorizing
    vectorized, _ = vectorize(test_data, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    
    print "applying on: ", normalized.shape
    
    # Clustering
    
    for_clustering = normalized[clustering_columns]
    clusters = pd.DataFrame(index = for_clustering.index.astype(str))
    clusters['cluster'] = kmeans.predict(for_clustering)
    print "applied cluster cnt: ", np.bincount(clusters.cluster)

    X = normalized.join(clusters)
    
    buf = filter_only_selected_features(test_data.set_index("SubjectID"), clusters, \
                                        best_features_per_cluster)    
    s_df = pd.read_csv(StringIO(buf), sep='|', index_col=False, dtype='unicode')
    s_vectorized, _ = vectorize(s_df, all_feature_metadata)
    s_normalized, _ = normalize(s_vectorized, all_feature_metadata, train_data_means, train_data_std)    
    input_for_model = s_normalized.join(clusters)    
    
    pred = input_for_model.apply(apply_model, args=[model_per_cluster], axis = 1)
    return input_for_model, pred
    

In [6]:
def train_and_test(df, slope, all_feature_metadata, my_n_clusters=3):
    kf = KFold(df.SubjectID.unique().size, n_folds=3)
    fold, test_rmse, train_rmse = 0, 0.0, 0.0

    for train, test in kf:
        train_data = df[df.SubjectID.isin(df.SubjectID.unique()[train])]
        test_data = df[df.SubjectID.isin(df.SubjectID.unique()[test])]
        print
        print "*"*30
        print "fold: %d" % fold

        # Vectorizing
        all_feature_metadata = learn_to_dummies_model(train_data, all_feature_metadata)
        vectorized, all_feature_metadata = vectorize(train_data, all_feature_metadata)
        train_data_means = vectorized.mean()
        train_data_std = vectorized.std()            
        normalized, all_feature_metadata = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)

        print "train_data: ", normalized.shape
        
        # Clustering
        for_clustering = normalized[clustering_columns]
        kmeans = KMeans(init='k-means++', n_clusters=my_n_clusters)
        #Note we must convert to str to join with slope later
        clusters = pd.DataFrame(index = for_clustering.index.astype(str))
        clusters['cluster'] = kmeans.fit_predict(for_clustering)
        print "train cluster cnt: ", np.bincount(clusters.cluster)

        X = normalized.join(clusters)
        Y = slope.join(clusters)

        best_features_per_cluster = get_best_features_per_cluster(X, Y, all_feature_metadata)
        print "best_features_per_cluster: ", best_features_per_cluster 
        buf = filter_only_selected_features(train_data.set_index("SubjectID"), clusters, \
                                            best_features_per_cluster)

        s_df = pd.read_csv(StringIO(buf), sep='|', index_col=False, dtype='unicode')
        s_vectorized, _ = vectorize(s_df, all_feature_metadata)
        s_normalized, _ = normalize(s_vectorized, all_feature_metadata, train_data_means, train_data_std)    
        s_X = s_normalized.join(clusters)

        model_per_cluster = get_model_per_cluster(s_X, Y)

        input_for_model, pred = apply_on_test(train_data, all_feature_metadata, train_data_means, train_data_std, 
                     clustering_columns, kmeans, best_features_per_cluster, model_per_cluster)
        res = pred.join(slope)
        train_rmse += np.sqrt(np.mean((res.prediction - res.ALSFRS_slope) ** 2))

        input_for_model, pred = apply_on_test(test_data, all_feature_metadata, train_data_means, train_data_std, 
                     clustering_columns, kmeans, best_features_per_cluster, model_per_cluster)
        res = pred.join(slope)
        test_rmse += np.sqrt(np.mean((res.prediction - res.ALSFRS_slope) ** 2))
        
        input_for_model.to_csv('../x_results/test_%d_input_for_model.csv' % fold,sep='|')
        res.to_csv('../x_results/test_%d_prediction.csv' % fold,sep='|')

        fold += 1
        print "fold RMS Error train, test: ", train_rmse / fold, test_rmse / fold
            
    print "X-validated RMS Error train, test: ", train_rmse / kf.n_folds, test_rmse / kf.n_folds



In [7]:
for n_clusters in range(3, 6):
    print "*"*60
    print "*"*60
    train_and_test(df, slope, metadata, n_clusters)

************************************************************
************************************************************

******************************
fold: 0
train_data:  (1616, 157)
train cluster cnt:  [1030   78  508]
best_features_per_cluster:  {0: ['Age', 'BMI', 'Chloride', 'Gender', 'Race', 'Urine Ph'], 1: ['Age', 'BMI', 'Gender', 'Race', 'family_ALS_hist', 'if_use_Riluzole'], 2: ['BMI', 'Gamma-glutamyltransferase', 'Gender', 'Race', 'family_ALS_hist', 'pulse']}
cluster: 0 size: (1030L,)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s



	 RMS error (0 is perfect): 0.63
	 explained variance score (1 is perfect): 0.00
cluster: 1 size: (78L,)
	 RMS error (0 is perfect): 0.63
	 explained variance score (1 is perfect): 0.03
cluster: 2 size: (508L,)
	 RMS error (0 is perfect): 0.63
	 explained variance score (1 is perfect): 0.00
applying on:  (1616, 157)
applied cluster cnt:  [1031   78  507]
applying on:  (808, 157)
applied cluster cnt:  [501  39 268]
fold RMS Error train, test:  0.633152490872 0.609101670654

******************************
fold: 1
train_data:  (1616, 157)
train cluster cnt:  [1020   84  512]
best_features_per_cluster:  {0: ['ALT(SGPT)', 'BMI', 'Gender', 'Hematocrit', 'Race', 'Total Cholesterol'], 1: ['Age', 'BMI', 'Bilirubin (Total)', 'Race', 'Total Cholesterol', 'family_ALS_hist'], 2: ['BMI', 'Gender', 'Hematocrit', 'Race', 'family_ALS_hist', 'respiratory_rate']}
cluster: 0 size: (1020L,)
	 RMS error (0 is perfect): 0.61
	 explained variance score (1 is perfect): 0.02
cluster: 1 size: (84L,)
	 RMS error