## Build all our models with cross-validation


In [6]:
from IPython.display import display

import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from StringIO import StringIO
from sklearn import metrics

from vectorizing_funcs import *
from modeling_funcs import *


In [7]:
df = pd.read_csv('../all_data.csv', sep = '|', error_bad_lines=False, index_col=False, dtype='unicode')
slope = pd.read_csv('../all_slope.csv', sep = '|', index_col="SubjectID")
slope.index = slope.index.astype(str)

print "df: ", df.shape, df.SubjectID.unique().size
print "slope: ", slope.shape, slope.index.unique().size
display(df.head(2))
display(slope.head(2))

df:  (1642927, 6) 2424
slope:  (2424, 1) 2424


Unnamed: 0,SubjectID,form_name,feature_name,feature_value,feature_unit,feature_delta
0,533,Demographic,Gender,F,,0.0
1,533,Demographic,Age,65,,0.0


Unnamed: 0_level_0,ALSFRS_slope
SubjectID,Unnamed: 1_level_1
533,-0.965608
649,-0.921717


In [8]:
# Assemble the feature metadata in two passes: start from the "ts" entries,
# then merge the "dummy" entries into the same mapping (later keys win).
ts_feature_metadata = invert_func_to_features(ts_funcs_to_features, "ts")
dummy_feature_metadata = invert_func_to_features(dummy_funcs_to_features, "dummy")
all_feature_metadata = ts_feature_metadata
all_feature_metadata.update(dummy_feature_metadata)

In [9]:
def apply_on_test(test_data, all_feature_metadata, train_data_means, train_data_std, 
                 clustering_columns, kmeans, best_features_per_cluster, model_per_cluster):
    
    # Vectorizing
    vectorized, _ = vectorize(test_data, all_feature_metadata)
    normalized, _ = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    
    # Clustering
    for_clustering = normalized[clustering_columns]
    clusters = pd.DataFrame(index = for_clustering.index.astype(str))
    clusters['cluster'] = kmeans.predict(for_clustering)
    print sorted([(metrics.adjusted_mutual_info_score(for_clustering[col], kmeans.labels_), col) \
                  for col in for_clustering.columns])[-5:]

    X = normalized.join(clusters)
    
    buf = filter_only_selected_features(test_data.set_index("SubjectID"), clusters, \
                                        best_features_per_cluster)    
    s_df = pd.read_csv(StringIO(buf), sep='|', index_col=False, dtype='unicode')
    s_vectorized, _ = vectorize(s_df, all_feature_metadata)
    s_normalized, _ = normalize(s_vectorized, all_feature_metadata, train_data_means, train_data_std)    
    s_X = s_normalized.join(clusters)    

    pred = s_X.apply(apply_model, args=[model_per_cluster], axis = 1)
    return pred
    

In [10]:
from sklearn.cross_validation import KFold
kf = KFold(df.SubjectID.unique().size, n_folds=2)
fold = 0
for train, test in kf:
    train_data = df[df.SubjectID.isin(df.SubjectID.unique()[train])]
    test_data = df[df.SubjectID.isin(df.SubjectID.unique()[test])]
    print "fold: %d" % fold
    print "train_data: ", train_data.shape, train_data.SubjectID.unique().size, \
            train_data.SubjectID.min(), train_data.SubjectID.max()
    
    # Vectorizing
    all_feature_metadata = learn_to_dummies_model(train_data, all_feature_metadata)
    vectorized, all_feature_metadata = vectorize(train_data, all_feature_metadata)
    train_data_means = vectorized.mean()
    train_data_std = vectorized.std()            
    normalized, all_feature_metadata = normalize(vectorized, all_feature_metadata, train_data_means, train_data_std)
    
    # Clustering
    for_clustering = normalized[clustering_columns]
    kmeans = KMeans(init='k-means++', n_clusters=3)
    #Note we must convert to str to join with slope later
    clusters = pd.DataFrame(index = for_clustering.index.astype(str))
    clusters['cluster'] = kmeans.fit_predict(for_clustering)
    print sorted([(metrics.adjusted_mutual_info_score(for_clustering[col], kmeans.labels_), col) \
                  for col in for_clustering.columns])[-5:]

    X = normalized.join(clusters)
    Y = slope.join(clusters)

    best_features_per_cluster = get_best_features_per_cluster(X, Y, all_feature_metadata)
    print "best_features_per_cluster: ", best_features_per_cluster 
    buf = filter_only_selected_features(train_data.set_index("SubjectID"), clusters, \
                                        best_features_per_cluster)
    
    s_df = pd.read_csv(StringIO(buf), sep='|', index_col=False, dtype='unicode')
    s_vectorized, _ = vectorize(s_df, all_feature_metadata)
    s_normalized, _ = normalize(s_vectorized, all_feature_metadata, train_data_means, train_data_std)    
    s_X = s_normalized.join(clusters)
    
    model_per_cluster = get_model_per_cluster(s_X, Y)

    pred = apply_on_test(train_data, all_feature_metadata, train_data_means, train_data_std, 
                 clustering_columns, kmeans, best_features_per_cluster, model_per_cluster)
    res = pred.join(slope)
    print "Train root mean square error (0 is perfect): %.2f" % np.sqrt(np.mean(
        (res.prediction - res.ALSFRS_slope) ** 2))


    fold += 1



fold: 0
train_data:  (819377, 6) 1212 496835 999482
[(-0.0014835768196526402, 'family_ALS_hist_last'), (-0.0013302522215587797, 'bp_systolic_pct_diff'), (-0.00079670466900564154, 'Albumin_pct_diff'), (-0.0007120963070863113, 'weight_mean_slope'), (-0.00062955340654440701, 'Hispanic'), (-0.00059299419413581258, 'pulse_pct_diff'), (-0.00050797223663625495, 'Other'), (-0.00026364906647803865, 'Creatinine_mean_slope'), (-0.00018832878007630941, 'pulse_mean_slope'), (5.2328079158173903e-05, 'weight_pct_diff'), (0.00010021105448683779, 'bp_systolic_mean_slope'), (0.00017118852854769117, 'bp_diastolic_mean_slope'), (0.00037326922657056875, 'fvc_percent_mean_slope'), (0.00047721861423314802, 'fvc_percent_std'), (0.00055517629089476487, 'fvc_percent_pct_diff'), (0.00056530858245989232, 'Unknown'), (0.00060134899130415902, 'onset_delta_last'), (0.00074902435255601448, 'fvc_percent_median'), (0.00074902435255603367, 'fvc_percent_mean'), (0.00085640056266703742, 'BMI_last'), (0.0012477391985584523

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


ValueError: Incorrect number of features. Got 2 features, expected 83