## Cross-validation for the number of hidden neurons and hidden layers

In [11]:
import sys
sys.path.append("./src") # append to system path

from sklearn import cross_validation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.patches import Rectangle
style.use('ggplot')

In [12]:
def load_lcia_data(descs_p, target_p):
    """Read the descriptor and target CSV files and return them as arrays.

    Both files are parsed with their first row as the header and without an
    index column. Missing descriptor values are replaced by 0; the target
    table is returned exactly as read.

    Parameters
    ----------
    descs_p : path of the descriptor CSV.
    target_p : path of the target CSV.

    Returns
    -------
    tuple of two numpy arrays: (descriptor values, target values).
    """
    read_opts = dict(header=0, index_col=None)
    descriptors = pd.read_csv(descs_p, **read_opts).fillna(0)
    targets = pd.read_csv(target_p, **read_opts)
    return descriptors.values, targets.values

def mre(true_y,pred_y):
    """Mean relative error between targets and predictions, in percent.

    Computes ``mean(|true_y - pred_y| / |true_y|) * 100`` over all elements.

    Parameters
    ----------
    true_y : array-like of reference values.
    pred_y : array-like of predicted values.

    Returns
    -------
    float : the mean relative error as a percentage. Entries of ``true_y``
    equal to zero follow numpy's division rules (inf/nan), so the result is
    only meaningful for non-zero targets.
    """
    # Work on float arrays so that lists are accepted and integer inputs do
    # not fall back to floor division under Python 2.
    true_arr = np.asarray(true_y, dtype=float)
    pred_arr = np.asarray(pred_y, dtype=float)

    # A column vector (n, 1) paired with a flat vector (n,) would broadcast to
    # an (n, n) matrix and silently produce a wrong mean. When both hold the
    # same number of elements, align the prediction to the target's shape.
    if true_arr.shape != pred_arr.shape and true_arr.size == pred_arr.size:
        pred_arr = pred_arr.reshape(true_arr.shape)

    return np.mean(np.abs((true_arr - pred_arr) / true_arr)) * 100

### Helper functions

In [35]:
def fit_descs(trn_X,val_X,n_components=40):
    """Standardize and PCA-reduce descriptors, fitting on the training data only.

    A StandardScaler followed by a PCA is fit on ``trn_X``; the same fitted
    transforms are then applied to ``val_X`` so that no statistics of the
    validation data are used during fitting.

    Parameters
    ----------
    trn_X : training descriptor matrix.
    val_X : validation descriptor matrix with the same columns as ``trn_X``.
    n_components : value passed to ``PCA(n_components=...)``. Defaults to 40,
        the value that was previously hard-coded.

    Returns
    -------
    (trn_X, val_X) : the transformed training and validation matrices.
    """
    this_scaler = StandardScaler()
    pca = PCA(n_components=n_components)

    # Fit both steps on the training data ...
    trn_X = pca.fit_transform(this_scaler.fit_transform(trn_X))
    # ... and only apply them to the validation data.
    val_X = pca.transform(this_scaler.transform(val_X))
    return trn_X,val_X

def init_weights(shape):
    """Return a tf.Variable of the given shape, initialised with values drawn
    by ``tf.random_normal`` using a standard deviation of 0.1."""
    return tf.Variable(tf.random_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def build_model_and_train(trn_X, trn_y, val_X,val_y,num_epoch,num_hidden_neuron, lr=0.01,beta=0.01,verbose=True):
    """Train a single-hidden-layer regression network and predict the validation set.

    Architecture: input -> sigmoid hidden layer -> linear output. The loss is
    the squared error plus ``beta`` times the L2 penalty on both weight
    matrices, minimised with Adagrad using one training sample per update.

    The model is built inside its own ``tf.Graph`` so that repeated calls
    (for example one per cross-validation fold) do not keep adding nodes to
    the process-wide default graph.

    Parameters
    ----------
    trn_X, trn_y : 2-D training inputs and targets (``shape[1]`` of each is
        used as the input / output width).
    val_X, val_y : validation inputs and targets; ``val_y`` is only used for
        the progress report.
    num_epoch : number of passes over the training set.
    num_hidden_neuron : width of the hidden layer.
    lr : Adagrad learning rate.
    beta : weight of the L2 regularisation term.
    verbose : when true, print R^2 / MRE metrics every 20 epochs.

    Returns
    -------
    numpy array of shape (len(val_X), num_target): predictions for ``val_X``
    after the final epoch.
    """
    # Input and output dimensions
    num_descs = trn_X.shape[1]
    num_target = trn_y.shape[1]

    # Fresh graph per call: everything created below lives in it and is
    # released when the function returns.
    with tf.Graph().as_default():
        # Placeholders
        X = tf.placeholder(tf.float32,shape=[None,num_descs])
        y = tf.placeholder(tf.float32,shape=[None,num_target])

        # Hidden layer
        w1 = init_weights((num_descs,num_hidden_neuron))
        b1 = bias_variable([num_hidden_neuron])
        l1 = tf.nn.sigmoid(tf.add(tf.matmul(X,w1),b1))

        # Output layer (no non-linearity: this is a regression output)
        w_out = init_weights((num_hidden_neuron,num_target))
        b_out = bias_variable([num_target])
        pred = tf.matmul(l1,w_out) + b_out

        # Cost: squared error plus L2 penalty on both weight matrices.
        # The penalty is a scalar, so it broadcasts inside the mean.
        regularizers = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w_out)
        cost = tf.reduce_mean(tf.square(pred - y) + beta*regularizers)

        # Adagrad optimizer
        optimizer = tf.train.AdagradOptimizer(learning_rate = lr).minimize(cost)

        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            for epoch in range(num_epoch):
                # Defined up front so the report below works even when the
                # training set is empty.
                c = float('nan')
                # One sample per optimisation step
                for i in range(len(trn_X)):
                    _, c = sess.run([optimizer,cost], feed_dict={X:trn_X[i:i+1], y:trn_y[i:i+1]})

                # Metrics are only needed for the report, so compute them
                # (one forward pass per data set) only when reporting.
                if verbose and epoch % 20 == 0:
                    trn_pred = sess.run(pred, feed_dict={X:trn_X})
                    val_pred = sess.run(pred, feed_dict={X:val_X})
                    trn_score = r2_score(trn_y,trn_pred)
                    val_score = r2_score(val_y,val_pred)
                    val_mre = mre(val_y,val_pred)
                    # NOTE: the values labelled "Accuracy" are R^2 scores.
                    print("Epoch = %d,Cost = %.2f,Training Accuracy = %.2f, Validation Accuracy = %.2f, Validation MRE =%.2f" % (epoch + 1,c,trn_score,val_score,val_mre))

            # Final prediction on the validation set
            final_pred_val = sess.run(pred,feed_dict={X:val_X})
    return final_pred_val


### For Human Health Model -- Single Layer

### Load Data, no fitting yet

In [25]:
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/humanhealth_train.csv'
X_raw,y_raw = load_lcia_data(descs_p, target_p)

kf = KFold(n_splits=5,random_state=1)
print kf

KFold(n_splits=5, random_state=1, shuffle=False)


In [26]:
# use this to spilt CV dataset
count = 0
all_r2 = []
for trn_idx,val_idx in kf.split(X_raw,y_raw):
    count += 1
    trn_X = X_raw[trn_idx]
    val_X = X_raw[val_idx]

    trn_y = y_raw[trn_idx]
    val_y = y_raw[val_idx]
    
    # Fit descs
    trn_X,val_X = fit_descs(trn_X,val_X)
    # log target
    trn_y = np.log(trn_y)
    val_y = np.log(val_y)
    
    print "Training on Fold ",count
    this_pred = build_model_and_train(trn_X,trn_y,val_X,val_y,
                   num_epoch=500,
                   num_hidden_neuron=512,
                   lr=0.01,
                   beta=0.01,
                   verbose=False)
    this_r2 = r2_score(np.exp(val_y),np.exp(this_pred))
    print this_r2
    all_r2.append(this_r2)

Training on Fold  1
-5.15788718493
Training on Fold  2
-24.7718533955
Training on Fold  3
-9.70725587753
Training on Fold  4
0.509883471332
Training on Fold  5
-0.294592798606


In [27]:
print all_r2
print np.mean(all_r2)

[-5.1578871849268477, -24.771853395545072, -9.7072558775348252, 0.50988347133162637, -0.29459279860647625]
-7.88434115706


### For CED Model, Single Layer

### Load Data

In [33]:
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/humanhealth_train.csv'
X_raw,y_raw = load_lcia_data(descs_p, target_p)

kf = KFold(n_splits=5,random_state=1)
print kf

KFold(n_splits=5, random_state=1, shuffle=False)


In [34]:
# use this to spilt CV dataset
count = 0
all_r2 = []
for trn_idx,val_idx in kf.split(X_raw,y_raw):
    count += 1
    trn_X = X_raw[trn_idx]
    val_X = X_raw[val_idx]

    trn_y = y_raw[trn_idx]
    val_y = y_raw[val_idx]
    
    # Fit descs
    trn_X,val_X = fit_descs(trn_X,val_X)

    print "Training on Fold ",count
    this_pred = build_model_and_train(trn_X,trn_y,val_X,val_y,
                   num_epoch=500,
                   num_hidden_neuron=16,
                   lr=0.01,
                   beta=0.01,
                   verbose=True)
    this_r2 = r2_score(val_y,this_pred)
    print this_r2
    all_r2.append(this_r2)

Training on Fold  1
Epoch = 1,Cost = 0.03,Training Accuracy = -20315.93, Validation Accuracy = -45372.21, Validation MRE =18580.70
Epoch = 6,Cost = 0.02,Training Accuracy = -2791.55, Validation Accuracy = -7430.37, Validation MRE =6813.49
Epoch = 11,Cost = 0.01,Training Accuracy = -1213.73, Validation Accuracy = -4573.80, Validation MRE =4360.39
Epoch = 16,Cost = 0.01,Training Accuracy = -705.51, Validation Accuracy = -3411.16, Validation MRE =3256.25
Epoch = 21,Cost = 0.01,Training Accuracy = -459.36, Validation Accuracy = -2626.00, Validation MRE =2581.95
Epoch = 26,Cost = 0.01,Training Accuracy = -314.30, Validation Accuracy = -1994.49, Validation MRE =2119.22
Epoch = 31,Cost = 0.00,Training Accuracy = -212.03, Validation Accuracy = -1447.05, Validation MRE =1769.11
Epoch = 36,Cost = 0.00,Training Accuracy = -122.41, Validation Accuracy = -963.79, Validation MRE =1495.19
Epoch = 41,Cost = 0.00,Training Accuracy = -65.78, Validation Accuracy = -630.89, Validation MRE =1253.22
Epoch =

KeyboardInterrupt: 

In [None]:
print all_r2
print np.mean(all_r2)