## Cross-validation for the number of hidden neurons and hidden layers -- Two Layers

In [6]:
import sys
sys.path.append("./src") # append to system path

from sklearn import cross_validation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.patches import Rectangle
style.use('ggplot')

In [7]:
def load_lcia_data(descs_p, target_p):
    """Read the descriptor and target CSV files and return them as arrays.

    Both files are parsed with the first row as the header and no index
    column. Missing descriptor values are replaced by 0; the target table
    is returned as read.

    Returns a ``(descriptors, targets)`` tuple of NumPy arrays.
    """
    descriptors = pd.read_csv(descs_p, header=0, index_col=None).fillna(0)
    targets = pd.read_csv(target_p, header=0, index_col=None)
    return descriptors.values, targets.values

def mre(true_y, pred_y):
    """Mean relative error between targets and predictions, in percent.

    Computes ``mean(|true_y - pred_y| / |true_y|) * 100``.

    Parameters
    ----------
    true_y : array-like
        Reference values. A zero entry produces ``inf``/``nan`` in the
        result, following NumPy's division semantics.
    pred_y : array-like
        Predicted values. May differ from ``true_y`` only by length-1 axes
        (for example ``(n, 1)`` versus ``(n,)``); it is then reshaped to
        match ``true_y``.

    Returns
    -------
    float
        The mean relative error as a percentage.
    """
    # Work in floating point so integer inputs never hit floor division
    # (Python 2 without `from __future__ import division`) and plain lists work.
    true_y = np.asarray(true_y, dtype=float)
    pred_y = np.asarray(pred_y, dtype=float)

    # An (n, 1) array against an (n,) array would broadcast to an (n, n)
    # matrix and silently return a wrong error; align the shapes first.
    if (true_y.shape != pred_y.shape
            and np.squeeze(true_y).shape == np.squeeze(pred_y).shape):
        pred_y = pred_y.reshape(true_y.shape)

    return np.mean(np.abs((true_y - pred_y) / true_y)) * 100

### Helper functions

In [81]:
def fit_descs(trn_X, val_X, n_components=60):
    """Standardise the descriptors and project them onto PCA components.

    The scaler and the PCA are both fitted on ``trn_X`` only; ``val_X`` is
    transformed with those already-fitted objects.

    Parameters
    ----------
    trn_X : array-like
        Training descriptors, used for fitting and then transformed.
    val_X : array-like
        Validation descriptors, transformed only.
    n_components : int, optional
        Number of principal components to keep. Defaults to 60, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(trn_X, val_X)`` after scaling and PCA projection.
    """
    this_scaler = StandardScaler()
    pca = PCA(n_components=n_components)

    # Fit on the training part only
    trn_X = this_scaler.fit_transform(trn_X)
    trn_X = pca.fit_transform(trn_X)

    # Reuse the fitted scaler/PCA for the validation part
    val_X = this_scaler.transform(val_X)
    val_X = pca.transform(val_X)
    return trn_X, val_X

def init_weights(shape):
    """Return a TensorFlow variable of the given shape, initialised from a
    normal distribution with standard deviation 0.1."""
    return tf.Variable(tf.random_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a TensorFlow variable of the given shape with every entry
    initialised to the constant 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def build_model_and_train(trn_X, trn_y, val_X, val_y, num_epoch, num_hidden_neuron,
                          lr=0.01, beta=0.01, verbose=True, print_every=1):
    """Build a network with two sigmoid hidden layers, train it, and predict
    on the validation set.

    Both hidden layers have ``num_hidden_neuron`` units; the output layer is
    linear. The cost is the mean squared error plus ``beta`` times the summed
    L2 loss of the three weight matrices, minimised with Adagrad one training
    sample at a time.

    Parameters
    ----------
    trn_X, trn_y : 2-D arrays
        Training inputs and targets (``shape[1]`` of each sets the input and
        output width).
    val_X, val_y : 2-D arrays
        Validation inputs and targets; used for progress reporting and for
        the returned predictions, not for training.
    num_epoch : int
        Number of passes over the training data.
    num_hidden_neuron : int
        Width of each hidden layer.
    lr : float, optional
        Adagrad learning rate.
    beta : float, optional
        Weight of the L2 regularisation term.
    verbose : bool, optional
        Print progress lines when true.
    print_every : int, optional
        Print a progress line every ``print_every`` epochs (default 1, i.e.
        every epoch). Values below 1 are treated as 1.

    Returns
    -------
    ndarray
        Predictions for ``val_X`` after the last epoch.
    """
    # Input and output dimensions
    num_descs = trn_X.shape[1]
    num_target = trn_y.shape[1]
    print_every = max(int(print_every), 1)

    # Build the model in its own graph: otherwise every call keeps adding
    # nodes to the global default graph, which grows from fold to fold.
    with tf.Graph().as_default():
        # Placeholders
        X = tf.placeholder(tf.float32, shape=[None, num_descs])
        y = tf.placeholder(tf.float32, shape=[None, num_target])

        # First hidden layer
        w1 = init_weights((num_descs, num_hidden_neuron))
        b1 = bias_variable([num_hidden_neuron])
        l1 = tf.nn.sigmoid(tf.add(tf.matmul(X, w1), b1))

        # Second hidden layer
        w2 = init_weights((num_hidden_neuron, num_hidden_neuron))
        b2 = bias_variable([num_hidden_neuron])
        l2 = tf.nn.sigmoid(tf.add(tf.matmul(l1, w2), b2))

        # Output layer (no non-linearity)
        w_out = init_weights((num_hidden_neuron, num_target))
        b_out = bias_variable([num_target])
        pred = tf.matmul(l2, w_out) + b_out

        # Cost: squared error plus L2 penalty on all weight matrices
        regularizers = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w_out)
        cost = tf.reduce_mean(tf.square(pred - y) + beta * regularizers)

        # Adagrad optimizer
        optimizer = tf.train.AdagradOptimizer(learning_rate=lr).minimize(cost)

        init = tf.global_variables_initializer()

        # Cost of the most recent training step; stays NaN if no step ran
        c = float('nan')

        with tf.Session() as sess:
            sess.run(init)
            for epoch in range(num_epoch):
                # One optimisation step per training sample
                for i in range(0, len(trn_X), 1):
                    _, c = sess.run([optimizer, cost],
                                    feed_dict={X: trn_X[i:i + 1], y: trn_y[i:i + 1]})

                # Scores are only needed for the progress line, so they are
                # evaluated only when one is printed, and each prediction
                # is computed once.
                if verbose and epoch % print_every == 0:
                    trn_pred = sess.run(pred, feed_dict={X: trn_X})
                    val_pred = sess.run(pred, feed_dict={X: val_X})
                    trn_score = r2_score(trn_y, trn_pred)
                    val_score = r2_score(val_y, val_pred)
                    val_mre = mre(val_y, val_pred)
                    print("Epoch = %d,Cost = %.2f,Training Accuracy = %.2f, Validation Accuracy = %.2f, Validation MRE =%.2f" % (epoch + 1,c,trn_score,val_score,val_mre))

            # Final prediction on the validation set
            final_pred_val = sess.run(pred, feed_dict={X: val_X})

    return final_pred_val


### Two Layers for CED 

### Load Training data
Use repeated random hold-out splits (`train_test_split`, one split per random state) to create the cross-validation datasets.

In [29]:
# Training descriptors paired with the CED target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/CED_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# One hold-out split is drawn for each seed listed here
random_state = [3, 6]

In [32]:
# Repeated hold-out validation for CED: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=700,
                                      num_hidden_neuron=512,
                                      lr=0.01,
                                      beta=0.001,
                                      verbose=True)
    this_r2 = r2_score(val_y, this_pred)
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  3
Epoch = 1,Cost = 3920.65,Training Accuracy = -0.36, Validation Accuracy = -0.27, Validation MRE =28.02
Epoch = 51,Cost = 3.59,Training Accuracy = 0.88, Validation Accuracy = 0.29, Validation MRE =28.99
Epoch = 101,Cost = 2.44,Training Accuracy = 0.94, Validation Accuracy = 0.27, Validation MRE =29.39
Epoch = 151,Cost = 1.81,Training Accuracy = 0.96, Validation Accuracy = 0.27, Validation MRE =29.13
Epoch = 201,Cost = 1.73,Training Accuracy = 0.97, Validation Accuracy = 0.30, Validation MRE =28.68
Epoch = 251,Cost = 1.70,Training Accuracy = 0.98, Validation Accuracy = 0.29, Validation MRE =28.79
Epoch = 301,Cost = 1.68,Training Accuracy = 0.98, Validation Accuracy = 0.28, Validation MRE =28.95
Epoch = 351,Cost = 1.66,Training Accuracy = 0.98, Validation Accuracy = 0.28, Validation MRE =28.99
Epoch = 401,Cost = 1.64,Training Accuracy = 0.99, Validation Accuracy = 0.27, Validation MRE =29.00
Epoch = 451,Cost = 1.63,Training Accuracy = 0.99, Validation 

In [33]:
# Validation R^2 of each split, then their mean
# (print() with one argument behaves the same on Python 2 and Python 3)
print(all_r2)
print(np.mean(all_r2))

[0.23669189159824111, 0.38626311001517966]
0.311477500807


### For the Acidification Model

In [44]:
# Training descriptors paired with the acidification target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/acidification_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# One hold-out split is drawn for each seed listed here
random_state = [3, 6]

In [45]:
# Repeated hold-out validation for acidification: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=700,
                                      num_hidden_neuron=16,
                                      lr=0.01,
                                      beta=0.001,
                                      verbose=True)
    this_r2 = r2_score(val_y, this_pred)
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  3
Epoch = 1,Cost = 0.01,Training Accuracy = 0.00, Validation Accuracy = 0.00, Validation MRE =68.72
Epoch = 51,Cost = 0.03,Training Accuracy = 0.51, Validation Accuracy = 0.37, Validation MRE =69.56
Epoch = 101,Cost = 0.06,Training Accuracy = 0.70, Validation Accuracy = 0.52, Validation MRE =69.74
Epoch = 151,Cost = 0.06,Training Accuracy = 0.81, Validation Accuracy = 0.58, Validation MRE =72.36
Epoch = 201,Cost = 0.05,Training Accuracy = 0.86, Validation Accuracy = 0.62, Validation MRE =71.54
Epoch = 251,Cost = 0.04,Training Accuracy = 0.89, Validation Accuracy = 0.63, Validation MRE =70.36
Epoch = 301,Cost = 0.03,Training Accuracy = 0.91, Validation Accuracy = 0.63, Validation MRE =70.07
Epoch = 351,Cost = 0.03,Training Accuracy = 0.93, Validation Accuracy = 0.63, Validation MRE =70.29
Epoch = 401,Cost = 0.03,Training Accuracy = 0.94, Validation Accuracy = 0.63, Validation MRE =70.62
Epoch = 451,Cost = 0.03,Training Accuracy = 0.94, Validation Accur

In [46]:
# Validation R^2 of each split, then their mean
# (print() with one argument behaves the same on Python 2 and Python 3)
print(all_r2)
print(np.mean(all_r2))

[0.62325526230218231, 0.4616879262732746]
0.542471594288


### For GWP -- Two Layers

In [73]:
# Training descriptors paired with the GWP target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/GWP_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# One hold-out split is drawn for each seed listed here
random_state = [1, 6]

In [74]:
# Repeated hold-out validation for GWP: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)
    # Train on the natural log of the target
    trn_y, val_y = np.log(trn_y), np.log(val_y)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=500,
                                      num_hidden_neuron=512,
                                      lr=0.01,
                                      beta=0.01,
                                      verbose=True)
    # Score on the original scale by undoing the log transform
    this_r2 = r2_score(np.exp(val_y), np.exp(this_pred))
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  1
Epoch = 1,Cost = 18.15,Training Accuracy = -2.31, Validation Accuracy = -1.99, Validation MRE =542.28
Epoch = 51,Cost = 2.32,Training Accuracy = 0.71, Validation Accuracy = -0.35, Validation MRE =168.39
Epoch = 101,Cost = 0.91,Training Accuracy = 0.66, Validation Accuracy = -0.39, Validation MRE =198.19
Epoch = 151,Cost = 0.53,Training Accuracy = 0.70, Validation Accuracy = -0.38, Validation MRE =203.18
Epoch = 201,Cost = 0.39,Training Accuracy = 0.73, Validation Accuracy = -0.36, Validation MRE =205.95
Epoch = 251,Cost = 0.31,Training Accuracy = 0.77, Validation Accuracy = -0.34, Validation MRE =204.34
Epoch = 301,Cost = 0.28,Training Accuracy = 0.76, Validation Accuracy = -0.33, Validation MRE =205.38
Epoch = 351,Cost = 0.25,Training Accuracy = 0.79, Validation Accuracy = -0.31, Validation MRE =202.27
Epoch = 401,Cost = 0.24,Training Accuracy = 0.79, Validation Accuracy = -0.31, Validation MRE =201.84
Epoch = 451,Cost = 0.23,Training Accuracy = 0.

In [None]:
# Validation R^2 of each split, then their mean
# (print() with one argument behaves the same on Python 2 and Python 3)
print(all_r2)
print(np.mean(all_r2))

### EI99 Model -- Two Layers

In [66]:
# Training descriptors paired with the EI99 target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/EI99_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# One hold-out split is drawn for each seed listed here
random_state = [3, 6]

In [71]:
# Repeated hold-out validation for EI99: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=500,
                                      num_hidden_neuron=64,
                                      lr=0.01,
                                      beta=0.01,
                                      verbose=True)
    this_r2 = r2_score(val_y, this_pred)
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  3
Epoch = 1,Cost = 0.37,Training Accuracy = 0.07, Validation Accuracy = -0.03, Validation MRE =80.29
Epoch = 51,Cost = 0.06,Training Accuracy = 0.77, Validation Accuracy = 0.68, Validation MRE =58.76
Epoch = 101,Cost = 0.05,Training Accuracy = 0.80, Validation Accuracy = 0.63, Validation MRE =60.02
Epoch = 151,Cost = 0.04,Training Accuracy = 0.82, Validation Accuracy = 0.60, Validation MRE =59.85
Epoch = 201,Cost = 0.04,Training Accuracy = 0.83, Validation Accuracy = 0.59, Validation MRE =59.48
Epoch = 251,Cost = 0.04,Training Accuracy = 0.83, Validation Accuracy = 0.58, Validation MRE =59.10
Epoch = 301,Cost = 0.04,Training Accuracy = 0.84, Validation Accuracy = 0.57, Validation MRE =58.79
Epoch = 351,Cost = 0.04,Training Accuracy = 0.84, Validation Accuracy = 0.56, Validation MRE =58.50
Epoch = 401,Cost = 0.04,Training Accuracy = 0.85, Validation Accuracy = 0.56, Validation MRE =58.21
Epoch = 451,Cost = 0.04,Training Accuracy = 0.85, Validation Accu

In [76]:
# Validation R^2 of each split, then their mean
# (print() with one argument behaves the same on Python 2 and Python 3)
# NOTE(review): this cell's execution count (76) is later than the GWP run (In [74]),
# so the saved output may show GWP results rather than EI99 ones -- re-run the
# notebook in order to confirm.
print(all_r2)
print(np.mean(all_r2))

[-4.4205131915302545, -0.024317593483247801]
-2.22241539251


### Ecosystem Quality Model

In [82]:
# Training descriptors paired with the ecosystem-quality target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/ecosystemquality_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# The model is trained on the natural log of the target
y_raw = np.log(y_raw)

# One hold-out split is drawn for each seed listed here
random_state = [3, 6]

In [84]:
# Repeated hold-out validation for ecosystem quality: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=200,
                                      num_hidden_neuron=512,
                                      lr=0.01,
                                      beta=0.01,
                                      verbose=True)

    # y_raw was log-transformed when loaded; score on the original scale
    this_r2 = r2_score(np.exp(val_y), np.exp(this_pred))
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  3
Epoch = 1,Cost = 13.94,Training Accuracy = 0.34, Validation Accuracy = -0.07, Validation MRE =6.40
Epoch = 2,Cost = 13.06,Training Accuracy = 0.44, Validation Accuracy = -0.00, Validation MRE =5.92
Epoch = 3,Cost = 12.31,Training Accuracy = 0.47, Validation Accuracy = 0.01, Validation MRE =5.90
Epoch = 4,Cost = 11.63,Training Accuracy = 0.51, Validation Accuracy = 0.01, Validation MRE =5.88
Epoch = 5,Cost = 11.01,Training Accuracy = 0.54, Validation Accuracy = 0.02, Validation MRE =5.92
Epoch = 6,Cost = 10.43,Training Accuracy = 0.57, Validation Accuracy = 0.01, Validation MRE =5.97
Epoch = 7,Cost = 9.88,Training Accuracy = 0.59, Validation Accuracy = 0.01, Validation MRE =6.03
Epoch = 8,Cost = 9.38,Training Accuracy = 0.61, Validation Accuracy = 0.00, Validation MRE =6.12
Epoch = 9,Cost = 8.91,Training Accuracy = 0.64, Validation Accuracy = -0.00, Validation MRE =6.19
Epoch = 10,Cost = 8.47,Training Accuracy = 0.66, Validation Accuracy = -0.01, Val

In [None]:
# Validation R^2 of each split, then their mean
# (print() with one argument behaves the same on Python 2 and Python 3)
print(all_r2)
print(np.mean(all_r2))

### Human Health Model

In [85]:
# Training descriptors paired with the human-health target
descs_p = '../data/descs/train/descs_Mar08_3839_train.csv'
target_p = '../data/target/train/humanhealth_train.csv'
X_raw, y_raw = load_lcia_data(descs_p=descs_p, target_p=target_p)

# The model is trained on the natural log of the target
y_raw = np.log(y_raw)

# One hold-out split is drawn for each seed listed here
random_state = [3, 6]

In [86]:
# Repeated hold-out validation for human health: one 90/10 split per random state.
all_r2 = []
for count, each_state in enumerate(random_state, 1):
    trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
        X_raw, y_raw, test_size=0.1, random_state=each_state)

    # Scale and project the descriptors (fitted on the training part)
    trn_X, val_X = fit_descs(trn_X, val_X)

    # Single-argument print() emits the same text on Python 2 and Python 3
    print("Training on Fold  %s Random State:  %s" % (count, each_state))
    this_pred = build_model_and_train(trn_X, trn_y, val_X, val_y,
                                      num_epoch=200,
                                      num_hidden_neuron=512,
                                      lr=0.01,
                                      beta=0.01,
                                      verbose=True)

    # y_raw was log-transformed when loaded; score on the original scale
    this_r2 = r2_score(np.exp(val_y), np.exp(this_pred))
    print(this_r2)
    all_r2.append(this_r2)

Training on Fold  1 Random State:  3
Epoch = 1,Cost = 13.80,Training Accuracy = 0.10, Validation Accuracy = -0.80, Validation MRE =10.61
Epoch = 2,Cost = 12.97,Training Accuracy = 0.29, Validation Accuracy = -0.39, Validation MRE =8.85
Epoch = 3,Cost = 12.23,Training Accuracy = 0.34, Validation Accuracy = -0.26, Validation MRE =7.89
Epoch = 4,Cost = 11.56,Training Accuracy = 0.38, Validation Accuracy = -0.20, Validation MRE =7.44
Epoch = 5,Cost = 10.93,Training Accuracy = 0.42, Validation Accuracy = -0.17, Validation MRE =7.21
Epoch = 6,Cost = 10.33,Training Accuracy = 0.45, Validation Accuracy = -0.15, Validation MRE =7.09
Epoch = 7,Cost = 9.78,Training Accuracy = 0.48, Validation Accuracy = -0.14, Validation MRE =7.01
Epoch = 8,Cost = 9.26,Training Accuracy = 0.50, Validation Accuracy = -0.14, Validation MRE =6.95
Epoch = 9,Cost = 8.77,Training Accuracy = 0.52, Validation Accuracy = -0.14, Validation MRE =6.91
Epoch = 10,Cost = 8.31,Training Accuracy = 0.53, Validation Accuracy = -0.