# Distance analysis of the test data relative to the training data

In [1]:
import sys
sys.path.append("./src") # append to system path

from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from sklearn.externals import joblib

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.patches import Rectangle
style.use('ggplot')



In [2]:
##Helpers
def mre(true_y,pred_y):
    """Return the element-wise absolute relative error, in percent.

    Evaluates ``|(true_y - pred_y) / true_y| * 100`` for every element.
    No averaging is done; the caller receives one value per element.

    Args:
        true_y: Reference values (array-like).
        pred_y: Predicted values (array-like), broadcastable against
            ``true_y``.

    Returns:
        Array of absolute percentage errors. Elements where ``true_y`` is
        zero follow NumPy's division rules (``inf``/``nan``).
    """
    # Note: mixed 1d representations are not reconciled here; the inputs are
    # simply combined under NumPy's broadcasting rules.
    # np.true_divide keeps the ratio fractional even when both inputs are
    # integer arrays; plain "/" floor-divides them under Python 2.
    ratio = np.true_divide(np.subtract(true_y, pred_y), true_y)
    return np.abs(ratio) * 100

def distance_analysis(distance, error, thre_dist=30):
    """Split errors by a distance threshold and summarise both groups.

    Args:
        distance: Iterable of distances, one per sample.
        error: Iterable of errors, paired with ``distance`` by position.
        thre_dist: Cut-off distance. Samples with ``distance <= thre_dist``
            count as inside; all others count as outside.

    Returns:
        Tuple ``(inside_mean, n_inside, outside_mean, n_outside)``. The mean
        of an empty group is reported as NaN.
    """
    inside = []
    outside = []
    for each_dist, each_error in zip(distance, error):
        # A sample exactly at the threshold belongs to the inside group.
        bucket = inside if each_dist <= thre_dist else outside
        bucket.append(each_error)

    # np.mean([]) emits a RuntimeWarning before returning NaN, so an empty
    # group is reported as NaN explicitly instead.
    inside_mean = np.mean(inside) if inside else np.nan
    outside_mean = np.mean(outside) if outside else np.nan
    return inside_mean, len(inside), outside_mean, len(outside)

def run_model(graph_path, model_path, inputs):
    """Restore a saved TensorFlow model and return its predictions.

    Args:
        graph_path: Path to the ``.meta`` file describing the graph.
        model_path: Checkpoint path passed to ``Saver.restore``.
        inputs: Values fed to the tensor stored in the "X" collection.

    Returns:
        The evaluated value of the first tensor in the "pred" collection.
    """
    # Import into a fresh graph on every call. Importing into the shared
    # default graph would pile up duplicate nodes and collection entries
    # each time this function is invoked.
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(graph=graph) as sess:
            saver = tf.train.import_meta_graph(graph_path)
            saver.restore(sess, model_path)
            # The input placeholder and the prediction op are looked up
            # through the graph collections "X" and "pred".
            X = tf.get_collection("X")[0]
            pred = tf.get_collection("pred")[0]
            return sess.run(pred, feed_dict={X: inputs})


In [4]:
# Descriptor tables are shared by every model; missing entries are filled with 0.
trn_descs = (
    pd.read_csv('../data/descs/train/descs_Mar08_3839_train.csv',
                header=0, index_col=None)
    .fillna(0)
    .values
)
tst_descs = (
    pd.read_csv('../data/descs/test/descs_Mar08_3839_test.csv',
                header=0, index_col=None)
    .fillna(0)
    .values
)

---
# Distance Analysis on the CED Model

Load the CED model and split the data into training and validation sets.

In [5]:
# Checkpoint path for the CED network (given without a ".ckpt" suffix) and
# the corresponding meta-graph file.
CED_model_path = "../nets/CED/CED_apr4"
CED_graph_path = "../nets/CED/CED_apr4.meta"

# Target values for the training and test sets.
CED_trn_target = pd.read_csv('../data/target/train/CED_train.csv').values
CED_tst_target = pd.read_csv('../data/target/test/CED_test.csv').values

# Hold out 10% of the training data as a validation set; the seed is fixed.
trn_X, val_X, trn_y, val_y = cross_validation.train_test_split(
    trn_descs,
    CED_trn_target,
    test_size=0.1,
    random_state=42
)

## Measure the distance from the training centroid to the validation and test descriptors BEFORE PCA

In [6]:
# Centroid of the training descriptors: the mean along axis 0.
centroid = np.mean(trn_X, axis=0)

# Norm of (sample - centroid) for every validation sample.
val_distance_to_trn_centroid = [
    np.linalg.norm(sample - centroid) for sample in val_X
]

# Same measure for the test set, on the original descriptors (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(sample - centroid) for sample in tst_descs
]

## Apply PCA to the training, validation and test descriptors

In [7]:
# Load the scaler and PCA objects stored alongside the CED network.
pca = joblib.load("../nets/CED/pca.pkl")
this_scaler = joblib.load('../nets/CED/scaler.pkl')

# Each descriptor set is scaled first and then projected with the PCA.
trn_X, val_X, tst_X = [
    pca.transform(this_scaler.transform(descs))
    for descs in (trn_X, val_X, tst_descs)
]

print("%s %s %s" % (trn_X.shape, val_X.shape, tst_X.shape))

(140, 60) (16, 60) (10, 60)


## Run CED model and give prediction on validation and test sets

In [8]:
# Predict on the validation and test features with the restored CED model.
pred_val, pred_tst = [
    run_model(CED_graph_path, CED_model_path, features)
    for features in (val_X, tst_X)
]

### Determine the cut-off threshold of the CED model on the validation set

The threshold is determined manually; for the results, see the Excel file in the Results folder.

In [None]:
# Relative error (in percent) of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print each (relative error, distance to training centroid) pair.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print(each_pair)

# NOTE(review): r2_score treats the distances as predictions of the errors,
# although the two are different quantities; a correlation coefficient may
# be what was intended here. Confirm.
print(r2_score(val_rel_error, val_distance_to_trn_centroid))

In [None]:
# Mean error and sample count inside/outside a cut-off distance of 1600.
val_summary = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error, 1600)
inside_mean, num_chem_inside, outside_mean, num_chem_outside = val_summary

print("%s %s" % (inside_mean, num_chem_inside))
print("%s %s" % (outside_mean, num_chem_outside))

# CED -- Decide whether each test chemical is inside or outside the AD using the determined threshold distance

Using a threshold distance of 1000.

In [11]:
# Relative error (in percent) of every test prediction.
tst_rel_error = mre(CED_tst_target, pred_tst)

# Print each (relative error, distance to training centroid) pair.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print(each_pair)

(array([ 9.65234429]), 1777.7454309418476)
(array([ 41.92993966]), 1284.6081249495462)
(array([ 48.73085337]), 7044.373518591985)
(array([ 43.64604134]), 1453.0274553854483)
(array([ 37.30225129]), 1654.9134079225387)
(array([ 37.54851941]), 652.97321034186893)
(array([ 56.75443195]), 548.74924319970376)
(array([ 76.72156908]), 1575.2986302307593)
(array([ 42.02973612]), 720.05778198983103)
(array([ 22.10796921]), 1114.0978590988623)


In [14]:
# Split the test errors by distance to the training centroid at 900.
# NOTE(review): the accompanying text mentions a threshold of 1000 and the
# validation cell uses 1600, but 900 is applied here. Confirm which is intended.
test_inside = [
    err for dist, err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if dist <= 900
]
test_outside = [
    err for dist, err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not (dist <= 900)
]

print(np.mean(test_inside))
print(np.mean(test_outside))

45.4442291585
40.012995462


---