# Distance analysis of the testing data on the training data

In [1]:
import sys
sys.path.append("./src") # append to system path

from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from sklearn.externals import joblib

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.patches import Rectangle
style.use('ggplot')



In [2]:
##Helpers
def mre(true_y,pred_y):
    """Return the element-wise absolute relative error as a percentage.

    Both inputs are converted to float arrays first, so integer inputs are
    not truncated by Python 2 integer division, and plain lists or scalars
    are accepted as well as arrays.

    Parameters
    ----------
    true_y : array-like
        Reference values. A zero entry yields ``inf`` (or ``nan`` when the
        prediction is zero too), following NumPy floating-point division.
    pred_y : array-like
        Predicted values, broadcastable against ``true_y``.

    Returns
    -------
    numpy.ndarray
        ``abs((true_y - pred_y) / true_y) * 100`` with the broadcast shape
        of the inputs (a NumPy scalar when both inputs are scalars).
    """
    # NOTE: mixed 1-d / 2-d representations are not reconciled here; such
    # inputs simply broadcast against each other.
    true_y = np.asarray(true_y, dtype=float)
    pred_y = np.asarray(pred_y, dtype=float)
    return np.abs((true_y - pred_y) / true_y) * 100

def distance_analysis(distance, error, thre_dist=30):
    """Split errors by a distance cut-off and summarise each side.

    ``distance`` and ``error`` are walked in lockstep. An error whose
    distance is ``<= thre_dist`` counts as inside, any other as outside.

    Returns a 4-tuple:
    ``(mean inside error, inside count, mean outside error, outside count)``.
    """
    paired = list(zip(distance, error))
    within = [err for dist, err in paired if dist <= thre_dist]
    beyond = [err for dist, err in paired if not dist <= thre_dist]
    return np.mean(within), len(within), np.mean(beyond), len(beyond)

def run_model(graph_path, model_path, inputs):
    """Restore a saved TensorFlow (1.x API) model and run it on ``inputs``.

    The meta graph is imported into a fresh ``tf.Graph`` on every call.
    Importing into the shared default graph instead would pile up one copy
    of the network per call, and ``tf.get_collection(...)[0]`` would then
    keep returning the tensors of the first import rather than those of
    the model just restored.

    Parameters
    ----------
    graph_path : str
        Path of the ``.meta`` file given to ``tf.train.import_meta_graph``.
    model_path : str
        Checkpoint path given to ``Saver.restore``.
    inputs : array-like
        Values fed to the tensor stored in the graph collection ``"X"``.

    Returns
    -------
    The value of the first tensor in the graph collection ``"pred"``.
    """
    graph = tf.Graph()
    with graph.as_default():
        saver = tf.train.import_meta_graph(graph_path)
        with tf.Session(graph=graph) as sess:
            saver.restore(sess, model_path)
            # Assumes the saved graph registered its input placeholder and
            # output tensor under the collections "X" and "pred".
            X = tf.get_collection("X")[0]
            pred = tf.get_collection("pred")[0]
            return sess.run(pred, feed_dict={X: inputs})


In [3]:
# load descriptors, they are all same for every model
# Load the descriptors; the same descriptor tables are used for every model.
# Each CSV is read with its first row as the header and no index column,
# missing values are replaced by 0, and the result is kept as a NumPy array.
trn_descs = pd.read_csv('../data/descs/train/descs_Mar08_3839_train.csv',header=0,index_col=None).fillna(0).values
tst_descs = pd.read_csv('../data/descs/test/descs_Mar08_3839_test.csv',header=0,index_col=None).fillna(0).values

---
# Distance Analysis on the CED Model

load CED model and split data to training and validation

In [None]:
# Meta-graph and checkpoint paths that are later handed to run_model.
CED_graph_path = "../nets/CED/CED_apr4.meta"
CED_model_path = "../nets/CED/CED_apr4" # the CED checkpoint path is given without a .ckpt suffix

# load target
CED_trn_target = pd.read_csv('../data/target/train/CED_train.csv').values
CED_tst_target = pd.read_csv('../data/target/test/CED_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=42).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, CED_trn_target, test_size=0.1, random_state=42)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [None]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [None]:
# Load the pickled scaler and PCA objects from the CED model directory.
this_scaler = joblib.load('../nets/CED/scaler.pkl')
pca = joblib.load("../nets/CED/pca.pkl")

# Apply the scaler, then the PCA transform, to each descriptor set.
# trn_X and val_X are overwritten with their transformed versions.
trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

# Show the shapes of the transformed arrays.
print trn_X.shape, val_X.shape, tst_X.shape

## Run CED model and give prediction on validation and test sets

In [None]:
# Run the saved CED network on the transformed validation and test descriptors.
pred_val = run_model(CED_graph_path, CED_model_path, val_X)
pred_tst = run_model(CED_graph_path, CED_model_path, tst_X)

### Determine the cut-off threshold of the CED model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [None]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y,pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair
# NOTE(review): r2_score treats its second argument as predictions of the
# first, so this is not a correlation between error and distance -- confirm
# that this is the intended statistic.
print r2_score(val_rel_error,val_distance_to_trn_centroid) 

In [None]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1600.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1600)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

# CED -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 1000 (note: the code cell below actually applies a cut-off of 900)

In [None]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(CED_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

In [None]:
# Split the test errors by distance to the training centroid (cut-off 900)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 900]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 900]

print(np.mean(test_inside))
print(np.mean(test_outside))

---
# Distance Analysis on the acidification Model
load acidification model and split data to training and validation

In [4]:
# Meta-graph and checkpoint paths that are later handed to run_model.
acid_graph_path = "../nets/acidification/acidification_apr4.meta"
acid_model_path = "../nets/acidification/acidification_apr4.ckpt" # unlike the CED path, this one includes the .ckpt suffix

# load target
acid_trn_target = pd.read_csv('../data/target/train/acidification_train.csv').values
acid_tst_target = pd.read_csv('../data/target/test/acidification_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=3).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, acid_trn_target, test_size=0.1, random_state=3)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [5]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [6]:
# Load the pickled scaler and PCA objects from the acidification model directory.
this_scaler = joblib.load('../nets/acidification/scaler.pkl')
pca = joblib.load("../nets/acidification/pca.pkl")

# Apply the scaler, then the PCA transform, to each descriptor set.
# trn_X and val_X are overwritten with their transformed versions.
trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

# Show the shapes of the transformed arrays.
print trn_X.shape, val_X.shape, tst_X.shape

(140, 60) (16, 60) (10, 60)


## Run acid model and give prediction on validation and test sets

In [7]:
# Run the saved acidification network on the transformed validation and test descriptors.
pred_val = run_model(acid_graph_path, acid_model_path, val_X)
pred_tst = run_model(acid_graph_path, acid_model_path, tst_X)

### Determine the cut-off threshold of the acidification model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [9]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair

(array([ 19.5989533]), 726.70978095062276)
(array([ 63.73545822]), 1324.9018379149593)
(array([ 8.205636]), 477.50965293946354)
(array([ 288.01232915]), 4365.3392081366692)
(array([ 109.96836671]), 1583.5360667286047)
(array([ 27.87097497]), 685.49556960005089)
(array([ 15.11944348]), 1212.083866555753)
(array([ 10.74131449]), 603.38853177340934)
(array([ 15.68092157]), 769.14945366495294)
(array([ 80.65550438]), 1551.2710277153853)
(array([ 5.90719386]), 899.58508256123412)
(array([ 119.22190277]), 612.64038846593132)
(array([ 20.05581841]), 1028.9971632341365)
(array([ 4.18117666]), 1265.0349945896066)
(array([ 93.6074054]), 1367.8886445469698)
(array([ 23.70820664]), 821.60955248002597)


In [21]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1600.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1600)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

41.2172184565 15
288.012329151 1


# Acidification -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 1500

In [22]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(acid_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

(array([ 2.52661755]), 1832.6740803026159)
(array([ 9.33849845]), 1224.4477800309464)
(array([ 51.93761335]), 7103.7306508290103)
(array([ 33.1718333]), 1490.3011642220563)
(array([ 35.71065878]), 1595.690150212301)
(array([ 6.95598513]), 601.18938340020838)
(array([ 70.12750741]), 568.29763522452765)
(array([ 214.98881457]), 1514.5900195576428)
(array([ 16.06649507]), 664.43627041106902)
(array([ 18.83128971]), 1054.6147050933641)


In [24]:
# Split the test errors by distance to the training centroid (cut-off 1500)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 1500]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 1500]

print(np.mean(test_inside))
print(np.mean(test_outside))

25.7486015106
76.2909260618


---
# Distance Analysis on the EI99 Model
load EI99 model and split data to training and validation

In [8]:
# Meta-graph and checkpoint paths that are later handed to run_model.
EI99_graph_path = "../nets/EI99/EI99_Apr4.meta"
EI99_model_path = "../nets/EI99/EI99_Apr4.ckpt"

# load target
EI99_trn_target = pd.read_csv('../data/target/train/EI99_train.csv').values
EI99_tst_target = pd.read_csv('../data/target/test/EI99_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=3).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, EI99_trn_target, test_size=0.1, random_state=3)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [9]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [10]:
# Load the pickled scaler and PCA objects from the EI99 model directory.
this_scaler = joblib.load('../nets/EI99/scaler.pkl')
pca = joblib.load("../nets/EI99/pca.pkl")

# Apply the scaler, then the PCA transform, to each descriptor set.
# trn_X and val_X are overwritten with their transformed versions.
trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

# Show the shapes of the transformed arrays.
print trn_X.shape, val_X.shape, tst_X.shape

(140, 60) (16, 60) (10, 60)


## Run EI99 model and give prediction on validation and test sets

In [11]:
# Run the saved EI99 network on the transformed validation and test descriptors.
pred_val = run_model(EI99_graph_path, EI99_model_path, val_X)
pred_tst = run_model(EI99_graph_path, EI99_model_path, tst_X)

0.720546347293


### Determine the cut-off threshold of the EI99 model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [12]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair

(array([ 39.31937675]), 726.70978095062276)
(array([ 5.94827298]), 1324.9018379149593)
(array([ 17.6464651]), 477.50965293946354)
(array([ 216.52407808]), 4365.3392081366692)
(array([ 24.46251764]), 1583.5360667286047)
(array([ 90.02882586]), 685.49556960005089)
(array([ 4.76838963]), 1212.083866555753)
(array([ 8.68301616]), 603.38853177340934)
(array([ 32.99216139]), 769.14945366495294)
(array([ 82.49396767]), 1551.2710277153853)
(array([ 27.97094869]), 899.58508256123412)
(array([ 95.01309854]), 612.64038846593132)
(array([ 11.10305727]), 1028.9971632341365)
(array([ 107.90076875]), 1265.0349945896066)
(array([ 7.79411891]), 1367.8886445469698)
(array([ 28.73395298]), 821.60955248002597)


In [23]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1500.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1500)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

36.7617271533 13
107.826854464 3


# EI99 -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 1500 (the cut-off applied in the code cell below)

In [25]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(EI99_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

(array([ 14.3174893]), 1832.6740803026159)
(array([ 21.5444076]), 1224.4477800309464)
(array([ 39.47579798]), 7103.7306508290103)
(array([ 2.29610592]), 1490.3011642220563)
(array([ 16.45749342]), 1595.690150212301)
(array([ 25.73121992]), 601.18938340020838)
(array([ 62.42594648]), 568.29763522452765)
(array([ 100.82557087]), 1514.5900195576428)
(array([ 5.51424878]), 664.43627041106902)
(array([ 12.27200238]), 1054.6147050933641)


In [26]:
# Split the test errors by distance to the training centroid (cut-off 1500)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 1500]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 1500]

print(np.mean(test_inside))
print(np.mean(test_outside))

21.6306551814
42.7690878953


---
# Distance Analysis on the Human Health Model
load human health model and split data to training and validation

In [5]:
# Meta-graph and checkpoint paths that are later handed to run_model.
humanhealth_graph_path = "../nets/humanhealth/humanhealth_Apr4.meta"
humanhealth_model_path = "../nets/humanhealth/humanhealth_Apr4.ckpt"

# load target
humanhealth_trn_target = pd.read_csv('../data/target/train/humanhealth_train.csv').values
humanhealth_tst_target = pd.read_csv('../data/target/test/humanhealth_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=42).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, humanhealth_trn_target, test_size=0.1, random_state=42)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [6]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [7]:
this_scaler = joblib.load('../nets/EI99/scaler.pkl')
pca = joblib.load("../nets/EI99/pca.pkl")

trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

print trn_X.shape, val_X.shape, tst_X.shape

(140, 60) (16, 60) (10, 60)


## Run Human Health model and give prediction on validation and test sets

In [10]:
# Run the saved human health network on the transformed validation and test descriptors.
pred_val = run_model(humanhealth_graph_path, humanhealth_model_path, val_X)
pred_tst = run_model(humanhealth_graph_path, humanhealth_model_path, tst_X)

# transform data back to normal scale:
# (presumably the network predicts log-transformed targets -- TODO confirm)
pred_val = np.exp(pred_val)
pred_tst = np.exp(pred_tst)

### Determine the cut-off threshold of the human health model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [11]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair

(array([ 54.90085468]), 500.49378446659694)
(array([ 236.15725248]), 1007.6565158688053)
(array([ 94.03266786]), 803.54587724992814)
(array([ 39.70986334]), 1523.2805615903217)
(array([ 894.18621076]), 1526.9313789007265)
(array([ 16.30203474]), 1293.7364911278191)
(array([ 84.52487735]), 1640.6186956122021)
(array([ 12.3675001]), 1513.0601877194447)
(array([ 27.11026268]), 1429.1311717629812)
(array([ 9.08467394]), 769.50035930307922)
(array([ 368.24616562]), 1461.587980855332)
(array([ 58.99871055]), 1209.708920277471)
(array([ 4.55958751]), 1644.242043471613)
(array([ 25.66386729]), 1353.1055901057618)
(array([ 7.20489522]), 430.86896395280297)
(array([ 35.16468816]), 1508.8376607334717)


In [22]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1500.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1500)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

89.7701385047 10
178.418787871 6


# Human Health -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 1500 (the cut-off applied in the code cell below)

In [23]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(humanhealth_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

(array([ 56.74637857]), 1777.7454309418476)
(array([ 42.1703915]), 1284.6081249495462)
(array([ 63.44956898]), 7044.373518591985)
(array([ 20.14703816]), 1453.0274553854483)
(array([ 70.25258055]), 1654.9134079225387)
(array([ 22.0393248]), 652.97321034186893)
(array([ 36.52263734]), 548.74924319970376)
(array([ 252.93679828]), 1575.2986302307593)
(array([ 82.31780406]), 720.05778198983103)
(array([ 257.24634309]), 1114.0978590988623)


In [24]:
# Split the test errors by distance to the training centroid (cut-off 1500)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 1500]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 1500]

print(np.mean(test_inside))
print(np.mean(test_outside))

76.7405898272
110.846331596


---
# Distance Analysis on the Ecosystem quality Model
load ecosystem quality model and split data to training and validation

In [5]:
# Meta-graph and checkpoint paths that are later handed to run_model.
ecosystemquality_graph_path = "../nets/ecosystemquality/ecosystemquality_Apr4.meta"
ecosystemquality_model_path = "../nets/ecosystemquality/ecosystemquality_Apr4.ckpt"

# load target
ecosystemquality_trn_target = pd.read_csv('../data/target/train/ecosystemquality_train.csv').values
ecosystemquality_tst_target = pd.read_csv('../data/target/test/ecosystemquality_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=3).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, ecosystemquality_trn_target, test_size=0.1, random_state=3)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [6]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [7]:
# Load the pickled scaler and PCA objects from the ecosystem quality model directory.
this_scaler = joblib.load('../nets/ecosystemquality/scaler.pkl')
pca = joblib.load("../nets/ecosystemquality/pca.pkl")

# Apply the scaler, then the PCA transform, to each descriptor set.
# trn_X and val_X are overwritten with their transformed versions.
trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

# Show the shapes of the transformed arrays.
print trn_X.shape, val_X.shape, tst_X.shape

(140, 60) (16, 60) (10, 60)


## Run Ecosystem Quality model and give prediction on validation and test sets

In [8]:
# Run the saved ecosystem quality network on the transformed validation and test descriptors.
pred_val = run_model(ecosystemquality_graph_path, ecosystemquality_model_path, val_X)
pred_tst = run_model(ecosystemquality_graph_path, ecosystemquality_model_path, tst_X)

# transform data back to normal scale:
# (presumably the network predicts log-transformed targets -- TODO confirm)
pred_val = np.exp(pred_val)
pred_tst = np.exp(pred_tst)

### Determine the cut-off threshold of the ecosystem quality model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [10]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair

(array([ 51.00001803]), 726.70978095062276)
(array([ 59.73979106]), 1324.9018379149593)
(array([ 58.2637163]), 477.50965293946354)
(array([ 152.96300507]), 4365.3392081366692)
(array([ 68.1045611]), 1583.5360667286047)
(array([ 99.27665262]), 685.49556960005089)
(array([ 20.73529172]), 1212.083866555753)
(array([ 54.06152218]), 603.38853177340934)
(array([ 9.33593674]), 769.14945366495294)
(array([ 90.93853063]), 1551.2710277153853)
(array([ 10.2897959]), 899.58508256123412)
(array([ 80.89269989]), 612.64038846593132)
(array([ 26.6960862]), 1028.9971632341365)
(array([ 40.7644544]), 1265.0349945896066)
(array([ 21.29626287]), 1367.8886445469698)
(array([ 0.38759519]), 821.60955248002597)


In [21]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1500.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1500)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

40.979986391 13
104.002032267 3


# Ecosystem Quality -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 1500 (the cut-off applied in the code cell below)

In [23]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(ecosystemquality_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

(array([ 71.52836218]), 1832.6740803026159)
(array([ 61.31988215]), 1224.4477800309464)
(array([ 38.9898816]), 7103.7306508290103)
(array([ 30.8281118]), 1490.3011642220563)
(array([ 31.92400957]), 1595.690150212301)
(array([ 46.95400279]), 601.18938340020838)
(array([ 214.36169472]), 568.29763522452765)
(array([ 109.04752822]), 1514.5900195576428)
(array([ 33.79156011]), 664.43627041106902)
(array([ 15.85673358]), 1054.6147050933641)


In [25]:
# Split the test errors by distance to the training centroid (cut-off 1500)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 1500]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 1500]

print(np.mean(test_inside))
print(np.mean(test_outside))

67.1853308578
62.8724453917


---
# Distance Analysis on the GWP Model
load GWP model and split data to training and validation

In [4]:
# Meta-graph and checkpoint paths that are later handed to run_model.
GWP_graph_path = "../nets/GWP/GWP_Apr4.meta"
GWP_model_path = "../nets/GWP/GWP_Apr4.ckpt"

# load target
GWP_trn_target = pd.read_csv('../data/target/train/GWP_train.csv').values
GWP_tst_target = pd.read_csv('../data/target/test/GWP_test.csv').values

# Split training and validation sets: 10% of the training rows are held
# out for validation, with a fixed seed (random_state=42).
trn_X, val_X,trn_y, val_y = cross_validation.train_test_split(
    trn_descs, GWP_trn_target, test_size=0.1, random_state=42)

## measure distance between centroid to validation, and centroid to test descriptors BEFORE PCA

In [5]:
# Centroid of the training descriptors: the per-column mean of trn_X.
centroid = np.mean(trn_X, axis=0)

# Euclidean distance from every validation row to that centroid.
val_distance_to_trn_centroid = [
    np.linalg.norm(val_row - centroid) for val_row in val_X]

# Same measurement for the test set, taken on the original descriptors
# (before PCA).
tst_distance_to_trn_centroid = [
    np.linalg.norm(tst_row - centroid) for tst_row in tst_descs]

## do PCA on training, validation and test descriptors

In [6]:
# Load the pickled scaler and PCA objects from the GWP model directory.
this_scaler = joblib.load('../nets/GWP/scaler.pkl')
pca = joblib.load("../nets/GWP/pca.pkl")

# Apply the scaler, then the PCA transform, to each descriptor set.
# trn_X and val_X are overwritten with their transformed versions.
trn_X = pca.transform(this_scaler.transform(trn_X))
val_X = pca.transform(this_scaler.transform(val_X))
tst_X = pca.transform(this_scaler.transform(tst_descs))

# Show the shapes of the transformed arrays.
print trn_X.shape, val_X.shape, tst_X.shape

(140, 60) (16, 60) (10, 60)


## Run GWP model and give prediction on validation and test sets

In [7]:
# Run the saved GWP network on the transformed validation and test descriptors.
pred_val = run_model(GWP_graph_path, GWP_model_path, val_X)
pred_tst = run_model(GWP_graph_path, GWP_model_path, tst_X)

# transform data back to normal scale:
# (presumably the network predicts log-transformed targets -- TODO confirm)
pred_val = np.exp(pred_val)
pred_tst = np.exp(pred_tst)

### Determine the cut-off threshold of the GWP model on the validation set

Doing it manually, for results, see the Excel file in Results folder

In [8]:
# Percentage relative error of every validation prediction.
val_rel_error = mre(val_y, pred_val)

# Print (error, distance-to-centroid) pairs so the cut-off can be picked by hand.
for each_pair in zip(val_rel_error, val_distance_to_trn_centroid):
    print each_pair

(array([ 29.56346684]), 500.49378446659694)
(array([ 90.24584083]), 1007.6565158688053)
(array([ 34.40953342]), 803.54587724992814)
(array([ 11.77165456]), 1523.2805615903217)
(array([ 90.01564917]), 1526.9313789007265)
(array([ 80.23150503]), 1293.7364911278191)
(array([ 9.97891534]), 1640.6186956122021)
(array([ 23.61797926]), 1513.0601877194447)
(array([ 14.47116852]), 1429.1311717629812)
(array([ 549.75203444]), 769.50035930307922)
(array([ 198.74415461]), 1461.587980855332)
(array([ 127.78869327]), 1209.708920277471)
(array([ 47.85836834]), 1644.242043471613)
(array([ 43.89947414]), 1353.1055901057618)
(array([ 20.53744193]), 430.86896395280297)
(array([ 37.25826598]), 1508.8376607334717)


In [19]:
# Mean validation error and number of chemicals inside / outside a
# candidate cut-off distance of 1500.
inside_mean,num_chem_inside, outside_mean,num_chem_outside = distance_analysis(
    val_distance_to_trn_centroid, val_rel_error,1500)
print inside_mean, num_chem_inside
print outside_mean, num_chem_outside

118.964331302 10
36.7501387738 6


# GWP -- Decide whether each testing chemical is inside or outside our AD using the determined threshold distance

using 700

In [20]:
# Percentage relative error of every test prediction.
tst_rel_error = mre(GWP_tst_target, pred_tst)

# Print (error, distance-to-centroid) pairs for the test chemicals.
for each_pair in zip(tst_rel_error, tst_distance_to_trn_centroid):
    print each_pair

(array([ 3.0354557]), 1777.7454309418476)
(array([ 80.46356816]), 1284.6081249495462)
(array([ 10.27296328]), 7044.373518591985)
(array([ 117.36490571]), 1453.0274553854483)
(array([ 27.50327838]), 1654.9134079225387)
(array([ 40.66743847]), 652.97321034186893)
(array([ 89.36864459]), 548.74924319970376)
(array([ 90.35288471]), 1575.2986302307593)
(array([ 25.14356308]), 720.05778198983103)
(array([ 17.06993854]), 1114.0978590988623)


In [26]:
# Split the test errors by distance to the training centroid (cut-off 700)
# and print the mean error on each side.
test_inside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if tst_dist <= 700]
test_outside = [
    tst_err
    for tst_dist, tst_err in zip(tst_distance_to_trn_centroid, tst_rel_error)
    if not tst_dist <= 700]

print(np.mean(test_inside))
print(np.mean(test_outside))

65.01804153
46.4008196956
