In [5]:
import numpy as np
import pickle
import pandas as pd
import os
from os.path import join
import warnings
warnings.filterwarnings("ignore")
from scipy import stats
from scipy.stats import wilcoxon

# Relative path to the data root; the split pickles and CV index files used
# below are read from <datasets_dir>/splits/<split>/.
datasets_dir = "../../data"

## Loading training, test, and validation data:

In [6]:
split = "secondary"

# The three partitions of the chosen split live side by side as pickles.
split_dir = join(datasets_dir, "splits", split)
data_train = pd.read_pickle(join(split_dir, "training_data.pkl"))
data_test = pd.read_pickle(join(split_dir, "test_data.pkl"))
data_val = pd.read_pickle(join(split_dir, "val_data.pkl"))

# Apply the same column preparation to every partition (each frame is
# modified in place, exactly one partition at a time).
for frame in (data_train, data_test, data_val):
    # Regression target: base-10 logarithm of the raw kcat column.
    frame["log10_kcat"] = np.log10(frame["kcat"])

    # Expose the enzyme representation column under the name "ESM2".
    frame.rename(columns={"Enzyme rep": "ESM2"}, inplace=True)

    # '-' placeholders in the condition columns become NaN.
    for column in ("Temperature", "pH"):
        frame[column] = frame[column].replace("-", np.nan)

    # Encode the enzyme type as an integer code: wildtype -> 1, mutant -> 2.
    for label, code in (("wildtype", 1), ("mutant", 2)):
        frame["Type"] = frame["Type"].replace(label, code)

    # Store the MACCS fingerprint column as strings.
    frame["MACCS FP"] = frame["MACCS FP"].astype(str)

# Displayed as the cell output: number of rows per partition.
len(data_train), len(data_test), len(data_val)

(105, 36, 7)

In [7]:
# Cross-validation fold indices for the "Seed plants" grouping, one entry per fold.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
cv_dir = join(datasets_dir, "splits", split)
train_indices = list(
    np.load(join(cv_dir, "CV_train_indices_Seed plants.npy"), allow_pickle=True)
)
test_indices = list(
    np.load(join(cv_dir, "CV_test_indices_Seed plants.npy"), allow_pickle=True)
)

In [8]:
def _reindex_after_drop(index_lists, dropped):
    """Remove dropped row labels from every fold and renumber the survivors.

    After the rows in ``dropped`` are deleted from the training frame and its
    index is reset, a surviving row that used to sit at label ``e`` ends up at
    ``e - (number of dropped labels smaller than e)``.  Each fold in
    ``index_lists`` is rewritten accordingly (the outer list is updated in
    place; each fold is replaced by a new list/array of the same kind).

    Parameters
    ----------
    index_lists : list
        One sequence of integer row labels per CV fold (Python lists or
        NumPy arrays).
    dropped : sequence of int
        Row labels that are being removed from the training frame.
    """
    if len(dropped) == 0:
        # Nothing to remove: leave the folds exactly as they were loaded.
        return

    # Sorted, de-duplicated labels so searchsorted can count "dropped below e".
    dropped_sorted = np.unique(np.asarray(dropped, dtype=np.int64))

    for fold_no, fold in enumerate(index_lists):
        fold_arr = np.asarray(fold, dtype=np.int64)
        # Filtering through a mask avoids mutating a list while iterating
        # over it, which silently skips the element after each removal.
        kept = fold_arr[~np.isin(fold_arr, dropped_sorted)]
        # Shift every survivor by the count of dropped labels below its
        # ORIGINAL value; decrementing one dropped label at a time in
        # ascending order compares already-shifted values and under-shifts.
        shifted = kept - np.searchsorted(dropped_sorted, kept, side="left")
        index_lists[fold_no] = shifted if isinstance(fold, np.ndarray) else shifted.tolist()


# Test rows without a GNN fingerprint are discarded.
data_test = data_test[~data_test['GNN FP'].isnull()]

# Training rows whose GNN fingerprint is not a NumPy array are treated as missing.
has_gnn_fp = data_train['GNN FP'].apply(lambda x: isinstance(x, np.ndarray)).astype(bool)
nan_rows = data_train[~has_gnn_fp]

# Get the indices of these rows
indices_with_nan = nan_rows.index.tolist()
print(indices_with_nan)

# Both CV index collections are repaired against the rows removed from
# data_train, so they stay valid once its index is reset below.
# (Assumes the stored fold indices are data_train row labels from a default
# 0..n-1 index — TODO confirm.)
_reindex_after_drop(train_indices, indices_with_nan)
_reindex_after_drop(test_indices, indices_with_nan)

data_train = data_train[has_gnn_fp].reset_index(drop=True)

[]


In [18]:
def _load_abs_errors(tag):
    """Load one model's saved test predictions and targets and compute |10**pred - 10**true|.

    Parameters
    ----------
    tag : str
        Feature-set suffix shared by the two files
        ``y_test_pred_xgboost_<tag>.npy`` and ``y_test_true_xgboost_<tag>.npy``
        under ``../../data/<split>/``.

    Returns
    -------
    tuple of (pred, true, abs_err)
        The two loaded arrays and the element-wise absolute difference of
        their base-10 exponentials (presumably the arrays hold log10 kcat,
        making this the error on the original scale — confirm).

    Raises
    ------
    ValueError
        If the prediction and target arrays differ in shape; subtracting them
        anyway could broadcast silently and yield meaningless errors.
    """
    results_dir = join("..", "..", "data", split)
    pred = np.load(join(results_dir, f"y_test_pred_xgboost_{tag}.npy"))
    true = np.load(join(results_dir, f"y_test_true_xgboost_{tag}.npy"))
    if pred.shape != true.shape:
        raise ValueError(
            f"Shape mismatch for '{tag}': predictions {pred.shape} vs targets {true.shape}"
        )
    return pred, true, np.abs(10**pred - 10**true)


# One absolute-error vector per input combination. pred_y / test_y keep the
# arrays of the most recently loaded model, as before.
pred_y, test_y, esm2_gnn_fp = _load_abs_errors("ESM2_gnn_fp")
pred_y, test_y, esm2_diff_fp = _load_abs_errors("ESM2_diff_fp")
pred_y, test_y, esm2_gnn_fp_diff_fp = _load_abs_errors("ESM2_gnn_fp_diff_fp")
pred_y, test_y, esm2 = _load_abs_errors("ESM2")
pred_y, test_y, gnn_fp = _load_abs_errors("gnn_fp")
pred_y, test_y, diff_fp = _load_abs_errors("diff_fp")
pred_y, test_y, diff_fp_gnn_fp = _load_abs_errors("diff_fp_gnn_fp")

In [27]:
# Paired, one-sided Wilcoxon signed-rank tests of the enzyme+reaction model's
# absolute errors against each alternative model's absolute errors.
# alternative='less' tests whether the differences d tend to be negative.
comparisons = [
    ("Difference between predictions with enzyme+reaction and enzyme+substrate", esm2_gnn_fp),
    ("Difference between predictions with enzyme+reaction and enzyme+substrate+reaction", esm2_gnn_fp_diff_fp),
    ("Difference between predictions with enzyme+reaction and enzyme", esm2),
    ("Difference between predictions with enzyme+reaction and substrate", gnn_fp),
    ("Difference between predictions with enzyme+reaction and reaction", diff_fp),
    ("Difference between predictions with enzyme+reaction and reaction+substrate", diff_fp_gnn_fp),
]

for message, other_errors in comparisons:
    d = esm2_diff_fp - other_errors
    w, p = wilcoxon(d, alternative='less')
    # Only the p-value is reported; w holds the test statistic.
    print(message, p)



Difference between predictions with enzyme+reaction and enzyme+substrate 0.53125
Difference between predictions with enzyme+reaction and enzyme+substrate+reaction 0.2890625 10.0
Difference between predictions with enzyme+reaction and enzyme 0.2890625 10.0
Difference between predictions with enzyme+reaction and substrate 0.7109375
Difference between predictions with enzyme+reaction and reaction 0.1875
Difference between predictions with enzyme+reaction and reaction+substrate 0.65625


In [30]:
# Absolute errors of the baseline model, computed the same way as for the
# other models, then compared against the enzyme+reaction model.
baseline_dir = join("..", "..", "data", split)
pred_y = np.load(join(baseline_dir, "y_test_pred_xgboost_baseline.npy"))
# NOTE(review): the ground truth is read from the diff_fp file rather than a
# baseline-specific one — confirm both were saved for the same test rows.
test_y = np.load(join(baseline_dir, "y_test_true_xgboost_diff_fp.npy"))
baseline = np.abs(10**pred_y - 10**test_y)

# One-sided paired Wilcoxon signed-rank test on the error differences.
d = esm2_diff_fp - baseline
w, p = wilcoxon(d, alternative='less')
print("Difference between predictions with enzyme+reaction and baseline", p)


Difference between predictions with enzyme+reaction and baseline 0.0390625
