In [None]:
# LTN: https://github.com/logictensornetworks/logictensornetworks/blob/master/examples/multiclass_classification/multiclass-singlelabel.ipynb
# commons.py: https://github.com/logictensornetworks/logictensornetworks/raw/master/examples/multiclass_classification/commons.py

In [None]:
# Colab setup: install the dependencies (keras is pinned to 2.15.0) and
# download commons.py from the LTN multiclass-classification example.
!pip install PyTDC rdkit-pypi ltn keras==2.15.0 -qq
!wget https://github.com/logictensornetworks/logictensornetworks/raw/master/examples/multiclass_classification/commons.py

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/142.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m122.9/142.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.9/142.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import logging; logging.basicConfig(level=logging.INFO)
import tensorflow as tf
import pandas as pd
import numpy as np
import ltn
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm.auto import tqdm
tqdm.pandas()

# Utils Functions

In [None]:
# Threshold ref: https://pubs.acs.org/doi/epdf/10.1021/acs.jcim.3c01301
def label_th(pic50, threshold=5):
    """Binarize pIC50 values into integer class labels.

    Args:
        pic50: One-dimensional iterable of numeric pIC50 values (list,
            NumPy array, pandas Series, generator, ...).
        threshold: Cut-off value. Entries ``>= threshold`` are labelled 1,
            all others (including NaN) are labelled 0. Defaults to 5.

    Returns:
        np.ndarray of integer labels with one entry per input value. An
        empty input yields an empty integer array.
    """
    # np.fromiter accepts any iterable, so generators keep working just as
    # they did with the original element-by-element loop.
    values = np.fromiter(pic50, dtype=float)
    return (values >= threshold).astype(int)
# Class name -> integer label.
class_map = {"blocks": 1, "non-blocks": 0}

### Data Acquisition

In [None]:
dataset_path = "/content/drive/MyDrive/Project/AI and Cardiology/Cardiotoxicity/Dataset"

In [None]:
!ls "{dataset_path}/UniChemDB-Data"

CDK-unichemdb.csv  final-herg.csv  final-herg-split.csv  mmb_embeddings.npy  Morgan-unichemdb.csv


In [None]:
# Read the split table, discard rows without a standardized SMILES string,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv(f"{dataset_path}/UniChemDB-Data/final-herg-split.csv")
    .dropna(subset=['std_smiles'])
    .reset_index(drop=True)
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20409 entries, 0 to 20408
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                20388 non-null  object
 1   std_smiles        20409 non-null  object
 2   classes           20409 non-null  int64 
 3   train_test_split  20409 non-null  int64 
 4   cv_fold           20409 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 797.4+ KB


# Data



In [None]:
#External Test-1: https://github.com/Abdulk084/CardioTox/blob/master/data/external_test_set_pos.csv
test_pos_df = pd.read_csv(f"{dataset_path}/External-Data/external_test_set_pos.csv")


In [None]:
# External Test h70, h60 dataset: https://github.com/issararab/CToxPred/tree/main/data/raw/hERG
test_h60_df = pd.read_csv(f"{dataset_path}/External-Data/eval_set_herg_60.csv")
test_h70_df = pd.read_csv(f"{dataset_path}/External-Data/eval_set_herg_70.csv")


In [None]:
test_h60_df.head()

Unnamed: 0,InChl Key,SMILES,Source,pIC50
0,LIHJHFVXLZSRNK-UHFFFAOYSA-N,Cn1ccc(C[N+]2=CC(c3cccc(C(F)(F)F)c3)C=N2)n1,US Patent,5.647817
1,RXGDDWPITVSKDR-UHFFFAOYSA-N,CC(C)(C)OC(=O)N1CCN(c2nc3c([N+](=O)[O-])c(Br)c...,US Patent,5.60206
2,YRSBMPKJFDYYFO-UHFFFAOYSA-N,Fc1cccc(Oc2cc(C(F)(F)F)nc(N3CCc4nc[nH]c4C3)n2)c1,US Patent,5.59998
3,OMQQLDITRYIEHZ-UHFFFAOYSA-N,Cn1nccc1Cc1cn(-c2ccc(F)c(Cl)c2)nn1,US Patent,5.364516
4,BXBUTKPGTGJGTQ-YOEHRIQHSA-N,CNC[C@@H](c1ccc(Cl)c(Cl)c1)[C@@H](OC)c1cccc(NS...,US Patent,5.327902


# Utils Functions

In [None]:
#Threshold conversion
test_h60_df['target'] = label_th(test_h60_df.pIC50)
test_h70_df['target'] = label_th(test_h70_df.pIC50)

In [None]:
!ls "{dataset_path}/UniChemDB-Data"

CDK-unichemdb.csv  final-herg.csv  final-herg-split.csv  mmb_embeddings.npy  Morgan-unichemdb.csv


In [None]:
# Load the array stored in mmb_embeddings.npy.
# NOTE(review): allow_pickle=True can execute arbitrary code when the file is
# untrusted; keep it only for files you produced yourself.
mmb_data = np.load(
    dataset_path + "/UniChemDB-Data/mmb_embeddings.npy",
    allow_pickle=True,
)

In [None]:
df.head()

Unnamed: 0,id,std_smiles,classes,train_test_split,cv_fold
0,CHEMBL240,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,1,0,9
1,CHEMBL240,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,1,0,8
2,CHEMBL240,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,1,0,0
3,CHEMBL240,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,1,0,5
4,CHEMBL240,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,1,0,0


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the embedding matrix, then standardize that same matrix.
# `ss` stays in scope because the evaluation cells call ss.transform(...).
ss = StandardScaler().fit(mmb_data)
X = ss.transform(mmb_data)
y = df["classes"]


In [None]:
X.shape,y.shape

((20409, 512), (20409,))

In [None]:
!ls '{dataset_path}/External-Data'

CDK-external_test_set_pos.csv  eval_set_herg_70.csv	       herg_mmb_emb_h70.npz
CDK-herg60.csv		       external_test_set_pos.csv       Morgan-external_test_set_pos.csv
CDK-herg70.csv		       herg_mmb_emb_external_test.npz  Morgan-herg60.csv
eval_set_herg_60.csv	       herg_mmb_emb_h60.npz	       Morgan-herg70.csv


In [None]:
def _records_frame(npz_path, key):
    """Build a DataFrame from the record array stored under `key` in an .npz archive."""
    # NOTE(review): allow_pickle=True is only safe for trusted files.
    archive = np.load(npz_path, allow_pickle = True)
    return pd.DataFrame.from_records(archive[key].tolist())

ext_pos_df = _records_frame(
    f'{dataset_path}/External-Data/herg_mmb_emb_external_test.npz',
    'external_test_set_pos',
).rename(columns = {'Y':"target"})
ext_h60_df = _records_frame(
    f'{dataset_path}/External-Data/herg_mmb_emb_h60.npz',
    'herg_mmb_emb_h60',
)
ext_h70_df = _records_frame(
    f'{dataset_path}/External-Data/herg_mmb_emb_h70.npz',
    'herg_mmb_emb_h70',
)

In [None]:
batch_size = 64
ds_train = tf.data.Dataset.from_tensor_slices((X,y)).batch(batch_size)
# Draw 1000 row positions (with replacement) for the monitoring subset.
# np.random.randint excludes the upper bound, so every index is valid; the
# deprecated np.random.random_integers included it and could return len(X),
# which is out of range for X[idx] / y[idx].
idx = np.random.randint(0, len(X), 1000)
# NOTE(review): ds_test is sampled from the same rows as ds_train, so the
# "test" metrics logged during training are not a held-out estimate.
ds_test = tf.data.Dataset.from_tensor_slices((X[idx],y[idx])).batch(batch_size)

  idx = np.random.random_integers(0,len(X),1000)


# LTN

Predicate with softmax `P(x,class)`

In [None]:
class MLP(tf.keras.Model):
    """Feed-forward network whose output is one unnormalized score (logit) per class."""
    def __init__(self, n_classes, hidden_layer_sizes=(16,16,8)):
        super().__init__()
        # One ELU-activated Dense layer per requested hidden width.
        self.denses = [
            tf.keras.layers.Dense(width, activation="elu")
            for width in hidden_layer_sizes
        ]
        # Linear output layer with n_classes units.
        self.dense_class = tf.keras.layers.Dense(n_classes)
        # A single Dropout layer (rate 0.2), reused after every hidden layer.
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, inputs, training=False):
        """Compute logits for ``inputs[0]``; dropout follows the ``training`` flag."""
        hidden = inputs[0]
        for layer in self.denses:
            hidden = self.dropout(layer(hidden), training=training)
        return self.dense_class(hidden)

# Two-class logits network, wrapped as an LTN predicate with a softmax
# activation and class indexing.
logits_model = MLP(n_classes=2)
p = ltn.Predicate.FromLogits(
    logits_model,
    activation_function="softmax",
    with_class_indexing=True,
)

Constants to index/iterate on the classes

In [None]:
# Non-trainable LTN constants holding the class indices 0 and 1.
class_A = ltn.Constant(0, trainable=False)
class_B = ltn.Constant(1, trainable=False)
# class_C = ltn.Constant(2, trainable=False)

Operators and axioms

In [None]:
# Fuzzy-logic operators wrapped for use in LTN formulas: standard negation,
# product conjunction, probabilistic-sum disjunction, Reichenbach implication,
# and a "forall" quantifier aggregated with pMeanError (p=2).
Not = ltn.Wrapper_Connective(ltn.fuzzy_ops.Not_Std())
And = ltn.Wrapper_Connective(ltn.fuzzy_ops.And_Prod())
Or = ltn.Wrapper_Connective(ltn.fuzzy_ops.Or_ProbSum())
Implies = ltn.Wrapper_Connective(ltn.fuzzy_ops.Implies_Reichenbach())
Forall = ltn.Wrapper_Quantifier(ltn.fuzzy_ops.Aggreg_pMeanError(p=2),semantics="forall")

In [None]:
formula_aggregator = ltn.Wrapper_Formula_Aggregator(ltn.fuzzy_ops.Aggreg_pMeanError(p=2))

@tf.function
def axioms(features, labels, training=False):
    """Return the aggregated satisfaction level of the knowledge base on one batch.

    Two axioms are evaluated: every sample labelled 0 should satisfy
    ``p(x, class_A)`` and every sample labelled 1 should satisfy
    ``p(x, class_B)``.

    Args:
        features: Batch of input features; rows are selected with boolean
            masks built from ``labels``.
        labels: Batch of class labels, compared against 0 and 1.
        training: Forwarded to the predicate ``p``.

    Returns:
        The ``.tensor`` of the formula obtained by aggregating both axioms
        with ``formula_aggregator``.
    """
    # Split the batch into one LTN variable per class.
    x_A = ltn.Variable("x_A",features[labels==0])
    x_B = ltn.Variable("x_B",features[labels==1])
    # x_C = ltn.Variable("x_C",features[labels==2])
    # NOTE(review): this local list shadows the function's own name inside the body.
    axioms = [
        Forall(x_A,p([x_A,class_A],training=training)),
        Forall(x_B,p([x_B,class_B],training=training)),
        # Forall(x_C,p([x_C,class_C],training=training))
    ]
    # An axiom whose truth value is NaN is replaced by 0.0 before aggregation
    # (presumably this happens when the batch holds no sample of that class —
    # TODO confirm).
    for i in range(len(axioms)):
        if tf.math.is_nan(axioms[i].tensor):
            axioms[i].tensor  =0.0
    sat_level = formula_aggregator(axioms).tensor
    return sat_level

Initialize all layers and the static graph

In [None]:
for features, labels in ds_test:
    print("Initial sat level %.5f"%axioms(features,labels))
    break

Initial sat level 0.42852


# Training

Define the metrics. While training, we measure:
1. The level of satisfiability of the Knowledge Base on the training data.
2. The level of satisfiability of the Knowledge Base on the test data.
3. The training accuracy.
4. The test accuracy.

In [None]:
metrics_dict = {
    'train_sat_kb': tf.keras.metrics.Mean(name='train_sat_kb'),
    'test_sat_kb': tf.keras.metrics.Mean(name='test_sat_kb'),
    'train_accuracy': tf.keras.metrics.CategoricalAccuracy(name="train_accuracy"),
    'test_accuracy': tf.keras.metrics.CategoricalAccuracy(name="test_accuracy")
}

Define the training and test step

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
@tf.function
def train_step(features, labels):
    """Run one optimizer update on a batch and record the training metrics.

    The loss is ``1 - sat``, where ``sat`` is the knowledge-base satisfaction
    computed with ``training=True``. After the update, satisfaction is
    recomputed with the default ``training=False`` and logged together with
    the batch accuracy.
    """
    # Forward pass in training mode, recorded for differentiation.
    with tf.GradientTape() as tape:
        train_sat = axioms(features, labels, training=True)
        loss = 1. - train_sat
    # Variables are read after the forward pass, as in the original flow.
    weights = p.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    # Satisfaction logged for monitoring is computed without dropout.
    eval_sat = axioms(features, labels)
    metrics_dict['train_sat_kb'](eval_sat)
    # Accuracy: one-hot targets against the model's logits.
    logits = logits_model([features])
    metrics_dict['train_accuracy'](tf.one_hot(labels,2), logits)

@tf.function
def test_step(features, labels):
    """Record knowledge-base satisfaction and accuracy for one evaluation batch."""
    # Satisfaction level with the default training=False.
    batch_sat = axioms(features, labels)
    metrics_dict['test_sat_kb'](batch_sat)
    # Accuracy: one-hot targets against the model's logits.
    logits = logits_model([features])
    metrics_dict['test_accuracy'](tf.one_hot(labels,2), logits)

Train

In [None]:
import commons

EPOCHS = 500

commons.train(
    EPOCHS,
    metrics_dict,
    ds_train,
    ds_test,
    train_step,
    test_step,
    csv_path="herg_MMB_results.csv",
    track_metrics=20
)

Epoch 0, train_sat_kb: 0.4826, test_sat_kb: 0.5476, train_accuracy: 0.6389, test_accuracy: 0.7010
Epoch 20, train_sat_kb: 0.5850, test_sat_kb: 0.6120, train_accuracy: 0.7972, test_accuracy: 0.7880
Epoch 40, train_sat_kb: 0.6160, test_sat_kb: 0.6482, train_accuracy: 0.8326, test_accuracy: 0.8430
Epoch 60, train_sat_kb: 0.6354, test_sat_kb: 0.6582, train_accuracy: 0.8520, test_accuracy: 0.8420
Epoch 80, train_sat_kb: 0.6498, test_sat_kb: 0.6806, train_accuracy: 0.8655, test_accuracy: 0.8620
Epoch 100, train_sat_kb: 0.6639, test_sat_kb: 0.6945, train_accuracy: 0.8785, test_accuracy: 0.8760
Epoch 120, train_sat_kb: 0.6724, test_sat_kb: 0.7023, train_accuracy: 0.8841, test_accuracy: 0.8830
Epoch 140, train_sat_kb: 0.6804, test_sat_kb: 0.7082, train_accuracy: 0.8927, test_accuracy: 0.8800
Epoch 160, train_sat_kb: 0.6858, test_sat_kb: 0.7150, train_accuracy: 0.8956, test_accuracy: 0.8890
Epoch 180, train_sat_kb: 0.6918, test_sat_kb: 0.7270, train_accuracy: 0.9023, test_accuracy: 0.9040
Epoch 

In [None]:
!ls "{dataset_path}/../Model-Weights"

hERG-Karim-CDK.keras  hERG-Karim-Morgan_CDK.keras  hERG-UniChemDB-CDK.keras
hERG-Karim-MMB.keras  hERG-Karim-Morgan.keras	   hERG-UniChemDB-Morgan.keras


In [None]:
logits_model.save(f"{dataset_path}/../Model-Weights/hERG-UniChemDB-MMB.keras")

## Model Evaluation

In [None]:
from sklearn.metrics import (
    accuracy_score as ays,
    f1_score as fs,
    precision_score as ps,
    recall_score as rs,
    matthews_corrcoef as mcc,
    roc_auc_score as auc,
    balanced_accuracy_score,
    confusion_matrix
)

In [None]:
def print_score(xtest,ytest,name):
    """Print binary-classification metrics of ``logits_model`` on one evaluation set.

    Args:
        xtest: Feature matrix; passed to ``logits_model.predict`` wrapped in
            a list.
        ytest: Ground-truth binary labels (0 / 1), one per row of ``xtest``.
        name: Suffix used in the printed metric labels.

    Returns:
        None. The metrics are written to stdout.
    """
    def _ratio(numerator, denominator):
        # A zero denominator means the metric is undefined for this set;
        # report NaN instead of dividing by zero.
        return numerator / denominator if denominator else float("nan")

    logits = np.asarray(logits_model.predict([xtest]))
    pred_test = logits.argmax(-1)

    # ROC AUC is rank-based and needs a continuous score. For two classes the
    # logit margin orders samples exactly like the softmax probability of
    # class 1, so it can be used directly instead of the hard predictions.
    try:
        auc_test = auc(ytest, logits[:, 1] - logits[:, 0])
    except ValueError:
        # roc_auc_score raises when ytest contains only one class.
        auc_test = float("nan")

    # Fixing the label set keeps the matrix 2x2 even when a class is absent,
    # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(ytest, pred_test, labels=[0, 1]).ravel()

    specificity_test = _ratio(tn, tn + fp)
    sensitivity_test = _ratio(tp, tp + fn)
    NPV_test = _ratio(tn, tn + fn)
    PPV_test = _ratio(tp, tp + fp)

    Accuracy_test = ays(ytest, pred_test)
    Balanced_Accuracy_test = balanced_accuracy_score(ytest, pred_test)
    MCC_test = mcc(ytest, pred_test)

    print(f"MCC_test_{name}: " + str(MCC_test))
    print(f"NPV_test_{name}: " + str(NPV_test))
    print(f"Accuracy_test_{name}: " + str(Accuracy_test))
    print(f"PPV_test_{name}: " + str(PPV_test))
    print(f"specificity_test_{name}: " + str(specificity_test))
    print(f"sensitivity_test_{name}: " + str(sensitivity_test))
    print(f"Balanced_Accuracy_test_{name}: " + str(Balanced_Accuracy_test))
    print(f"AUC_test_{name}: " + str(auc_test))


In [None]:
print_score(ss.transform(np.vstack(ext_pos_df['emb'].values)),ext_pos_df['ACTIVITY'],'External Data Test-1 (pos)')


MCC_test_External Data Test-1 (pos): 0.6850937041446569
NPV_test_External Data Test-1 (pos)g: 0.6842105263157895
Accuracy_test_External Data Test-1 (pos): 0.8409090909090909
PPV_test_External Data Test-1 (pos): 0.96
specificity_test_External Data Test-1 (pos): 0.9285714285714286
sensitivity_test_External Data Test-1 (pos): 0.8
Balanced_Accuracy_testExternal Data Test-1 (pos): 0.8642857142857143


In [None]:
print_score(ss.transform(np.vstack(ext_h60_df['emb'].values)),(ext_h60_df.pIC50 >=5).astype(int),'External hERG-60')


MCC_test_External hERG-60: 0.5971865794514643
NPV_test_External hERG-60g: 0.7278481012658228
Accuracy_test_External hERG-60: 0.788
PPV_test_External hERG-60: 0.8913043478260869
specificity_test_External hERG-60: 0.92
sensitivity_test_External hERG-60: 0.656
Balanced_Accuracy_testExternal hERG-60: 0.788


In [None]:
print_score(ss.transform(np.vstack(ext_h70_df['emb'].values)),(ext_h70_df.pIC50 >=5).astype(int),'External hERG-70')


MCC_test_External hERG-70: 0.6416732589925616
NPV_test_External hERG-70g: 0.7479674796747967
Accuracy_test_External hERG-70: 0.8160676532769556
PPV_test_External hERG-70: 0.8898678414096917
specificity_test_External hERG-70: 0.8803827751196173
sensitivity_test_External hERG-70: 0.7651515151515151
Balanced_Accuracy_testExternal hERG-70: 0.8227671451355663
