In [None]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs
from wrapMordred import mordredWrapper

import chemprop

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import joblib

import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's legacy global random state so that np.random-based steps in this
# notebook repeat across runs.
# NOTE(review): this does not seed other libraries' RNGs; confirm whether any
# later step depends on them.
np.random.seed(1234)

In [None]:
# Toxicity endpoint evaluated by this notebook. Swap the active line with the
# commented one to evaluate the other endpoint.
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

# Root directories for the data and the trained models. The defaults are the
# original author's local paths; set SEMINAR_TOXICITY_DATA and/or
# SEMINAR_TOXICITY_MODELS to run the notebook on another machine without
# editing this cell.
loc = os.environ.get(
    'SEMINAR_TOXICITY_DATA',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data',
)
# Directory holding this endpoint's CSV splits.
endpoint_loc = os.path.join(loc, endpoint)

model = os.environ.get(
    'SEMINAR_TOXICITY_MODELS',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\src\models',
)
# Directory holding this endpoint's saved models / checkpoints.
model_loc = os.path.join(model, endpoint)

In [None]:
# Read the training split of the selected endpoint into a DataFrame.
filename = 'train.csv'
train_csv_path = os.path.join(endpoint_loc, filename)
df_train = pd.read_csv(train_csv_path)

In [None]:
# Show the (rows, columns) size of the training table.
df_train.shape

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Read the validation split of the selected endpoint into a DataFrame.
filename = 'val.csv'
val_csv_path = os.path.join(endpoint_loc, filename)
df_val = pd.read_csv(val_csv_path)

In [None]:
# Show the (rows, columns) size of the validation table.
df_val.shape

In [None]:
# Preview the first rows of the validation table.
df_val.head()

In [None]:
# Pull the SMILES strings and the activity labels out of each split as NumPy
# arrays; both arrays of a split keep the row order of its DataFrame.
train_smiles, train_labels = (
    df_train[column].to_numpy() for column in ('SMILES', 'Activity')
)
val_smiles, val_labels = (
    df_val[column].to_numpy() for column in ('SMILES', 'Activity')
)

In [None]:
# Report the size of the raw validation arrays and how many samples carry the
# positive (1) and negative (0) label.
print(f'val size smiles : {val_smiles.shape}')
print(f'val size labels : {val_labels.shape}')
print(f'pos samples in val size : {val_labels[val_labels == 1].shape}')
print(f'neg samples in val size : {val_labels[val_labels == 0].shape}')

In [None]:
def get_MAACS(smiles_array, labels):
    """Compute MACCS-key fingerprints for every SMILES string RDKit can parse.

    Args:
        smiles_array: iterable of SMILES strings.
        labels: iterable of labels, aligned one-to-one with ``smiles_array``.

    Returns:
        A tuple ``(fingerprints, labels)`` of NumPy arrays. Entries whose SMILES
        could not be parsed (``Chem.MolFromSmiles`` returned ``None``) are
        dropped from both arrays, so the two stay aligned.
    """
    # Parse lazily so each molecule is fingerprinted right after it is parsed.
    parsed = (
        (Chem.MolFromSmiles(smi), target)
        for smi, target in zip(smiles_array, labels)
    )
    kept = [
        (np.array(MACCSkeys.GenMACCSKeys(molecule)), target)
        for molecule, target in parsed
        if molecule is not None
    ]

    fingerprints = [vector for vector, _ in kept]
    targets = [target for _, target in kept]
    assert len(fingerprints) == len(targets)

    return np.array(fingerprints), np.array(targets)

In [None]:
def get_Morgen(smiles_array, labels):
    """Compute Morgan fingerprints (radius 3, 2048 bits) for parseable SMILES.

    Args:
        smiles_array: iterable of SMILES strings.
        labels: iterable of labels, aligned one-to-one with ``smiles_array``.

    Returns:
        A tuple ``(fingerprints, labels)`` of NumPy arrays. Samples whose SMILES
        RDKit cannot parse are skipped in both arrays.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)

    bit_vectors, targets = [], []
    for smi, target in zip(smiles_array, labels):
        molecule = Chem.MolFromSmiles(smi)
        if molecule is None:
            # Unparseable SMILES: drop the sample together with its label.
            continue
        bit_vectors.append(np.array(generator.GetFingerprint(molecule)))
        targets.append(target)

    assert len(bit_vectors) == len(targets)

    return np.array(bit_vectors), np.array(targets)

In [None]:
# Build the project's Mordred descriptor wrapper from the combined train +
# validation SMILES.
# NOTE(review): what the wrapper derives from these SMILES is not visible here;
# confirm it matches how the saved Mordred models were fitted and that feeding
# it validation SMILES is intended.
mordred = mordredWrapper(np.concatenate((train_smiles, val_smiles)))

In [None]:
# One independent (initially empty) entry per fingerprint type. The key names
# are reused later to build the saved-model file names.
input_dict = {name: {} for name in ('MAACS', 'Morgen', 'Mordred')}

In [None]:
# MACCS-key fingerprints of the validation set, with the labels of the samples
# that were successfully parsed.
input_dict['MAACS']['fingerprints'], input_dict['MAACS']['labels'] = get_MAACS(val_smiles, val_labels)

In [None]:
# Morgan fingerprints of the validation set, with the labels of the samples
# that were successfully parsed.
input_dict['Morgen']['fingerprints'], input_dict['Morgen']['labels'] = get_Morgen(val_smiles, val_labels)

In [None]:
# Mordred-based features of the validation set from the project wrapper.
# NOTE(review): presumably returns (features, labels) aligned like the RDKit
# helpers in this notebook - confirm against wrapMordred.
input_dict['Mordred']['fingerprints'], input_dict['Mordred']['labels'] = mordred.get_fingerprints(val_smiles, val_labels)

In [None]:
# Per fingerprint type: size of the feature matrix and of its label vector,
# plus the number of positive (1) and negative (0) samples that remain.
for key, entry in input_dict.items():
    entry_labels = entry['labels']
    print(f'{key} fingerprint stats')
    print(f"val size fingerprints : {entry['fingerprints'].shape}")
    print(f'val size labels : {entry_labels.shape}')
    print(f'pos samples in val size : {entry_labels[entry_labels == 1].shape}')
    print(f'neg samples in val size : {entry_labels[entry_labels == 0].shape}')

In [None]:
# Load every saved classical model for this endpoint. Files are named
# '<algorithm>-<fingerprint>.joblib' and stored under the same key (minus the
# extension) in `models`.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
models = {}
for m in ('rf', 'svm'):
    for key in input_dict:
        model_key = f'{m}-{key}'
        model_name = f'{model_key}.joblib'
        model_path = os.path.join(model_loc, model_name)
        models[model_key] = joblib.load(model_path)

In [None]:
# Display the loaded models, keyed by '<algorithm>-<fingerprint>'.
models

In [None]:
def get_MPNN_pred(endpoint_loc, model_loc, val_labels, filename='val.csv'):
    """Predict binary labels with the saved chemprop MPNN checkpoint(s).

    Args:
        endpoint_loc: directory containing the CSV file to predict on.
        model_loc: directory passed to chemprop as ``--checkpoint_dir``.
        val_labels: NumPy array of true labels, aligned row-by-row with the CSV.
        filename: name of the CSV inside ``endpoint_loc`` (default ``'val.csv'``).

    Returns:
        ``(y_pred, y_true)``: predictions thresholded at 0.5 (``int64``) and the
        matching true labels. Rows chemprop reports as ``'Invalid SMILES'`` are
        removed from both arrays.
    """
    arguments = [
        '--test_path', os.path.join(endpoint_loc, filename),
        # Predictions are consumed in memory, so the written file is discarded.
        # os.devnull is the null device on every OS ('/dev/null' is not one on
        # Windows).
        '--preds_path', os.devnull,
        '--checkpoint_dir', model_loc,
        '--smiles_columns', 'SMILES',
        '--features_generator', 'rdkit_2d_normalized',
        '--no_features_scaling'
    ]

    args = chemprop.args.PredictArgs().parse_args(arguments)
    preds = chemprop.train.make_predictions(args=args)

    # Keep the raw Python objects so the validity check is an element-wise
    # Python comparison. Comparing a purely numeric NumPy array to a string can
    # collapse to a single scalar on older NumPy versions.
    flat_preds = np.array(preds, dtype=object).flatten()
    is_valid = np.array([p != 'Invalid SMILES' for p in flat_preds], dtype=bool)
    valid_idx = np.flatnonzero(is_valid)

    y_pred = (flat_preds[valid_idx].astype(np.float32) > 0.5).astype(np.int64)
    y_true = val_labels[valid_idx]

    return y_pred, y_true

In [None]:
# Run the MPNN on the validation CSV; rows with invalid SMILES are already
# removed from both returned arrays.
y_pred_MPNN, y_true_MPNN = get_MPNN_pred(endpoint_loc, model_loc, val_labels, 'val.csv')

In [None]:
def _acc_sen_spe(y_true, y_pred):
    """Return (accuracy, sensitivity, specificity) for binary 0/1 labels."""
    # scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
    # true class and columns the predicted class, so ravel() yields
    # tn, fp, fn, tp. Passing the arguments the other way round swaps fp and fn
    # and turns SEN/SPE into precision/NPV.
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    acc = (tp + tn) / (tp + tn + fn + fp)
    sen = tp / (tp + fn)
    spe = tn / (tn + fp)
    return acc, sen, spe


# One list per metric; position i in each list belongs to xlabels[i].
measurement = {'ACC': [], 'SEN': [], 'SPE': []}
xlabels = []

# Classical models: the fingerprint type is the part of the key after the '-'.
for model_key in models:
    key = model_key.strip().split('-')[1]

    y_pred = models[model_key].predict(input_dict[key]['fingerprints'])
    y_true = input_dict[key]['labels']

    ACC, SEN, SPE = _acc_sen_spe(y_true, y_pred)

    xlabels.append(model_key)
    measurement['ACC'].append(ACC)
    measurement['SEN'].append(SEN)
    measurement['SPE'].append(SPE)

# The MPNN is evaluated on its own filtered prediction/label pair.
ACC, SEN, SPE = _acc_sen_spe(y_true_MPNN, y_pred_MPNN)

xlabels.append('MPNN')
measurement['ACC'].append(ACC)
measurement['SEN'].append(SEN)
measurement['SPE'].append(SPE)


x = np.arange(len(xlabels))  # the label locations
width = 0.25  # the width of the bars
colours = ['blue', 'red', 'green']

# A single figure: the grouped bar chart with one bar per metric and model.
fig, ax = plt.subplots(layout='constrained')

for position, (metric, values) in enumerate(measurement.items()):
    offset = width * position
    ax.bar(x + offset, values, width, label=metric, color=colours[position])

ax.set_title('Whole Validation set')
ax.set_ylabel('Score')
# Centre each tick under the middle bar of its three-bar group.
ax.set_xticks(x + width, xlabels, rotation=45, ha='right')
ax.set_yticks(np.arange(0, 11) / 10)
ax.legend(loc='upper right', ncols=3)
ax.set_ylim(0, 1)
ax.set_axisbelow(True)
ax.grid(axis='y')

plt.show()