In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip
import joblib

from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

def warn(*args, **kwargs):
    pass
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn
from tensorflow.keras.models import load_model
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

2023-09-12 21:15:38.480177: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-09-12 21:15:38.480199: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.kernel.execute("set_timeout(60000)")'))

<IPython.core.display.Javascript object>

# Selecione o Descritor

In [3]:
#descritores = "morgan"
descritores = "sirms"
#descritores = "padel"
#descritores = "rdkit"

In [4]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [5]:
def variable_adjustments(desc):
    desc.drop(desc.columns[0:1], axis=1,inplace=True)
    ##### Remover variáveis constantes e quase constantes
    desc = desc.select_dtypes(exclude=['object'])
    desc = desc.dropna(axis=1, how='any')
    desc = desc.fillna(0)

    desc.fillna(desc.mean())

    ##### Remover variáveis correlacionadas
    correlated_features = set()  
    correlation_matrix = desc.corr()

    for i in range(len(correlation_matrix.columns)):  
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.9:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    desc.drop(labels=correlated_features, axis=1, inplace=True)
    return desc

In [6]:
from sklearn import metrics

def stats(y_test, y_pred):
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0,1])
    Kappa = metrics.cohen_kappa_score(y_test, y_pred, weights='linear')
    # Valores verdadeiros e falsos
    TN, FP, FN, TP = confusion_matrix.ravel()
    # Accuracy
    AC = (TP+TN)/(TP+FP+FN+TN)
    # Sensibilidade, taxa de acerto, recall ou taxa positiva verdadeira
    SE = TP/(TP+FN)
    # Especificidade ou taxa negativa verdadeira
    SP = TN/(TN+FP)
    # Precisão ou valor preditivo positivo
    PPV = TP/(TP+FP)
    # Valor preditivo negativo
    NPV = TN/(TN+FN)
    # Taxa de classificação correta
    CCR = (SE + SP)/2   
    # F1 Score
    F1_score = 2*(PPV*SE)/(PPV+SE)
    d = dict({'Kappa': Kappa,
         'AUC': CCR,
         'Sensibilidade': SE,
         'PPV': PPV,
         'Especificidade': SP,
         'NPV': NPV,
         'Acurácia': AC,
         'F1 Score':F1_score})
    return pd.DataFrame(d, columns=d.keys(), index=[0]).round(2)

In [7]:
def predictions(model, X_vs):
    ad_threshold = 0.70

    y_pred = model.predict(X_vs)
    confidence = model.predict_proba(X_vs)
    confidence = np.amax(confidence, axis=1).round(2)
    ad = confidence >= ad_threshold

    pred = pd.DataFrame({'Prediction': y_pred, 'AD': ad, 'Confidence': confidence}, index=None)
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    return pred

In [8]:
def predictions_tf(model, X_vs):
    ad_threshold = 0.70

    # Faça as previsões usando o modelo TensorFlow
    y_pred_prob = model.predict(X_vs)
    
    # Aplique um limite às probabilidades para determinar as previsões binárias
    y_pred = (y_pred_prob >= ad_threshold).astype(int)
    
    # Calcule a confiança como a maior probabilidade prevista (arredondada para 2 casas decimais)
    confidence = np.amax(y_pred_prob, axis=1).round(2)
    
    # Crie uma lista de listas para representar o DataFrame
    pred_list = []
    for i in range(len(y_pred)):
        pred_list.append([y_pred[i], confidence[i], confidence[i] >= ad_threshold])

    # Crie o DataFrame final
    pred = pd.DataFrame(pred_list, columns=['Prediction', 'Confidence', 'AD'])
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    
    return pred

In [9]:
def status_predictions(pred):
    pred_ad = pred.dropna().astype(int)
    coverage_ad = len(pred_ad) * 100 / len(pred)

    print('VS pred: %s' % Counter(pred.Prediction))
    print('VS pred AD: %s' % Counter(pred_ad.Prediction))
    print('Coverage of AD: %.2f%%' % coverage_ad)

In [10]:
def visualize_predictions(pred):
    predictions = pd.concat([moldf, pred], axis=1)
    for col in ['Prediction', 'AD']:
        predictions[col].replace(0,'Inactive',inplace=True)
        predictions[col].replace(1,'Active',inplace=True)
    return predictions

In [11]:
def export_predictions(predictions, algoritimo):
    predictions.drop(columns='Mol', inplace=True)
    with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_'+algoritimo+'_sirms.xlsx') as writer:
        predictions.to_excel(writer, sheet_name='sirms', index=False)

In [12]:
from rdkit.Chem import PandasTools
# Set file path and format
file = '../dataset/formats/virtual_molecuke.sdf'
sdfInfo = dict(smilesName='CanonicalSMILES', molColName='ROMol')
moldf = PandasTools.LoadSDF(file, **sdfInfo)
print('Original data: ', moldf.shape)
# Rename ROMol
moldf = moldf.rename(columns={'ROMol': 'Mol'})
# Remove missing RDKit molecules
moldf = moldf[pd.notnull(moldf['Mol'])]
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)

Original data:  (101097, 12)
Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>


In [13]:
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)
from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)
print('\n Problematic structures: \n', validator.validate(moldf))

Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>



 Problematic structures: 
 []


In [14]:
X_vs = pd.read_csv('../descriptors/generate/sirms/virtual_screening_sirms_descriptors.txt', sep='\t')

In [15]:
print(X_vs.shape)
X_vs.head()

(101097, 1764)


Unnamed: 0,Compounds,|S|n|||4|||elm|Br-C(-Br)-C,|S|n|||4|||elm|Br-C(-Br)-N,|S|n|||4|||elm|Br-C(-Br)=C,|S|n|||4|||elm|Br-C(-C)-C,|S|n|||4|||elm|Br-C(-C)-F,|S|n|||4|||elm|Br-C(-C)-N,|S|n|||4|||elm|Br-C(-C)-O,|S|n|||4|||elm|Br-C(-C)=C,|S|n|||4|||elm|Br-C(-C)=N,...,|S|n|||4|||elm|O.O-P=O,|S|n|||4|||elm|O.O-S-O,|S|n|||4|||elm|O.O-S=O,|S|n|||4|||elm|O.O=S-S,|S|n|||4|||elm|O.O=S=O,|S|n|||4|||elm|O.S-S-S,|S|n|||4|||elm|O=P.O=P,|S|n|||4|||elm|O=S-S=O,|S|n|||4|||elm|O=S.O=S,|S|n|||4|||elm|O=S=O.S
0,auto_generated_id_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,auto_generated_id_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,auto_generated_id_3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,auto_generated_id_4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,auto_generated_id_5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
train_desc = pd.read_csv('../descriptors/generate/sirms/processed/sirms-chembl-alzheimer-acetilcolinesterase-processed.txt', sep='\t')
desc_list = train_desc.columns.tolist()
print(train_desc.shape)
train_desc.head()

(4829, 1384)


Unnamed: 0,|S|n|||4|||elm|B(-F)(-F)-F,|S|n|||4|||elm|B(-F)-F.C,|S|n|||4|||elm|B(-F)-F.F,|S|n|||4|||elm|B(-F)-F.N,|S|n|||4|||elm|B(-F)-F.O,|S|n|||4|||elm|B(-F)-F.S,|S|n|||4|||elm|B-F.C#N,|S|n|||4|||elm|B-F.C-C,|S|n|||4|||elm|B-F.C-F,|S|n|||4|||elm|B-F.C-N,...,|S|n|||4|||elm|O.O-P-O,|S|n|||4|||elm|O.O-P=O,|S|n|||4|||elm|O.O-S-O,|S|n|||4|||elm|O.O-S=O,|S|n|||4|||elm|O.O=S-S,|S|n|||4|||elm|O.O=S=O,|S|n|||4|||elm|O=P.O=P,|S|n|||4|||elm|O=S(=O)-S,|S|n|||4|||elm|O=S.O=S,|S|n|||4|||elm|O=S=O.S
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Filter out descriptors not present in the model

In [17]:
miss_desc = train_desc.columns.difference(X_vs.columns).tolist()
miss_desc = pd.DataFrame([[0]*len(miss_desc)]*X_vs.shape[0], columns=miss_desc)
X_vs = pd.concat([X_vs, miss_desc], axis=1)
X_vs = X_vs[desc_list]
X_vs.shape
X_vs.fillna(0, inplace=True)

(101097, 1384)

In [18]:
X_vs.head()

Unnamed: 0,|S|n|||4|||elm|B(-F)(-F)-F,|S|n|||4|||elm|B(-F)-F.C,|S|n|||4|||elm|B(-F)-F.F,|S|n|||4|||elm|B(-F)-F.N,|S|n|||4|||elm|B(-F)-F.O,|S|n|||4|||elm|B(-F)-F.S,|S|n|||4|||elm|B-F.C#N,|S|n|||4|||elm|B-F.C-C,|S|n|||4|||elm|B-F.C-F,|S|n|||4|||elm|B-F.C-N,...,|S|n|||4|||elm|O.O-P-O,|S|n|||4|||elm|O.O-P=O,|S|n|||4|||elm|O.O-S-O,|S|n|||4|||elm|O.O-S=O,|S|n|||4|||elm|O.O=S-S,|S|n|||4|||elm|O.O=S=O,|S|n|||4|||elm|O=P.O=P,|S|n|||4|||elm|O=S(=O)-S,|S|n|||4|||elm|O=S.O=S,|S|n|||4|||elm|O=S=O.S
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Load the models

In [19]:
model_rf = joblib.load('../models/pkl/cov_rf_sirms.pkl')

In [20]:
model_svm = joblib.load('../models/pkl/cov_svm_sirms.pkl')

In [21]:
model_mlp = joblib.load('../models/pkl/cov_mlp_sirms.pkl')

In [22]:
model_tf = load_model('../models/sequential_h5/sirms_melhor_modelo.h5')



2023-09-12 21:17:19.842145: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-09-12 21:17:19.842183: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-09-12 21:17:19.842200: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (note-Vostro-5490): /proc/driver/nvidia/version does not exist
2023-09-12 21:17:19.843314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load the scaler

In [23]:
scaler = joblib.load('../models/pkl/logBB_scale_sirms.pkl')

In [24]:
X_vs = X_vs.replace([np.inf, -np.inf], np.nan)  # Substitui infinitos por NaN
X_vs = X_vs.dropna()  # Remove linhas com NaN
X_vs = scaler.fit_transform(X_vs)

In [25]:
X_vs.shape

(101097, 1384)

In [26]:
X_vs = np.nan_to_num(X_vs)

In [27]:
X_vs_df = pd.DataFrame(X_vs)

In [28]:
X_vs.shape

(101097, 1384)

In [29]:
X_vs_df.to_csv('../descriptors/generate/sirms/virtual_screening_sirms_x.csv', index=False)

In [46]:
X_vs = pd.read_csv("../descriptors/generate/sirms/virtual_screening_sirms_x.csv", delimiter=",", error_bad_lines=False)

# Predict molecules - Random forest

In [30]:
%%time
pred_rf = predictions(model_rf, X_vs)

CPU times: user 8.23 s, sys: 588 ms, total: 8.82 s
Wall time: 8.83 s


In [31]:
status_predictions(pred_rf)

VS pred: Counter({0: 51904, 1: 49193})
VS pred AD: Counter({1: 13209, 0: 12427})
Coverage of AD: 25.36%


### Visualize predictions - Random forest

In [32]:
predictions_rf = visualize_predictions(pred_rf)
predictions_rf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>,Active,,0.59
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b10>,Inactive,,0.51
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b70>,Active,,0.54
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59c90>,Active,,0.61
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59cf0>,Active,Active,0.71


### Export SDF and Excel - Random forest

In [33]:
export_predictions(predictions_rf, 'rf')

# Predict molecules - SVM

In [34]:
%%time
pred_svm = predictions(model_svm, X_vs)

CPU times: user 7min 12s, sys: 1.42 s, total: 7min 14s
Wall time: 7min 14s


In [35]:
status_predictions(pred_svm)

VS pred: Counter({0: 58662, 1: 42435})
VS pred AD: Counter({1: 30866, 0: 22581})
Coverage of AD: 52.87%


### Visualize predictions - SVM

In [36]:
predictions_svm = visualize_predictions(pred_svm)
predictions_svm.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>,Inactive,,0.5
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b10>,Active,,0.56
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b70>,Inactive,,0.51
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59c90>,Inactive,,0.51
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59cf0>,Active,,0.58


### Export SDF and Excel - SVM

In [37]:
export_predictions(predictions_svm,'svm')

# Predict molecules - MLP

In [38]:
%%time
pred_mlp = predictions(model_mlp, X_vs)

CPU times: user 12.8 s, sys: 3.02 s, total: 15.8 s
Wall time: 2.47 s


In [39]:
status_predictions(pred_mlp)

VS pred: Counter({0: 52406, 1: 48691})
VS pred AD: Counter({0: 46430, 1: 42726})
Coverage of AD: 88.19%


### Visualize predictions - MLP

In [40]:
predictions_mlp = visualize_predictions(pred_mlp)
predictions_mlp.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>,Active,Active,0.87
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b10>,Active,,0.68
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b70>,Active,Active,0.98
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59c90>,Active,Active,1.0
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59cf0>,Active,Active,1.0


### Export SDF and Excel - MLP

In [41]:
export_predictions(predictions_mlp, 'mlp')

# Predict molecules - TF

In [42]:
%%time
pred_tf = predictions_tf(model_tf, X_vs)

2023-09-12 21:28:15.906257: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 559672992 exceeds 10% of free system memory.


CPU times: user 15.9 s, sys: 1.08 s, total: 17 s
Wall time: 8.3 s


In [69]:
status_predictions(pred_tf)

TypeError: unhashable type: 'numpy.ndarray'

In [43]:
predictions_tf = visualize_predictions(pred_tf)
predictions_tf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,Confidence,AD
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b599f0>,Active,0.78,Active
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b10>,Inactive,0.63,
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59b70>,Active,0.93,Active
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59c90>,Active,0.9,Active
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7feca6b59cf0>,Active,0.86,Active


In [44]:
export_predictions(predictions_tf, 'tf')

# Consensus

In [58]:
predictions_mlp = pd.read_excel("../dataset/screened/pubchem_hits_qsar_mlp_sirms.xlsx")
predictions_rf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_rf_sirms.xlsx")
predictions_svm = pd.read_excel("../dataset/screened/pubchem_hits_qsar_svm_sirms.xlsx")
predictions_tf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_tf_sirms.xlsx")

In [59]:
predictions_rf.rename(columns={'Prediction': 'rf', 'AD': 'rf_ad', 'Confidence': 'rf_score'}, inplace=True)
predictions_svm.rename(columns={'Prediction': 'svm', 'AD': 'svm_ad', 'Confidence': 'svm_score'}, inplace=True)
predictions_mlp.rename(columns={'Prediction': 'mlp', 'AD': 'mlp_ad', 'Confidence': 'mlp_score'}, inplace=True)
predictions_tf.rename(columns={'Prediction': 'tf', 'AD': 'tf_ad', 'Confidence': 'tf_score'}, inplace=True)

In [61]:
predictions = pd.merge(predictions_rf, predictions_svm[['CID', 'svm', 'svm_ad', 'svm_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_mlp[['CID', 'mlp', 'mlp_ad', 'mlp_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_tf[['CID', 'tf', 'tf_ad', 'tf_score']], how='inner', on='CID')

In [62]:
for col in ['rf', 'rf_ad', 'svm', 'svm_ad', 'mlp', 'mlp_ad', 'tf', 'tf_ad']:
    predictions[col].replace('Inactive',0,inplace=True)
    predictions[col].replace('Active',1,inplace=True)
predictions.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,rf_score,svm,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.59,0,,0.5,1,1.0,0.87,1,1.0,0.78
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.51,1,,0.56,1,,0.68,0,,0.63
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.54,0,,0.51,1,1.0,0.98,1,1.0,0.93
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.61,0,,0.51,1,1.0,1.0,1,1.0,0.9
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.71,1,,0.58,1,1.0,1.0,1,1.0,0.86


In [66]:
# Consensus
predictions['consensus'] = (predictions.rf + predictions.svm + predictions.mlp + predictions.tf)/4
predictions['consensus'] = np.where(predictions['consensus'] > 0.5, 1, np.where(predictions['consensus'] == 0.5, np.nan, 0))

In [73]:
# Consensus AD
# Crie uma função para calcular a média dos valores não nulos em cada linha
def calculate_consensus(row):
    values = row[['rf_ad', 'svm_ad', 'mlp_ad', 'tf_ad']].dropna()
    if len(values) == 0:
        return np.nan
    return values.mean()

# Aplique a função a cada linha do DataFrame
predictions['consensus_ad'] = predictions.apply(calculate_consensus, axis=1)

# Arredonde os valores para 0 ou 1 com base no limite de 0,5
predictions['consensus_ad'] = np.where(predictions['consensus_ad'] > 0.5, 1, np.where(predictions['consensus_ad'] == 0.5, np.nan, 0))

In [74]:
predictions[predictions['consensus'] == 1]

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.50,1,1.0,0.87,1,1.0,0.78,1.0,1.0,2.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.51,1,1.0,0.98,1,1.0,0.93,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.51,1,1.0,1.00,1,1.0,0.90,1.0,1.0,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.58,1,1.0,1.00,1,1.0,0.86,1.0,1.0,2.0
5,56,42743569,2,3,367.90,4.4,O=C(Cc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,2,1,...,0.57,1,1.0,1.00,1,1.0,0.80,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101077,158532,71643898,2,4,247.36,3.0,NC1CCC(Nc2nsc3ccccc23)CC1,C1CC(CCC1N)NC2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101078,158534,71644150,2,4,247.36,3.0,NC1CCCCC1Nc1nsc2ccccc12,C1CCC(C(C1)N)NC2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101082,158542,71644591,1,4,247.36,2.6,NCC1CCCN(c2nsc3ccccc23)C1,C1CC(CN(C1)C2=NSC3=CC=CC=C32)CN,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101084,158546,71644884,1,4,247.36,2.6,NCC1CCN(c2nsc3ccccc23)CC1,C1CN(CCC1CN)C2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0


In [69]:
predictions.head(100)

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,,0.50,1,1.0,0.87,1,1.0,0.78,1.0,1.0
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,,0.56,1,,0.68,0,,0.63,,0.0
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,,0.51,1,1.0,0.98,1,1.0,0.93,1.0,1.0
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,,0.51,1,1.0,1.00,1,1.0,0.90,1.0,1.0
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,,0.58,1,1.0,1.00,1,1.0,0.86,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,304,113087103,1,1,385.3,4.6,O=C(Cc1ccc(Cl)cc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=C(...,7,1,...,,0.53,1,1.0,0.96,1,1.0,0.74,,1.0
96,305,113087105,1,1,364.9,4.2,O=C(CCc1ccccc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CCC4=CC=C...,10,1,...,,0.50,1,1.0,0.99,1,1.0,0.76,1.0,1.0
97,306,113087681,1,1,344.9,4.8,O=C(C1CCCCC1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CCC(CC1)C(=O)N2CCC(CC2)C3=CNC4=C3C=CC(=C4)Cl,10,1,...,,0.52,1,1.0,0.99,1,1.0,0.97,1.0,1.0
98,307,113087702,1,1,352.9,4.3,O=C(Cc1ccccc1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CCC1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=CC=C4,11,1,...,,0.55,1,1.0,1.00,1,1.0,0.86,1.0,1.0


In [70]:
predictions['count']=pd.concat([predictions['consensus'],predictions['consensus_ad']],axis=1).sum(axis=1)
predictions

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.50,1,1.0,0.87,1,1.0,0.78,1.0,1.0,2.0
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.56,1,,0.68,0,,0.63,,0.0,0.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.51,1,1.0,0.98,1,1.0,0.93,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.51,1,1.0,1.00,1,1.0,0.90,1.0,1.0,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.58,1,1.0,1.00,1,1.0,0.86,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,C1CCN(CC1)CCNC2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,,1.0,1.0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,[CH2-]N1CCN(CC1)C2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,0.95,1,1.0,0.91,,1.0,1.0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,CNCCCNC1=NSC2=CC=CC=C21,1,1,...,0.53,1,1.0,1.00,1,1.0,0.95,,1.0,1.0
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,CC[C@@H](CN1CCN(CC1)C2=NSC3=CC=CC=C32)C(CC)CN,1,1,...,0.53,1,1.0,1.00,1,1.0,0.99,1.0,1.0,2.0


In [71]:
# selecting rows based on condition 
hits = predictions[predictions['count'] == 2.0] 
print(hits.shape)
hits

(28182, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.50,1,1.0,0.87,1,1.0,0.78,1.0,1.0,2.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.51,1,1.0,0.98,1,1.0,0.93,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.51,1,1.0,1.00,1,1.0,0.90,1.0,1.0,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.58,1,1.0,1.00,1,1.0,0.86,1.0,1.0,2.0
5,56,42743569,2,3,367.90,4.4,O=C(Cc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,2,1,...,0.57,1,1.0,1.00,1,1.0,0.80,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101077,158532,71643898,2,4,247.36,3.0,NC1CCC(Nc2nsc3ccccc23)CC1,C1CC(CCC1N)NC2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101078,158534,71644150,2,4,247.36,3.0,NC1CCCCC1Nc1nsc2ccccc12,C1CCC(C(C1)N)NC2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101082,158542,71644591,1,4,247.36,2.6,NCC1CCCN(c2nsc3ccccc23)C1,C1CC(CN(C1)C2=NSC3=CC=CC=C32)CN,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0
101084,158546,71644884,1,4,247.36,2.6,NCC1CCN(c2nsc3ccccc23)CC1,C1CN(CCC1CN)C2=NSC3=CC=CC=C32,1,1,...,0.53,1,1.0,1.00,1,1.0,0.97,1.0,1.0,2.0


In [72]:
with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_sirms_consensus.xlsx') as writer:
    predictions.to_excel(writer, sheet_name='consensus', index=False)