In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip
import joblib

from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

def warn(*args, **kwargs):
    pass
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn
from tensorflow.keras.models import load_model
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

2023-09-12 19:49:51.488803: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-09-12 19:49:51.488824: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.kernel.execute("set_timeout(60000)")'))

<IPython.core.display.Javascript object>

# Selecione o Descritor

In [3]:
descritores = "morgan"
#descritores = "sirms"
#descritores = "padel"
#descritores = "rdkit"

In [4]:
from sklearn import metrics

def stats(y_test, y_pred):
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0,1])
    Kappa = metrics.cohen_kappa_score(y_test, y_pred, weights='linear')
    # Valores verdadeiros e falsos
    TN, FP, FN, TP = confusion_matrix.ravel()
    # Accuracy
    AC = (TP+TN)/(TP+FP+FN+TN)
    # Sensibilidade, taxa de acerto, recall ou taxa positiva verdadeira
    SE = TP/(TP+FN)
    # Especificidade ou taxa negativa verdadeira
    SP = TN/(TN+FP)
    # Precisão ou valor preditivo positivo
    PPV = TP/(TP+FP)
    # Valor preditivo negativo
    NPV = TN/(TN+FN)
    # Taxa de classificação correta
    CCR = (SE + SP)/2   
    # F1 Score
    F1_score = 2*(PPV*SE)/(PPV+SE)
    d = dict({'Kappa': Kappa,
         'AUC': CCR,
         'Sensibilidade': SE,
         'PPV': PPV,
         'Especificidade': SP,
         'NPV': NPV,
         'Acurácia': AC,
         'F1 Score':F1_score})
    return pd.DataFrame(d, columns=d.keys(), index=[0]).round(2)

In [5]:
def predictions(model, X_vs):
    ad_threshold = 0.70

    y_pred = model.predict(X_vs)
    confidence = model.predict_proba(X_vs)
    confidence = np.amax(confidence, axis=1).round(2)
    ad = confidence >= ad_threshold

    pred = pd.DataFrame({'Prediction': y_pred, 'AD': ad, 'Confidence': confidence}, index=None)
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    return pred

In [6]:
def predictions_tf(model, X_vs):
    ad_threshold = 0.70

    # Faça as previsões usando o modelo TensorFlow
    y_pred_prob = model.predict(X_vs)
    
    # Aplique um limite às probabilidades para determinar as previsões binárias
    y_pred = (y_pred_prob >= ad_threshold).astype(int)
    
    # Calcule a confiança como a maior probabilidade prevista (arredondada para 2 casas decimais)
    confidence = np.amax(y_pred_prob, axis=1).round(2)
    
    # Crie uma lista de listas para representar o DataFrame
    pred_list = []
    for i in range(len(y_pred)):
        pred_list.append([y_pred[i], confidence[i], confidence[i] >= ad_threshold])

    # Crie o DataFrame final
    pred = pd.DataFrame(pred_list, columns=['Prediction', 'Confidence', 'AD'])
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    
    return pred

In [7]:
def status_predictions(pred):
    pred_ad = pred.dropna().astype(int)
    coverage_ad = len(pred_ad) * 100 / len(pred)

    print('VS pred: %s' % Counter(pred.Prediction))
    print('VS pred AD: %s' % Counter(pred_ad.Prediction))
    print('Coverage of AD: %.2f%%' % coverage_ad)

In [8]:
def visualize_predictions(pred):
    predictions = pd.concat([moldf, pred], axis=1)
    for col in ['Prediction', 'AD']:
        predictions[col].replace(0,'Inactive',inplace=True)
        predictions[col].replace(1,'Active',inplace=True)
    return predictions

In [9]:
def export_predictions(predictions, algoritimo):
    predictions.drop(columns='Mol', inplace=True)
    with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_'+algoritimo+'_morgan.xlsx') as writer:
        predictions.to_excel(writer, sheet_name='morgan', index=False)

In [10]:
from rdkit.Chem import PandasTools
# Set file path and format
file = '../dataset/formats/virtual_molecuke.sdf'
sdfInfo = dict(smilesName='CanonicalSMILES', molColName='ROMol')
moldf = PandasTools.LoadSDF(file, **sdfInfo)
print('Original data: ', moldf.shape)
# Rename ROMol
moldf = moldf.rename(columns={'ROMol': 'Mol'})
# Remove missing RDKit molecules
moldf = moldf[pd.notnull(moldf['Mol'])]
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)

Original data:  (101097, 12)
Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7fbddc48d330>


In [11]:
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)
from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)
print('\n Problematic structures: \n', validator.validate(moldf))

Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7fbddc48d330>



 Problematic structures: 
 []


In [12]:
#def calcfp(mol,funcFPInfo=dict(radius=3, nBits=2048, useFeatures=False, useChirality=False)):
#    fp = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
#    fp = pd.Series(np.asarray(fp))
#    fp = fp.add_prefix('Bit_')
#    return fp

import pandas as pd

# Especifique o tamanho da parte
#tamanho_da_parte = 10000  # Por exemplo, divida em partes de 10.000 linhas

# Calcule o número total de partes necessárias
#num_partes = len(moldf) // tamanho_da_parte + 1

# Divida o conjunto de dados em partes menores e processe cada parte
#resultados_intermediarios = []
#for i in range(num_partes):
#    inicio = i * tamanho_da_parte
#    fim = (i + 1) * tamanho_da_parte
#    parte = moldf[inicio:fim]
    
    # Realize o processamento na parte atual (substitua por sua lógica de processamento)
#    parte_processada = parte.Mol.apply(calcfp)
    
    # Armazene os resultados intermediários em uma lista
#    resultados_intermediarios.append(parte_processada)

# Combine os resultados intermediários em um único DataFrame
#resultado_final = pd.concat(resultados_intermediarios, ignore_index=True)

#resultado_final.to_csv('../descriptors/generate/morgan/virtual_screening_morgan_descriptors.csv', index=False)
# Leia o arquivo CSV e crie o DataFrame
X_vs = pd.read_csv("../descriptors/generate/morgan/virtual_screening_morgan_descriptors.csv", delimiter=",", error_bad_lines=False)

In [13]:
X_vs.index

RangeIndex(start=0, stop=101097, step=1)

In [14]:
X_vs.index = range(len(X_vs))

In [15]:
X_vs.head()

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_2038,Bit_2039,Bit_2040,Bit_2041,Bit_2042,Bit_2043,Bit_2044,Bit_2045,Bit_2046,Bit_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Load the models

In [12]:
model_rf = joblib.load('../models/pkl/cov_rf_morgan.pkl')

In [13]:
model_svm = joblib.load('../models/pkl/cov_svm_morgan.pkl')

In [14]:
model_mlp = joblib.load('../models/pkl/cov_mlp_morgan.pkl')

In [15]:
model_tf = load_model('../models/sequential_h5/morgan_melhor_modelo.h5')



2023-09-12 19:36:17.294287: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-09-12 19:36:17.294310: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-09-12 19:36:17.294322: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (note-Vostro-5490): /proc/driver/nvidia/version does not exist
2023-09-12 19:36:17.294490: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load the scaler

In [20]:
scaler = joblib.load('../models/pkl/logBB_scale_morgan.pkl')

In [21]:
X_vs = scaler.fit_transform(X_vs)

In [22]:
X_vs.shape

(101097, 2048)

In [23]:
X_vs = np.nan_to_num(X_vs)

In [24]:
X_vs_df = pd.DataFrame(X_vs)

In [25]:
X_vs_df.to_csv('../descriptors/generate/morgan/virtual_screening_morgan_x.csv', index=False)

In [16]:
X_vs = pd.read_csv("../descriptors/generate/morgan/virtual_screening_morgan_x.csv", delimiter=",", error_bad_lines=False)

# Predict molecules - Random forest

In [17]:
%%time
pred_rf = predictions(model_rf, X_vs)

CPU times: user 8.16 s, sys: 0 ns, total: 8.16 s
Wall time: 8.17 s


In [18]:
status_predictions(pred_rf)

VS pred: Counter({0: 100976, 1: 121})
VS pred AD: Counter({0: 41060})
Coverage of AD: 40.61%


### Visualize predictions - Random forest

In [19]:
predictions_rf = visualize_predictions(pred_rf)
predictions_rf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b330>,Inactive,,0.66
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b450>,Inactive,,0.68
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b4b0>,Inactive,,0.63
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b5d0>,Inactive,,0.62
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b630>,Inactive,,0.62


### Export SDF and Excel - Random forest

In [20]:
export_predictions(predictions_rf, 'rf')

# Predict molecules - SVM

In [21]:
%%time
pred_svm = predictions(model_svm, X_vs)

CPU times: user 7min 31s, sys: 1.37 s, total: 7min 32s
Wall time: 7min 32s


In [22]:
status_predictions(pred_svm)

VS pred: Counter({1: 59282, 0: 41815})
VS pred AD: Counter({1: 45152, 0: 25037})
Coverage of AD: 69.43%


### Visualize predictions - SVM

In [23]:
predictions_svm = visualize_predictions(pred_svm)
predictions_svm.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b330>,Inactive,,0.62
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b450>,Active,Active,0.99
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b4b0>,Inactive,,0.66
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b5d0>,Active,Active,0.95
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b630>,Active,Active,0.85


### Export SDF and Excel - SVM

In [24]:
export_predictions(predictions_svm,'svm')

# Predict molecules - MLP

In [25]:
%%time
pred_mlp = predictions(model_mlp, X_vs)

CPU times: user 17.6 s, sys: 1.87 s, total: 19.4 s
Wall time: 2.62 s


In [26]:
status_predictions(pred_mlp)

VS pred: Counter({1: 50566, 0: 50531})
VS pred AD: Counter({0: 41262, 1: 41198})
Coverage of AD: 81.57%


### Visualize predictions - MLP

In [27]:
predictions_mlp = visualize_predictions(pred_mlp)
predictions_mlp.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b330>,Inactive,Inactive,0.78
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b450>,Active,Active,1.0
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b4b0>,Inactive,,0.65
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b5d0>,Active,Active,0.98
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b630>,Active,Active,0.85


### Export SDF and Excel - MLP

In [28]:
export_predictions(predictions_mlp, 'mlp')

# Predict molecules - TF

In [29]:
%%time
pred_tf = predictions_tf(model_tf, X_vs)

   1/3160 [..............................] - ETA: 7:58

2023-09-12 19:46:23.303032: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1656373248 exceeds 10% of free system memory.


CPU times: user 21.1 s, sys: 1.21 s, total: 22.3 s
Wall time: 10.5 s


In [30]:
status_predictions(pred_tf)

TypeError: unhashable type: 'numpy.ndarray'

In [31]:
predictions_tf = visualize_predictions(pred_tf)
predictions_tf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,Confidence,AD
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b330>,Inactive,0.2,
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b450>,Active,0.94,Active
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b4b0>,Inactive,0.41,
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b5d0>,Active,0.98,Active
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7efcf363b630>,Inactive,0.52,


In [None]:
export_predictions(predictions_tf, 'tf')

# Consensus

In [2]:
import pandas as pd
predictions_mlp = pd.read_excel("../dataset/screened/pubchem_hits_qsar_mlp_morgan.xlsx")
predictions_rf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_rf_morgan.xlsx")
predictions_svm = pd.read_excel("../dataset/screened/pubchem_hits_qsar_svm_morgan.xlsx")
predictions_tf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_tf_morgan.xlsx")

In [3]:
predictions_rf.rename(columns={'Prediction': 'rf', 'AD': 'rf_ad', 'Confidence': 'rf_score'}, inplace=True)
predictions_svm.rename(columns={'Prediction': 'svm', 'AD': 'svm_ad', 'Confidence': 'svm_score'}, inplace=True)
predictions_mlp.rename(columns={'Prediction': 'mlp', 'AD': 'mlp_ad', 'Confidence': 'mlp_score'}, inplace=True)
predictions_tf.rename(columns={'Prediction': 'tf', 'AD': 'tf_ad', 'Confidence': 'tf_score'}, inplace=True)

In [4]:
predictions = pd.merge(predictions_rf, predictions_svm[['CID', 'svm', 'svm_ad', 'svm_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_mlp[['CID', 'mlp', 'mlp_ad', 'mlp_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_tf[['CID', 'tf', 'tf_ad', 'tf_score']], how='inner', on='CID')

In [5]:
for col in ['rf', 'rf_ad', 'svm', 'svm_ad', 'mlp', 'mlp_ad', 'tf', 'tf_ad']:
    predictions[col].replace('Inactive',0,inplace=True)
    predictions[col].replace('Active',1,inplace=True)
predictions.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,rf_score,svm,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.66,0,,0.62,0,0.0,0.78,0,,0.2
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.68,1,1.0,0.99,1,1.0,1.0,1,1.0,0.94
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.63,0,,0.66,0,,0.65,0,,0.41
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.62,1,1.0,0.95,1,1.0,0.98,1,1.0,0.98
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.62,1,1.0,0.85,1,1.0,0.85,0,,0.52


In [6]:
import numpy as np
# Consensus
predictions['consensus'] = (predictions.rf + predictions.svm + predictions.mlp + predictions.tf)/4
predictions['consensus'] = np.where(predictions['consensus'] > 0.5, 1, np.where(predictions['consensus'] == 0.5, np.nan, 0))

In [7]:
# Consensus AD
# Crie uma função para calcular a média dos valores não nulos em cada linha
def calculate_consensus(row):
    values = row[['rf_ad', 'svm_ad', 'mlp_ad', 'tf_ad']].dropna()
    if len(values) == 0:
        return np.nan
    return values.mean()

# Aplique a função a cada linha do DataFrame
predictions['consensus_ad'] = predictions.apply(calculate_consensus, axis=1)

# Arredonde os valores para 0 ou 1 com base no limite de 0,5
predictions['consensus_ad'] = np.where(predictions['consensus_ad'] > 0.5, 1, np.where(predictions['consensus_ad'] == 0.5, np.nan, 0))

In [8]:
predictions[predictions['consensus'] == 1]

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,1.0,0.99,1,1.0,1.00,1,1.0,0.94,1.0,1.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,1.0,0.95,1,1.0,0.98,1,1.0,0.98,1.0,1.0
14,86,142055963,1,2,395.90,4.3,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)CCc2c[nH]c3ccccc23)C1,CN(C)CC1CC2=C(C=CC=C2Cl)N(C1)C(=O)CCC3=CNC4=CC...,12,1,...,1.0,0.99,1,1.0,1.00,1,1.0,1.00,1.0,1.0
15,88,141588898,2,3,410.90,3.4,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)C(N)Cc2c[nH]c3cccc...,CN(C)CC1CC2=C(C=CC=C2Cl)N(C1)C(=O)[C@@H](CC3=C...,9,1,...,1.0,0.95,1,1.0,0.92,1,1.0,0.99,1.0,1.0
19,101,91601742,2,4,454.90,4.9,CCNCc1ccc(N=C(c2ccc3ncccc3c2)C2C(=O)Nc3cc(Cl)c...,CCNCC1=CC=C(C=C1)N=C(C2C3=C(C=C(C=C3)Cl)NC2=O)...,5,1,...,1.0,0.95,1,1.0,0.93,1,1.0,0.87,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101086,158551,72699874,2,5,417.60,4.2,CNC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CNC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,,0.61,1,1.0,0.95,1,1.0,0.73,1.0,1.0
101088,158553,83637058,1,3,192.28,3.4,CC(C)Nc1nsc2ccccc12,CC(C)NC1=NSC2=CC=CC=C21,1,1,...,1.0,0.93,1,1.0,0.96,1,1.0,0.87,1.0,1.0
101089,158554,94830332,2,4,261.35,1.6,N=C(N)N1CCN(c2nsc3ccccc23)CC1,C1CN(CCN1C2=NSC3=CC=CC=C32)C(=N)N,1,1,...,1.0,0.92,1,1.0,0.99,1,1.0,0.90,1.0,1.0
101090,158585,129897191,1,4,219.31,2.6,c1ccc2c(N3CCCNC3)nsc2c1,C1CNCN(C1)C2=NSC3=CC=CC=C32,1,1,...,1.0,0.98,1,1.0,0.97,1,1.0,0.88,1.0,1.0


In [9]:
predictions.head(100)

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,,0.62,0,0.0,0.78,0,,0.20,0.0,0.0
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,1.0,0.99,1,1.0,1.00,1,1.0,0.94,1.0,1.0
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,,0.66,0,,0.65,0,,0.41,0.0,0.0
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,1.0,0.95,1,1.0,0.98,1,1.0,0.98,1.0,1.0
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,1.0,0.85,1,1.0,0.85,0,,0.52,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,304,113087103,1,1,385.3,4.6,O=C(Cc1ccc(Cl)cc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=C(...,7,1,...,1.0,0.92,0,0.0,0.95,0,,0.04,0.0,
96,305,113087105,1,1,364.9,4.2,O=C(CCc1ccccc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CCC4=CC=C...,10,1,...,1.0,0.90,0,0.0,0.72,0,,0.19,0.0,
97,306,113087681,1,1,344.9,4.8,O=C(C1CCCCC1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CCC(CC1)C(=O)N2CCC(CC2)C3=CNC4=C3C=CC(=C4)Cl,10,1,...,1.0,0.85,0,,0.62,0,,0.07,0.0,1.0
98,307,113087702,1,1,352.9,4.3,O=C(Cc1ccccc1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CCC1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=CC=C4,11,1,...,,0.59,0,0.0,0.93,0,,0.02,0.0,0.0


In [10]:
predictions['count']=pd.concat([predictions['consensus'],predictions['consensus_ad']],axis=1).sum(axis=1)
predictions

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.62,0,0.0,0.78,0,,0.20,0.0,0.0,0.0
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.99,1,1.0,1.00,1,1.0,0.94,1.0,1.0,2.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.66,0,,0.65,0,,0.41,0.0,0.0,0.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.95,1,1.0,0.98,1,1.0,0.98,1.0,1.0,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.85,1,1.0,0.85,0,,0.52,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,C1CCN(CC1)CCNC2=NSC3=CC=CC=C32,1,1,...,0.78,0,,0.55,0,,0.13,0.0,1.0,1.0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,[CH2-]N1CCN(CC1)C2=NSC3=CC=CC=C32,1,1,...,0.86,1,1.0,0.70,0,,0.51,,1.0,1.0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,CNCCCNC1=NSC2=CC=CC=C21,1,1,...,0.57,0,0.0,0.96,0,,0.19,0.0,0.0,0.0
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,CC[C@@H](CN1CCN(CC1)C2=NSC3=CC=CC=C32)C(CC)CN,1,1,...,0.97,1,1.0,0.84,0,,0.67,,1.0,1.0


In [11]:
# selecting rows based on condition 
hits = predictions[predictions['count'] == 2.0] 
print(hits.shape)
hits

(28965, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.99,1,1.0,1.00,1,1.0,0.94,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.95,1,1.0,0.98,1,1.0,0.98,1.0,1.0,2.0
14,86,142055963,1,2,395.90,4.3,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)CCc2c[nH]c3ccccc23)C1,CN(C)CC1CC2=C(C=CC=C2Cl)N(C1)C(=O)CCC3=CNC4=CC...,12,1,...,0.99,1,1.0,1.00,1,1.0,1.00,1.0,1.0,2.0
15,88,141588898,2,3,410.90,3.4,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)C(N)Cc2c[nH]c3cccc...,CN(C)CC1CC2=C(C=CC=C2Cl)N(C1)C(=O)[C@@H](CC3=C...,9,1,...,0.95,1,1.0,0.92,1,1.0,0.99,1.0,1.0,2.0
19,101,91601742,2,4,454.90,4.9,CCNCc1ccc(N=C(c2ccc3ncccc3c2)C2C(=O)Nc3cc(Cl)c...,CCNCC1=CC=C(C=C1)N=C(C2C3=C(C=C(C=C3)Cl)NC2=O)...,5,1,...,0.95,1,1.0,0.93,1,1.0,0.87,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101086,158551,72699874,2,5,417.60,4.2,CNC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CNC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,0.61,1,1.0,0.95,1,1.0,0.73,1.0,1.0,2.0
101088,158553,83637058,1,3,192.28,3.4,CC(C)Nc1nsc2ccccc12,CC(C)NC1=NSC2=CC=CC=C21,1,1,...,0.93,1,1.0,0.96,1,1.0,0.87,1.0,1.0,2.0
101089,158554,94830332,2,4,261.35,1.6,N=C(N)N1CCN(c2nsc3ccccc23)CC1,C1CN(CCN1C2=NSC3=CC=CC=C32)C(=N)N,1,1,...,0.92,1,1.0,0.99,1,1.0,0.90,1.0,1.0,2.0
101090,158585,129897191,1,4,219.31,2.6,c1ccc2c(N3CCCNC3)nsc2c1,C1CNCN(C1)C2=NSC3=CC=CC=C32,1,1,...,0.98,1,1.0,0.97,1,1.0,0.88,1.0,1.0,2.0


In [12]:
with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_morgan_consensus.xlsx') as writer:
    predictions.to_excel(writer, sheet_name='consensus', index=False)