In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip
import joblib

from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

def warn(*args, **kwargs):
    pass
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn
from tensorflow.keras.models import load_model
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [22]:
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.kernel.execute("set_timeout(60000)")'))

<IPython.core.display.Javascript object>

# Selecione o Descritor

In [23]:
#descritores = "morgan"
#descritores = "sirms"
#descritores = "padel"
descritores = "rdkit"

In [24]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [25]:
def variable_adjustments(desc):
    desc.drop(desc.columns[0:1], axis=1,inplace=True)
    ##### Remover variáveis constantes e quase constantes
    desc = desc.select_dtypes(exclude=['object'])
    desc = desc.dropna(axis=1, how='any')
    desc = desc.fillna(0)

    desc.fillna(desc.mean())

    ##### Remover variáveis correlacionadas
    correlated_features = set()  
    correlation_matrix = desc.corr()

    for i in range(len(correlation_matrix.columns)):  
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.9:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    desc.drop(labels=correlated_features, axis=1, inplace=True)
    return desc

In [26]:
from sklearn import metrics

def stats(y_test, y_pred):
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0,1])
    Kappa = metrics.cohen_kappa_score(y_test, y_pred, weights='linear')
    # Valores verdadeiros e falsos
    TN, FP, FN, TP = confusion_matrix.ravel()
    # Accuracy
    AC = (TP+TN)/(TP+FP+FN+TN)
    # Sensibilidade, taxa de acerto, recall ou taxa positiva verdadeira
    SE = TP/(TP+FN)
    # Especificidade ou taxa negativa verdadeira
    SP = TN/(TN+FP)
    # Precisão ou valor preditivo positivo
    PPV = TP/(TP+FP)
    # Valor preditivo negativo
    NPV = TN/(TN+FN)
    # Taxa de classificação correta
    CCR = (SE + SP)/2   
    # F1 Score
    F1_score = 2*(PPV*SE)/(PPV+SE)
    d = dict({'Kappa': Kappa,
         'AUC': CCR,
         'Sensibilidade': SE,
         'PPV': PPV,
         'Especificidade': SP,
         'NPV': NPV,
         'Acurácia': AC,
         'F1 Score':F1_score})
    return pd.DataFrame(d, columns=d.keys(), index=[0]).round(2)

In [27]:
def predictions(model, X_vs):
    ad_threshold = 0.70

    y_pred = model.predict(X_vs)
    confidence = model.predict_proba(X_vs)
    confidence = np.amax(confidence, axis=1).round(2)
    ad = confidence >= ad_threshold

    pred = pd.DataFrame({'Prediction': y_pred, 'AD': ad, 'Confidence': confidence}, index=None)
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    return pred

In [28]:
def predictions_tf(model, X_vs):
    ad_threshold = 0.70

    # Faça as previsões usando o modelo TensorFlow
    y_pred_prob = model.predict(X_vs)
    
    # Aplique um limite às probabilidades para determinar as previsões binárias
    y_pred = (y_pred_prob >= ad_threshold).astype(int)
    
    # Calcule a confiança como a maior probabilidade prevista (arredondada para 2 casas decimais)
    confidence = np.amax(y_pred_prob, axis=1).round(2)
    
    # Crie uma lista de listas para representar o DataFrame
    pred_list = []
    for i in range(len(y_pred)):
        pred_list.append([y_pred[i], confidence[i], confidence[i] >= ad_threshold])

    # Crie o DataFrame final
    pred = pd.DataFrame(pred_list, columns=['Prediction', 'Confidence', 'AD'])
    pred.AD[pred.AD == False] = np.nan
    pred.AD[pred.AD == True] = pred.Prediction.astype(int)
    
    return pred

In [29]:
def status_predictions(pred):
    pred_ad = pred.dropna().astype(int)
    coverage_ad = len(pred_ad) * 100 / len(pred)

    print('VS pred: %s' % Counter(pred.Prediction))
    print('VS pred AD: %s' % Counter(pred_ad.Prediction))
    print('Coverage of AD: %.2f%%' % coverage_ad)

In [30]:
def visualize_predictions(pred):
    predictions = pd.concat([moldf, pred], axis=1)
    for col in ['Prediction', 'AD']:
        predictions[col].replace(0,'Inactive',inplace=True)
        predictions[col].replace(1,'Active',inplace=True)
    return predictions

In [31]:
def export_predictions(predictions, algoritimo):
    predictions.drop(columns='Mol', inplace=True)
    with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_'+algoritimo+'_rdkit.xlsx') as writer:
        predictions.to_excel(writer, sheet_name='rdkit', index=False)

In [32]:
from rdkit.Chem import PandasTools
# Set file path and format
file = '../dataset/formats/virtual_molecuke.sdf'
sdfInfo = dict(smilesName='CanonicalSMILES', molColName='ROMol')
moldf = PandasTools.LoadSDF(file, **sdfInfo)
print('Original data: ', moldf.shape)
# Rename ROMol
moldf = moldf.rename(columns={'ROMol': 'Mol'})
# Remove missing RDKit molecules
moldf = moldf[pd.notnull(moldf['Mol'])]
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)

Original data:  (101097, 12)
Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f0f37871870>


In [33]:
# Columns
print('Kept data: ', moldf.shape)
moldf.head(1)
from molvs.validate import Validator
fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s'
validator = Validator(log_format=fmt)
print('\n Problematic structures: \n', validator.validate(moldf))

Kept data:  (101097, 12)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f0f37871870>



 Problematic structures: 
 []


In [43]:
X_vs = pd.read_csv('../descriptors/generate/rdkit/virtual_screening_rdkit_ml_descriptors.csv', sep=',')

In [44]:
print(X_vs.shape)
X_vs.head()

(101097, 209)


Unnamed: 0.1,Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,12.300769,0.000678,12.300769,0.000678,0.485441,389.886,369.726,389.12949,140,...,0,0,0,0,0,0,0,0,0,0
1,1,11.976227,0.087264,11.976227,0.087264,0.703084,326.827,307.675,326.118591,118,...,0,0,0,0,0,0,0,0,0,0
2,2,12.710492,-0.550667,12.710492,0.067372,0.358975,449.986,421.762,449.198238,166,...,0,0,0,0,0,0,0,0,0,0
3,3,13.659796,-0.389916,13.659796,0.127136,0.48172,447.97,421.762,447.182588,164,...,0,0,0,0,0,0,0,0,0,0
4,4,11.900016,0.114647,11.900016,0.114647,0.521809,381.907,357.715,381.16079,140,...,0,0,0,0,0,0,0,0,3,0


In [45]:
train_desc = pd.read_csv('../descriptors/generate/rdkit/processed/rdkit-chembl-alzheimer-acetilcolinesterase-processed.txt', sep='\t')
desc_list = train_desc.columns.tolist()
print(train_desc.shape)
train_desc.head()

(4829, 207)


Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,BertzCT,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
0,2.107397,-2.070895,2.174202,-2.129402,5.379318,0.238341,16.612224,10.215628,1.782327,823.864699,...,0,0,0,0,0,0,0,0,0,0.69996
1,2.2649,-2.258138,2.348823,-2.388739,7.987253,0.182183,35.495692,10.140341,1.615899,750.9366,...,0,0,0,0,0,0,0,0,0,0.734047
2,2.300456,-2.169923,2.28683,-2.319987,7.987698,-0.104742,35.495692,10.197185,1.966831,961.212101,...,0,0,0,0,0,0,0,0,0,0.527034
3,2.304634,-2.258365,2.285829,-2.389328,7.987688,-0.104745,35.495692,10.140341,1.909572,772.792312,...,0,0,0,0,0,0,0,0,0,0.695219
4,2.306183,-2.095377,2.259567,-2.282378,7.980216,-0.274187,32.166427,10.184613,2.301423,698.235461,...,0,0,0,0,0,0,0,0,0,0.799502


## Filter out descriptors not present in the model

In [46]:
miss_desc = train_desc.columns.difference(X_vs.columns).tolist()
miss_desc = pd.DataFrame([[0]*len(miss_desc)]*X_vs.shape[0], columns=miss_desc)
X_vs = pd.concat([X_vs, miss_desc], axis=1)
X_vs = X_vs[desc_list]
X_vs.shape
X_vs.fillna(0, inplace=True)

(101097, 207)

In [47]:
X_vs.index = range(len(X_vs))

In [48]:
X_vs.head()

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,BertzCT,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
0,2.106161,-2.053889,2.238982,-2.117261,6.300904,-0.115697,35.495692,10.086945,1.507095,1088.00349,...,0,0,0,0,0,0,0,0,0,0.485441
1,2.060557,-2.052992,2.183949,-2.253456,6.300861,-0.12052,35.495691,10.085211,1.451843,792.494645,...,0,0,0,0,0,0,0,0,0,0.703084
2,2.238849,-2.32353,2.242699,-2.496245,6.308223,-0.122903,35.495692,10.014253,1.126636,1237.584552,...,0,0,0,0,0,0,0,0,0,0.358975
3,2.277923,-2.347791,2.267712,-2.518814,6.309055,-0.133101,35.495692,10.02287,1.48535,1261.660393,...,0,0,0,0,0,0,0,0,0,0.48172
4,2.032308,-2.098052,2.237666,-2.245505,6.308885,-0.120562,35.495692,10.107341,1.332698,883.120102,...,0,0,0,0,0,0,0,3,0,0.521809


# Load the models

In [49]:
model_rf = joblib.load('../models/pkl/cov_rf_rdkit.pkl')

In [50]:
model_svm = joblib.load('../models/pkl/cov_svm_rdkit.pkl')

In [51]:
model_mlp = joblib.load('../models/pkl/cov_mlp_rdkit.pkl')

In [52]:
model_tf = load_model('../models/sequential_h5/rdkit_melhor_modelo.h5')



# Load the scaler

In [53]:
scaler = joblib.load('../models/pkl/logBB_scale_rdkit.pkl')

In [54]:
X_vs = X_vs.replace([np.inf, -np.inf], np.nan)  # Substitui infinitos por NaN
X_vs = X_vs.dropna()  # Remove linhas com NaN
X_vs = scaler.fit_transform(X_vs)

In [55]:
X_vs.shape

(101096, 207)

In [56]:
X_vs = np.nan_to_num(X_vs)

In [57]:
X_vs_df = pd.DataFrame(X_vs)

In [58]:
X_vs.shape

(101096, 207)

In [59]:
X_vs_df.to_csv('../descriptors/generate/rdkit/virtual_screening_rdkit_x.csv', index=False)

In [60]:
X_vs = pd.read_csv("../descriptors/generate/rdkit/virtual_screening_rdkit_x.csv", delimiter=",", error_bad_lines=False)

# Predict molecules - Random forest

In [61]:
%%time
pred_rf = predictions(model_rf, X_vs)

CPU times: user 7.33 s, sys: 27 ms, total: 7.36 s
Wall time: 7.36 s


In [62]:
status_predictions(pred_rf)

VS pred: Counter({0: 52886, 1: 48210})
VS pred AD: Counter({0: 11385, 1: 5798})
Coverage of AD: 17.00%


### Visualize predictions - Random forest

In [63]:
predictions_rf = visualize_predictions(pred_rf)
predictions_rf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598e10>,Active,,0.55
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f30>,Inactive,,0.57
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f90>,Active,,0.6
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b80f0>,Inactive,,0.6
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b8150>,Active,,0.68


### Export SDF and Excel - Random forest

In [64]:
export_predictions(predictions_rf, 'rf')

# Predict molecules - SVM

In [65]:
%%time
pred_svm = predictions(model_svm, X_vs)

CPU times: user 1min 17s, sys: 33 ms, total: 1min 17s
Wall time: 1min 17s


In [66]:
status_predictions(pred_svm)

VS pred: Counter({0: 59356, 1: 41740})
VS pred AD: Counter({0: 29676, 1: 26553})
Coverage of AD: 55.62%


### Visualize predictions - SVM

In [67]:
predictions_svm = visualize_predictions(pred_svm)
predictions_svm.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598e10>,Active,Active,0.79
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f30>,Inactive,,0.55
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f90>,Active,,0.53
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b80f0>,Active,,0.6
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b8150>,Active,Active,0.91


### Export SDF and Excel - SVM

In [68]:
export_predictions(predictions_svm,'svm')

# Predict molecules - MLP

In [69]:
%%time
pred_mlp = predictions(model_mlp, X_vs)

CPU times: user 2.87 s, sys: 2.92 s, total: 5.79 s
Wall time: 1.01 s


In [70]:
status_predictions(pred_mlp)

VS pred: Counter({1: 54195, 0: 46901})
VS pred AD: Counter({1: 48106, 0: 40952})
Coverage of AD: 88.09%


### Visualize predictions - MLP

In [71]:
predictions_mlp = visualize_predictions(pred_mlp)
predictions_mlp.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,AD,Confidence
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598e10>,Active,Active,0.95
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f30>,Active,Active,0.76
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f90>,Active,Active,0.97
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b80f0>,Inactive,Inactive,0.71
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b8150>,Active,Active,1.0


### Export SDF and Excel - MLP

In [72]:
export_predictions(predictions_mlp, 'mlp')

# Predict molecules - TF

In [73]:
%%time
pred_tf = predictions_tf(model_tf, X_vs)

CPU times: user 6.97 s, sys: 387 ms, total: 7.36 s
Wall time: 4.85 s


In [None]:
status_predictions(pred_tf)

In [74]:
predictions_tf = visualize_predictions(pred_tf)
predictions_tf.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,ID,Mol,Prediction,Confidence,AD
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598e10>,Active,0.97,Active
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f30>,Inactive,0.44,
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c8598f90>,Active,0.99,Active
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b80f0>,Active,0.97,Active
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,,<rdkit.Chem.rdchem.Mol object at 0x7f63c85b8150>,Active,1.0,Active


In [75]:
export_predictions(predictions_tf, 'tf')

# Consensus

In [2]:
import pandas as pd
predictions_mlp = pd.read_excel("../dataset/screened/pubchem_hits_qsar_mlp_rdkit.xlsx")
predictions_rf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_rf_rdkit.xlsx")
predictions_svm = pd.read_excel("../dataset/screened/pubchem_hits_qsar_svm_rdkit.xlsx")
predictions_tf = pd.read_excel("../dataset/screened/pubchem_hits_qsar_tf_rdkit.xlsx")

In [3]:
predictions_rf.rename(columns={'Prediction': 'rf', 'AD': 'rf_ad', 'Confidence': 'rf_score'}, inplace=True)
predictions_svm.rename(columns={'Prediction': 'svm', 'AD': 'svm_ad', 'Confidence': 'svm_score'}, inplace=True)
predictions_mlp.rename(columns={'Prediction': 'mlp', 'AD': 'mlp_ad', 'Confidence': 'mlp_score'}, inplace=True)
predictions_tf.rename(columns={'Prediction': 'tf', 'AD': 'tf_ad', 'Confidence': 'tf_score'}, inplace=True)

In [4]:
predictions = pd.merge(predictions_rf, predictions_svm[['CID', 'svm', 'svm_ad', 'svm_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_mlp[['CID', 'mlp', 'mlp_ad', 'mlp_score']], how='inner', on='CID')
predictions = pd.merge(predictions, predictions_tf[['CID', 'tf', 'tf_ad', 'tf_score']], how='inner', on='CID')

In [5]:
for col in ['rf', 'rf_ad', 'svm', 'svm_ad', 'mlp', 'mlp_ad', 'tf', 'tf_ad']:
    predictions[col].replace('Inactive',0,inplace=True)
    predictions[col].replace('Active',1,inplace=True)
predictions.head()

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,rf_score,svm,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.55,1.0,1.0,0.79,1.0,1.0,0.95,1.0,1.0,0.97
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.57,0.0,,0.55,1.0,1.0,0.76,0.0,,0.44
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.6,1.0,,0.53,1.0,1.0,0.97,1.0,1.0,0.99
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.6,1.0,,0.6,0.0,0.0,0.71,1.0,1.0,0.97
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.68,1.0,1.0,0.91,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
import numpy as np
# Consensus
predictions['consensus'] = (predictions.rf + predictions.svm + predictions.mlp + predictions.tf)/4
predictions['consensus'] = np.where(predictions['consensus'] > 0.5, 1, np.where(predictions['consensus'] == 0.5, np.nan, 0))

In [8]:
# Consensus AD
# Crie uma função para calcular a média dos valores não nulos em cada linha
def calculate_consensus(row):
    values = row[['rf_ad', 'svm_ad', 'mlp_ad', 'tf_ad']].dropna()
    if len(values) == 0:
        return np.nan
    return values.mean()

# Aplique a função a cada linha do DataFrame
predictions['consensus_ad'] = predictions.apply(calculate_consensus, axis=1)

# Arredonde os valores para 0 ou 1 com base no limite de 0,5
predictions['consensus_ad'] = np.where(predictions['consensus_ad'] > 0.5, 1, np.where(predictions['consensus_ad'] == 0.5, np.nan, 0))

In [9]:
predictions[predictions['consensus'] == 1]

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,1.0,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,1.0,0.91,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0
5,56,42743569,2,3,367.90,4.4,O=C(Cc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,2,1,...,1.0,0.81,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0
6,60,9978527,1,2,403.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)NCc1cccnc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,10,1,...,1.0,0.77,1.0,1.0,0.87,1.0,1.0,0.97,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101077,158532,71643898,2,4,247.36,3.0,NC1CCC(Nc2nsc3ccccc23)CC1,C1CC(CCC1N)NC2=NSC3=CC=CC=C32,1,1,...,1.0,0.72,1.0,1.0,0.95,0.0,,0.47,1.0,1.0
101084,158546,71644884,1,4,247.36,2.6,NCC1CCN(c2nsc3ccccc23)CC1,C1CN(CCC1CN)C2=NSC3=CC=CC=C32,1,1,...,,0.52,1.0,1.0,0.99,1.0,1.0,0.86,1.0,1.0
101085,158550,72699444,1,5,402.60,4.4,CC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,,0.53,1.0,1.0,1.00,1.0,1.0,0.89,1.0,1.0
101086,158551,72699874,2,5,417.60,4.2,CNC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CNC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,,0.56,1.0,1.0,0.99,1.0,1.0,0.86,1.0,1.0


In [10]:
predictions.head(100)

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_ad,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,1.0,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,,0.55,1.0,1.0,0.76,0.0,,0.44,0.0,1.0
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,,0.60,0.0,0.0,0.71,1.0,1.0,0.97,,
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,1.0,0.91,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,304,113087103,1,1,385.3,4.6,O=C(Cc1ccc(Cl)cc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=C(...,7,1,...,0.0,0.74,0.0,0.0,0.98,0.0,,0.07,0.0,0.0
96,305,113087105,1,1,364.9,4.2,O=C(CCc1ccccc1)N1CC=C(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CC=C1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CCC4=CC=C...,10,1,...,,0.56,0.0,0.0,0.79,0.0,,0.34,0.0,0.0
97,306,113087681,1,1,344.9,4.8,O=C(C1CCCCC1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CCC(CC1)C(=O)N2CCC(CC2)C3=CNC4=C3C=CC(=C4)Cl,10,1,...,1.0,0.85,1.0,,0.52,1.0,1.0,0.94,1.0,1.0
98,307,113087702,1,1,352.9,4.3,O=C(Cc1ccccc1)N1CCC(c2c[nH]c3cc(Cl)ccc23)CC1,C1CN(CCC1C2=CNC3=C2C=CC(=C3)Cl)C(=O)CC4=CC=CC=C4,11,1,...,1.0,0.71,1.0,1.0,0.86,0.0,,0.50,1.0,1.0


In [11]:
predictions['count']=pd.concat([predictions['consensus'],predictions['consensus_ad']],axis=1).sum(axis=1)
predictions

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0,2.0
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.55,1.0,1.0,0.76,0.0,,0.44,0.0,1.0,1.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.60,0.0,0.0,0.71,1.0,1.0,0.97,,,0.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.91,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,C1CCN(CC1)CCNC2=NSC3=CC=CC=C32,1,1,...,0.92,0.0,0.0,1.00,0.0,,0.22,0.0,0.0,0.0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,[CH2-]N1CCN(CC1)C2=NSC3=CC=CC=C32,1,1,...,0.99,1.0,1.0,0.99,0.0,,0.42,0.0,0.0,0.0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,CNCCCNC1=NSC2=CC=CC=C21,1,1,...,0.71,1.0,1.0,1.00,1.0,1.0,0.91,1.0,1.0,2.0
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,CC[C@@H](CN1CCN(CC1)C2=NSC3=CC=CC=C32)C(CC)CN,1,1,...,0.81,0.0,0.0,1.00,0.0,,0.24,0.0,0.0,0.0


In [12]:
# selecting rows based on condition 
hits = predictions[predictions['count'] == 2.0] 
print(hits.shape)
hits

(34946, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0,2.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.91,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0,2.0
5,56,42743569,2,3,367.90,4.4,O=C(Cc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,2,1,...,0.81,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0,2.0
6,60,9978527,1,2,403.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)NCc1cccnc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,10,1,...,0.77,1.0,1.0,0.87,1.0,1.0,0.97,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101077,158532,71643898,2,4,247.36,3.0,NC1CCC(Nc2nsc3ccccc23)CC1,C1CC(CCC1N)NC2=NSC3=CC=CC=C32,1,1,...,0.72,1.0,1.0,0.95,0.0,,0.47,1.0,1.0,2.0
101084,158546,71644884,1,4,247.36,2.6,NCC1CCN(c2nsc3ccccc23)CC1,C1CN(CCC1CN)C2=NSC3=CC=CC=C32,1,1,...,0.52,1.0,1.0,0.99,1.0,1.0,0.86,1.0,1.0,2.0
101085,158550,72699444,1,5,402.60,4.4,CC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,0.53,1.0,1.0,1.00,1.0,1.0,0.89,1.0,1.0,2.0
101086,158551,72699874,2,5,417.60,4.2,CNC(=S)NC1CCC(CCN2CCN(c3nsc4ccccc34)CC2)CC1,CNC(=S)NC1CCC(CC1)CCN2CCN(CC2)C3=NSC4=CC=CC=C43,1,1,...,0.56,1.0,1.0,0.99,1.0,1.0,0.86,1.0,1.0,2.0


In [13]:
with pd.ExcelWriter('../dataset/screened/pubchem_hits_qsar_rdkit_consensus.xlsx') as writer:
    predictions.to_excel(writer, sheet_name='consensus', index=False)