In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm
from IPython.display import SVG

# RDKit for cheminformatics
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Descriptors, Lipinski, PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import DataStructs
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

# ChEMBL web resource client
from chembl_webresource_client.new_client import new_client 

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet)
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (r2_score, mean_absolute_error, mean_squared_error)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# SHAP for model interpretation
import shap

# Padelpy for molecular descriptor calculations
from padelpy import padeldescriptor, from_smiles

# PubChem for chemical information retrieval
import pubchempy as pcp

# Additional libraries for data handling and visualization
import requests
import json
import math
import time  # To check time
from sklearn.model_selection import LeaveOneOut, cross_val_score


# Hide warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Set display options for Pandas DataFrames
pd.set_option('display.max_columns', 1800)
pd.set_option('display.max_colwidth', None)  # None means no limit


In [2]:
#reloading my data under a new name which will be df2
df= pd.read_csv('ml_data.csv')
df = df.drop(df.columns[0], axis =1)
df.head()


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class,pIC50,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nN,nO,nS,nF,nCl,nBr,nX,ATS0m,ATS0s,AATS0m,AATS1m,AATS0e,AATS6e,AATS8e,AATS0p,AATS0i,AATS1i,AATS3i,AATS4i,AATS6i,AATS8s,ATSC0c,ATSC1c,ATSC2c,ATSC3c,ATSC4c,ATSC5c,ATSC6c,ATSC7c,ATSC8c,ATSC1m,ATSC2m,ATSC3m,ATSC4m,ATSC5m,ATSC6m,ATSC7m,ATSC8m,ATSC1v,ATSC2v,ATSC3v,ATSC4v,ATSC5v,ATSC6v,ATSC7v,ATSC8v,ATSC1e,ATSC2e,ATSC3e,ATSC4e,ATSC5e,ATSC6e,ATSC7e,ATSC8e,ATSC1p,ATSC5p,ATSC7p,ATSC8p,ATSC1i,ATSC2i,ATSC3i,ATSC4i,ATSC5i,ATSC6i,ATSC7i,ATSC8i,ATSC1s,ATSC2s,ATSC3s,ATSC5s,ATSC6s,ATSC8s,AATSC0c,AATSC1c,AATSC2c,AATSC3c,AATSC4c,AATSC6c,AATSC7c,AATSC4m,AATSC6m,AATSC7m,AATSC8m,AATSC3v,AATSC4v,AATSC7v,AATSC4e,AATSC8e,AATSC0p,AATSC3p,AATSC4p,AATSC7p,AATSC0i,AATSC3i,AATSC4i,AATSC7i,AATSC8i,AATSC1s,AATSC3s,AATSC6s,AATSC8s,MATS1c,MATS3m,MATS1s,MATS2s,GATS2c,GATS3c,GATS4c,GATS5c,GATS6c,GATS7c,GATS8c,GATS2m,GATS3m,GATS4m,GATS5m,GATS6m,GATS7m,GATS8m,GATS2v,GATS3v,GATS7v,GATS8v,GATS1e,GATS2e,GATS3e,GATS4e,GATS5e,GATS6e,GATS7e,GATS2p,GATS3p,GATS5p,GATS6p,GATS1i,GATS2i,GATS3i,GATS5i,GATS6i,GATS8i,GATS1s,GATS2s,GATS3s,GATS4s,GATS5s,GATS6s,GATS7s,GATS8s,SM1_DzZ,VE1_DzZ,VE3_DzZ,VR1_DzZ,VE1_Dzv,VE2_Dzv,VE3_Dzv,VR1_Dzv,VE1_Dzp,VE3_Dzp,VR1_Dzp,VR1_Dzi,VE1_Dzs,VR1_Dzs,nBase,BCUTw-1l,BCUTc-1l,BCUTc-1h,BCUTp-1l,BCUTp-1h,nBondsD,nBondsT,SpMax2_Bhm,SpMax3_Bhm,SpMin1_Bhm,SpMin2_Bhm,SpMin3_Bhm,SpMin6_Bhm,SpMax1_Bhv,SpMin1_Bhv,SpMin8_Bhe,SpMax1_Bhs,SpMax2_Bhs,SpMax3_Bhs,SpMax4_Bhs,C1SP2,C2SP2,C3SP2,C1SP3,C2SP3,C3SP3,SCH-3,SCH-5,SCH-6,VCH-5,VCH-6,SC-4,SC-5,VC-3,VC-5,ASP-0,AVP-0,VE1_Dt,VE2_Dt,VE3_Dt,VR1_Dt,VR3_Dt,nHBd,nHBa,nHBint2,nHBint3,nHBint4,nHBint5,nHBint6,nHBint7,nHBint8,nHBint9,nHBint10,nHsOH,nHdNH,nHssNH,nHaaNH,nHdsCH,nHCsats,nHCsatu,nHAvin,nsCH3,ndssC,naasC,naaaC,ndsN,naaN,nsssN,naasN,ndO,nssO,nsOm,SHBint3,SHBint4,SHBint6,SHBint8,SHBint10,SdssC,SaasC,SaaaC,SssssC,minHBd,minHBa,minwHBa,minHBint2,minHBint3,minHBint4,minHBint5,minHBint6,minHBint7,minHBint8,minHBint10,minHsOH,minHdsCH,minHCsats,minsCH3,minaaCH,minaasC,minaaN,minsssN,minsF,maxHBd,maxHBa,maxHBint4,maxHaaCH,maxHother,maxdssC,maxaaN,hmax,hmin,gmin,ETA_dAlpha_A,ETA_dEpsilon_B,ETA_dEpsilon_D,ETA_Shape_P,ETA_Shape_Y,ETA_BetaP_s,ETA_Beta_ns_d,ETA_EtaP_F,ETA_Eta_B,ETA_EtaP_B,ETA_EtaP_B_RC,FMF,nHBAcc,IC1,IC2,IC3,SIC0,SIC1,SIC2,SIC3,MIC0,MIC1,Kier3,nAtomLC,nAtomP,nAtomLAC,MDEC-11,MDEO-11,MDEO-22,MDEN-22,MDEN-23,MDEN-33,MLFER_A,MLFER_BH,MLFER_S,R_TpiPCTPC,PetitjeanNumber,n6Ring,nHeteroRing,n6HeteroRing,RotBFrac,nRotBt,RotBtFrac,LipinskiFailures,GGI10,JGI1,JGI2,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,VE1_D,VR3_D,TopoPSA,XLogP,PubchemFP2,PubchemFP3,PubchemFP12,PubchemFP13,PubchemFP16,PubchemFP19,PubchemFP20,PubchemFP24,PubchemFP38,PubchemFP179,PubchemFP185,PubchemFP192,PubchemFP193,PubchemFP199,PubchemFP206,PubchemFP207,PubchemFP258,PubchemFP259,PubchemFP260,PubchemFP261,PubchemFP299,PubchemFP338,PubchemFP340,PubchemFP357,PubchemFP375,PubchemFP377,PubchemFP378,PubchemFP379,PubchemFP391,PubchemFP392,PubchemFP401,PubchemFP431,PubchemFP476,PubchemFP485,PubchemFP494,PubchemFP510,PubchemFP519,PubchemFP526,PubchemFP528,PubchemFP542,PubchemFP548,PubchemFP566,PubchemFP575,PubchemFP580,PubchemFP593,PubchemFP644,PubchemFP652,PubchemFP679,PubchemFP681,PubchemFP688,PubchemFP691,PubchemFP699,PubchemFP713,PubchemFP714,PubchemFP716,PubchemFP717,PubchemFP718,PubchemFP721,PubchemFP729,PubchemFP737,PubchemFP738,PubchemFP739,PubchemFP742,PubchemFP750,PubchemFP752,PubchemFP755,PubchemFP759
0,CHEMBL1170766,N#Cc1ccc(Nc2ncnc3c(=N)n(NC(=O)c4ccncc4)cnc23)cc1,17.2,active,7.764472,0,-1.4971,2.241308,25.6387,52.810309,22,9,1,0,0.0,0.0,0.0,0.0,4775.959573,243.972222,113.713323,116.429357,7.994673,7.949217,7.962848,1.673864,164.277002,152.122722,160.081051,164.443902,161.084591,4.228482,0.83081,-0.433699,-0.033613,0.256874,-0.460764,0.275811,0.16868,-0.334968,0.228552,26.333202,-136.475554,-97.376348,-249.195233,102.570119,-283.805452,-146.715397,28.025075,31.133953,-211.315642,-526.380834,-549.930239,606.001226,-170.476674,-247.786358,136.729687,-0.336317,0.172403,0.69157,-1.077764,0.098845,-0.068546,-0.871449,0.191053,-0.185457,3.934479,-1.858377,0.954098,-13.081999,1.972734,-7.005453,-46.269382,34.812113,18.417957,-34.067415,4.221445,-6.065051,-3.47038,3.719923,4.027038,-3.773762,1.711341,0.019781,-0.009638,-0.000487,0.002919,-0.005063,0.002249,-0.004294,-2.738409,-3.784073,-1.880967,0.394719,-5.9816,-6.043189,-3.176748,-0.011844,0.002691,0.193568,-0.029797,-0.038445,-0.023825,1.926685,-0.079607,-0.508455,-0.436762,0.059457,-0.134779,0.042272,-0.050317,0.024103,-0.487218,-0.036408,-0.082567,-0.030812,0.967391,0.883387,1.405782,0.993952,0.879381,1.153605,0.890737,0.802872,0.917194,1.093301,1.028988,1.041172,1.038656,1.055002,0.886759,1.044661,1.07958,0.957364,0.859819,0.80555,0.837334,1.297605,1.250112,0.941836,1.017313,0.94266,1.087775,0.709303,0.946303,1.258908,1.07624,1.063848,0.760041,0.879314,0.933101,0.640848,0.602601,0.783116,1.284497,1.202306,0.794046,0.888308,1.005484,1.535714,0.023475,-10.880223,1040.028048,0.108008,0.003724,-6.454095,545.855907,0.162261,-5.273798,2735.224633,2679.596347,0.048337,832.336468,0,11.85,-0.290322,0.221955,4.231617,10.093648,2.0,1.0,3.82926,3.791551,2.014496,1.930156,1.878116,1.106439,3.945515,2.135571,0.79862,4.524144,4.435065,4.223637,3.799122,5,9,2,0,0,0,0.0,0.0,0.308992,0.0,0.08149,0.0,0.416241,0.589188,0.070648,0.697137,0.512587,0.013116,0.000452,-12.568374,959.779275,19.91344,3.0,5.0,2.0,2.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,4.0,0.0,1.0,1.0,0.0,0.0,6.240203,0.0,3.019885,5.725561,0.0,-0.405521,2.054972,0.629577,0.0,0.464588,1.184114,-0.405521,0.612257,1.338655,0.0,0.550126,1.202809,0.0,5.725561,0.0,0.0,0.0,0.0,0.0,1.308592,0.400793,3.871054,0.0,0.0,0.585846,12.323942,0.0,0.803039,0.803039,0.0,4.294611,0.803039,0.145993,-0.405521,0.0,0.12672,0.05162,0.08437,0.32754,0.68966,1.5,1.2588,0.26185,0.00903,0.02089,0.595238,10,3.898768,4.784942,4.963746,0.305248,0.723023,0.887363,0.920522,15.462381,40.102169,5.437518,3,29,0,0.0,0.0,0.0,3.276581,1.930979,0.0,0.087,2.201,3.978,11.129857,0.470588,4.0,3.0,3.0,0.15625,5.0,0.15625,0,0.152637,0.15625,0.075758,0.037234,0.04271,0.023504,0.01529,0.013938,0.010112,0.007005,0.007632,0.032689,18.868816,145.26,1.964,0,0,1,0,1,0,0,0,0,1,1,1,0,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1171167,N=c1c2ncnc(Nc3ccccc3)c2ncn1NC(=O)c1ccncc1,4.3,active,8.366532,0,-1.5318,2.346411,19.4401,50.617102,22,8,1,0,0.0,0.0,0.0,0.0,4436.515467,203.944444,108.207694,112.248317,7.920795,7.90767,7.899307,1.628,164.549203,152.458917,161.349563,164.732351,161.192264,3.837359,0.740514,-0.410953,-0.014632,0.242962,-0.435591,0.299088,0.074059,-0.269942,0.2059,41.989097,-139.14685,21.439971,-51.53518,113.34377,-232.533159,-216.501111,-159.023565,23.962633,-296.804175,-401.960743,-375.86213,567.315161,-97.520966,-348.69404,-111.209963,-0.277309,0.264034,0.863457,-0.737574,0.310263,-0.160452,-0.836913,0.03972,-0.260816,3.770536,-2.195496,-0.014563,-13.442488,-1.903169,-4.51548,-43.733934,38.086083,13.861736,-32.167256,0.522781,-7.350089,-2.627768,5.931588,14.066396,-3.760873,-1.812215,0.018061,-0.00934,-0.000215,0.002825,-0.005007,0.001001,-0.003552,-0.592358,-3.14234,-2.848699,-2.304689,-4.673962,-4.320254,-4.588079,-0.008478,0.000576,0.199852,-0.025614,-0.034226,-0.028888,1.859519,-0.052506,-0.502689,-0.423253,0.007577,-0.167047,0.068972,-0.050823,-0.026264,-0.517118,0.007837,-0.129209,-0.02989,1.006414,0.929904,1.492196,1.032537,0.974071,1.170156,0.917422,0.831655,0.89479,1.009696,0.985453,1.007666,1.040749,1.126235,0.926875,1.01158,1.081021,1.019369,0.861434,0.81203,0.833648,1.276275,1.243663,0.968321,1.021326,0.986482,1.058988,0.686782,0.928381,1.301925,1.131673,1.061852,0.744443,0.922837,0.957638,0.722702,0.680638,0.862963,1.317213,1.269588,0.936902,0.946585,1.175677,1.392857,0.062005,-7.507474,400.787707,0.147751,0.005472,-5.163021,440.37654,0.188976,-4.498567,360.077657,379.403933,0.07825,456.119593,0,11.85,-0.290074,0.222117,5.208489,10.082411,2.0,0.0,3.804235,3.76129,2.013855,1.925116,1.877738,1.098191,3.944192,2.135373,0.79862,4.547999,4.236003,3.85843,3.803802,5,10,1,0,0,0,0.0,0.0,0.327721,0.0,0.085788,0.0,0.416241,0.505854,0.070648,0.690356,0.51834,0.09059,0.003355,-6.483817,562.98802,17.099798,3.0,5.0,2.0,2.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,2.0,0.0,4.0,0.0,1.0,1.0,0.0,0.0,6.215656,0.0,3.026312,5.586459,0.0,-0.380796,1.742572,0.737706,0.0,0.454384,1.212691,-0.380796,0.627433,1.3488,0.0,0.551028,1.18455,0.0,5.586459,0.0,0.0,0.0,0.0,0.0,1.349557,0.422331,3.877281,0.0,0.0,0.582887,12.294566,0.0,0.79434,0.79434,0.0,4.31323,0.79434,0.14658,-0.380796,0.0,0.12199,0.05469,0.05851,0.31117,0.69444,1.5,1.20426,0.1937,0.00717,0.01991,0.609756,9,3.608644,4.567153,4.900116,0.306374,0.673562,0.85247,0.914618,15.321888,36.158182,4.950744,3,27,0,0.0,0.0,0.0,3.276581,1.930979,0.0,0.003,2.081,3.503,10.964829,0.466667,4.0,3.0,3.0,0.166667,5.0,0.166667,0,0.086624,0.133333,0.070461,0.039244,0.04,0.021584,0.013878,0.012533,0.00902,0.00625,0.005775,0.040772,15.733055,121.47,2.827,0,0,1,0,1,0,0,0,0,1,1,1,0,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1172293,N=c1c2ncnc(Nc3ccc(F)cc3)c2ncn1NC(=O)c1ccncc1,0.91,active,9.040959,0,-1.0521,1.106914,20.3126,50.507309,22,8,1,0,1.0,0.0,0.0,1.0,4796.438727,265.722222,116.98631,117.159288,8.147174,8.009236,8.076552,1.624722,167.442772,153.437631,163.560464,166.422857,162.476976,4.476651,0.817499,-0.443978,-0.004292,0.2331,-0.497244,0.423634,-0.005748,-0.23407,0.181564,24.322534,-120.503149,-204.663314,-292.181112,202.850152,-346.700731,-65.744629,67.571474,27.342535,-252.169204,-467.387167,-461.760591,590.265962,-134.517111,-295.081834,14.032859,-0.431266,-0.058711,-0.096759,-1.594274,0.677167,-0.595364,-0.330436,0.330599,-0.269953,3.760555,-2.231378,-0.131491,-15.938939,-10.596563,-9.365398,-44.931445,41.838669,12.213309,-30.663686,-7.326225,-10.000496,-2.503107,-8.212043,16.113821,-13.289328,2.125454,0.019939,-0.01009,-6.3e-05,0.00271,-0.005715,-7.8e-05,-0.00308,-3.358404,-4.685145,-0.865061,0.979297,-5.434734,-5.307593,-3.882656,-0.018325,0.004791,0.202968,-0.025254,-0.033587,-0.02936,2.364893,-0.1089,-0.516453,-0.40347,-0.106177,-0.227284,-0.095489,-0.179586,0.030804,-0.506065,-0.072722,-0.105752,-0.017127,0.97435,0.929651,1.438808,0.890287,0.985402,1.092639,0.938236,0.776717,0.956911,1.079979,0.931697,1.03012,0.96921,1.032393,0.909509,1.026658,1.088677,0.990911,0.747974,0.767418,0.966012,1.164739,0.841435,0.881862,0.798245,0.987761,1.05639,0.67938,0.922065,1.176108,1.087063,1.061042,0.604145,0.857014,0.969689,0.637633,0.640111,0.959265,1.15267,0.839369,0.837586,0.752912,1.022638,1.72619,0.02779,-10.032646,461.870315,0.110375,0.003942,-6.170831,3366.978245,0.119824,-5.940858,504.400246,521.935842,0.067541,450.388219,0,11.85,-0.290081,0.222138,4.021244,10.0817,2.0,0.0,3.817452,3.786778,2.011834,1.909317,1.875922,1.089195,3.944585,2.135412,0.79862,4.752091,4.577041,4.250338,3.816335,5,10,1,0,0,0,0.0,0.0,0.308992,0.0,0.08149,0.0,0.416241,0.568848,0.070648,0.696781,0.510564,0.048274,0.001724,-8.486394,567.010775,17.753059,3.0,6.0,2.0,2.0,0.0,5.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,4.0,0.0,1.0,1.0,0.0,0.0,6.258584,0.0,3.021333,5.926465,6.84403,-0.415957,1.008485,0.57257,0.0,0.482162,1.166437,-0.415957,0.610732,1.346402,0.0,0.562412,1.239877,0.0,5.926465,6.84403,0.0,0.0,0.0,0.0,1.280691,-0.351254,3.861282,0.0,13.071412,0.589832,13.071412,0.0,0.807759,0.807759,0.0,4.25671,0.807759,0.155839,-0.415957,0.0,0.1345,0.05627,0.07949,0.34324,0.69643,2.0,1.23467,0.29986,0.01071,0.023,0.609756,9,3.827575,4.735362,4.967308,0.330035,0.714426,0.883867,0.92716,17.800507,39.840091,5.2016,3,27,0,0.0,0.0,0.0,3.276581,1.930979,0.0,0.087,2.024,3.554,10.873427,0.5,4.0,3.0,3.0,0.16129,6.0,0.193548,0,0.115498,0.16129,0.077519,0.038889,0.043875,0.022478,0.015714,0.014348,0.010834,0.00753,0.006794,0.008686,16.984086,121.47,2.575,0,0,1,0,1,0,0,0,0,1,1,1,0,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1172698,COc1ccc(Nc2ncnc3c(=N)n(NC(=O)c4ccncc4)cnc23)cc1,1.59,active,8.798603,0,-1.5836,2.507789,26.7559,54.512688,22,8,2,0,0.0,0.0,0.0,0.0,4838.779717,220.972222,107.528438,111.405585,7.979594,7.872273,7.949302,1.579318,165.079962,152.523419,160.3296,163.000388,162.633891,3.847042,0.897577,-0.456967,-0.052326,0.240971,-0.499641,0.461474,-0.033102,-0.264946,0.212047,60.066081,4.348984,-146.548859,-408.065173,352.155257,2.366759,-221.191032,144.222042,-18.11502,135.974767,-485.546429,-828.603478,606.637109,190.156672,-317.591423,101.885632,-0.374268,-0.310847,0.289138,-1.17733,1.073593,-0.162776,-0.717305,0.470186,-0.764541,3.405404,-1.948556,0.18406,-17.654425,5.181372,-3.417128,-49.440408,33.688913,11.752432,-26.624317,-2.233952,-7.718107,-3.409465,3.29561,20.433471,-1.196502,3.823731,0.019946,-0.00952,-0.000698,0.002648,-0.005151,-0.000394,-0.003271,-4.206857,0.028176,-2.730753,1.873014,-5.335675,-8.542304,-3.920882,-0.012137,0.006106,0.202467,-0.024905,-0.047327,-0.024056,1.791496,-0.037551,-0.509695,-0.328695,-0.029012,-0.160794,0.036215,-0.014244,0.049659,-0.477292,-0.048748,-0.126381,-0.03573,1.103655,0.966973,1.35983,0.761977,0.961944,1.173193,0.94478,0.819184,0.913997,1.074214,0.926509,0.942405,1.007718,0.972352,0.816204,1.006903,1.040975,0.939941,0.88922,1.043274,0.937729,1.175507,0.885774,0.884034,0.94396,0.870297,1.057732,0.77232,0.898475,1.366869,1.06805,1.093664,0.77761,0.934121,0.98934,0.737385,0.733365,0.894375,1.297699,1.117564,0.892792,0.941539,1.088317,1.642857,0.021915,-11.079687,916.836399,0.049798,0.001717,-8.699377,583.667909,0.054798,-8.421915,643.940666,874.772613,0.050795,970.258865,0,11.85,-0.373952,0.222317,4.751614,10.091007,2.0,0.0,3.814054,3.784529,2.012223,1.913968,1.876708,1.123847,3.944564,2.135404,0.79862,4.564717,4.244466,3.887895,3.811031,5,10,1,0,0,0,0.0,0.0,0.308992,0.0,0.08149,0.0,0.416241,0.573896,0.070648,0.697137,0.528485,0.013116,0.000452,-12.568374,959.779275,19.91344,3.0,6.0,2.0,2.0,0.0,5.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,2.0,0.0,4.0,0.0,1.0,1.0,1.0,0.0,6.232451,0.0,3.034503,5.651024,2.648825,-0.39103,2.375295,0.690989,0.0,0.458069,1.203627,-0.39103,0.618962,1.339975,0.0,0.551345,1.193592,0.0,5.651024,2.648825,0.0,0.0,0.0,1.599975,1.336247,0.413468,3.879939,0.0,0.0,0.583401,12.336618,0.0,0.799774,0.799774,0.0,4.322265,0.799774,0.142166,-0.39103,0.0,0.11549,0.0528,0.09227,0.32918,0.69828,2.0,1.22303,0.26185,0.00903,0.02089,0.555556,9,3.99221,4.819305,5.030633,0.309237,0.726933,0.877537,0.916017,16.233642,38.781709,5.437518,3,28,0,0.0,0.0,0.0,3.276581,1.930979,0.0,0.003,2.242,3.664,10.705857,0.470588,4.0,3.0,3.0,0.1875,7.0,0.21875,0,0.152637,0.15625,0.075758,0.037234,0.04271,0.023504,0.01529,0.013938,0.010112,0.007005,0.007632,0.032689,18.868816,130.7,2.172,1,0,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,CHEMBL1777913,Cc1c(/C=N/NC(=O)c2ccncc2)[n+]([O-])c2ccccc2[n+]1[O-],1170.0,active,5.931814,0,-1.0133,1.026777,24.615,44.734309,16,5,3,0,0.0,0.0,0.0,0.0,4070.319016,248.916667,110.008622,111.834935,8.082476,7.919436,8.080058,1.577891,163.383604,150.767561,159.440176,164.917162,163.956435,4.891204,0.463859,-0.076533,-0.181805,-0.044022,0.032515,0.079023,-0.076523,0.064161,-0.025998,71.625734,105.532668,18.511929,-501.004986,-479.917381,-290.005416,560.843508,134.652466,-15.385567,-68.004093,-190.378615,-572.668262,-83.838547,-336.014425,388.851797,361.995024,0.671184,-0.160733,-0.627471,-1.330178,-0.688064,-0.073832,1.148636,-0.285783,0.106546,1.813045,0.486228,1.528871,-0.064711,-23.748714,-13.992948,-10.073751,19.001189,1.361171,-10.475063,4.178744,-1.423302,-4.082562,-1.484802,-16.577388,-12.882416,12.327997,0.012537,-0.001962,-0.002932,-0.000572,0.000401,-0.001343,0.001167,-6.185247,-5.087814,10.197155,2.80526,-2.47245,-7.069979,7.070033,-0.016422,-0.005954,0.208719,-0.029124,-0.029101,0.008841,1.702613,-0.181727,-0.124367,-0.190456,0.087057,-0.036495,-0.019283,-0.226007,0.256833,-0.156532,0.007144,-0.01488,-0.026848,1.200219,1.046031,1.079005,1.160732,1.139856,0.97048,1.160794,0.726389,0.86731,1.175755,1.273003,1.144033,0.758343,0.904132,0.926869,0.944925,0.909118,0.788714,0.45532,0.740023,1.039395,1.457932,1.457334,0.893,0.6948,1.063053,1.070679,0.798048,1.035316,1.168251,1.3183,1.16219,0.788229,0.969705,0.904956,0.497152,0.641534,0.915339,1.514849,1.557456,0.889878,0.715971,1.085771,1.464286,0.318765,-2.743924,216.98149,0.291203,0.012133,-2.960966,209.330673,0.26195,-3.215048,181.799757,219.144603,0.278086,413.057246,0,11.999,-0.278642,0.227275,4.96394,9.988204,2.0,0.0,3.805126,3.498875,2.075211,1.887386,1.665848,1.103697,3.98722,2.148625,0.79862,4.539291,4.442214,4.232576,3.80072,4,10,1,1,0,0,0.0,0.0,0.222432,0.0,0.083873,0.0,0.598787,0.617327,0.130846,0.71267,0.51721,0.096127,0.004005,-5.620992,178.917764,12.448623,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,-0.444869,0.620936,0.488872,0.0,0.65205,2.308607,-0.444869,7.746063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.780868,0.0,1.51377,1.484182,0.054378,3.816939,0.0,0.0,0.65205,11.879547,0.0,0.66024,0.780868,0.0,3.816939,0.780868,0.254627,-0.444869,0.0,0.11341,0.02317,0.13636,0.34545,0.65625,1.0,1.10601,0.32182,0.01341,0.02416,0.540541,4,3.600024,4.283381,4.64851,0.333487,0.691056,0.822232,0.892322,16.975058,35.742361,4.233236,5,23,0,0.0,0.458566,0.0,0.965489,1.136952,0.333333,0.003,0.573,3.487,10.271901,0.461538,3.0,2.0,2.0,0.153846,7.0,0.269231,0,0.090807,0.134615,0.092593,0.057813,0.047059,0.022703,0.017184,0.014385,0.012517,0.00878,0.005675,0.238922,12.540339,108.23,2.096,0,0,1,0,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
df.set_index('molecule_chembl_id', inplace=True)

In [4]:
#df = df.drop(columns=['canonical_smiles','standard_value', 'bioactivity_class'])
#df.head()

In [5]:
features = ['BCUTw-1l', 'GATS7c', 'AATS8s', 'GATS4c', 'GATS8i', 'GATS6p']

X = df[features]
y = df['pIC50']

# Scale the features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=df.index)


In [6]:
# # Scaffols chosen to make sure they are in the train data
# chosen_scaffolds = ['CHEMBL1087717', 'CHEMBL226983', 'CHEMBL3780092', 'CHEMBL3780593', 'CHEMBL196076',
#                     'CHEMBL197852', 'CHEMBL226069', 'CHEMBL3330937', 'CHEMBL64', 'CHEMBL1171167',
#                     'CHEMBL197852', 'CHEMBL227499', 'CHEMBL1172698', 'CHEMBL390333']  
# ## Get indices of chosen scaffolds from the DataFrame index
# chosen_indices = df.index[df.index.isin(chosen_scaffolds)].tolist()

# # Selecting chosen scaffolds from X_scaled and y
# X_chosen = X_scaled.loc[chosen_indices]
# y_chosen = y.loc[chosen_indices]

# # Select remaining data excluding chosen indices
# remaining_indices = [i for i in X_scaled.index if i not in chosen_indices]
# X_remaining = X_scaled.loc[remaining_indices]
# y_remaining = y.loc[remaining_indices]

# train_test_split on the remaining data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine chosen scaffolds with the training set from remaining data
# X_train = pd.concat([X_chosen, X_train_remaining], ignore_index=False)
# y_train = pd.concat([y_chosen, y_train_remaining], ignore_index=False)

print("X_train:", X_train.shape)
print("y_train", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape) 

X_train: (48, 6)
y_train (48,)
X_test: (12, 6)
y_test: (12,)


In [15]:
from sklearn.ensemble import AdaBoostRegressor

working_model2 = AdaBoostRegressor(
    base_estimator= DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=1,
    loss='square',
    random_state=42 )

working_model2.fit(X_train, y_train) 

In [8]:
#X_train

In [16]:
import pickle

# Save model
with open('working_model2.pkl', 'wb') as f:
    pickle.dump(working_model2, f)  

# Save scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

- Two compounds of interest are prediceted using their SMILES. 
- Compound 1 and 2.

In [29]:
%%time

import joblib  

new_smiles = ['C1=CN=CC=C1C(NN=CNC2=CC(=CC(=C2)OC)OC)=O',
'C1=CN=CC=C1C(NN=NC2=CC=CC=C2)=O']

new_smiles_df = pd.DataFrame({'canonical_smiles': new_smiles})
new_smiles_df.to_csv('compounds.smi', index=None, header=None)

# Generate descriptors (using PadelDescriptor or any other descriptor generator)
padeldescriptor(mol_dir='compounds.smi', d_file='descriptors_1.csv', d_2d=True,
 d_3d=False, fingerprints=True, retainorder=True)

# Load the pre-calculated descriptors for new SMILES
descriptors = pd.read_csv('descriptors_1.csv')

# Check the shape of new descriptors
print(f"Shape of new descriptors: {descriptors.shape}")

# Preprocessing
if 'Name' in descriptors.columns:
    descriptors = descriptors.drop('Name', axis=1)
    print("The 'Name' column has been dropped.")

# Select the features used
features = ['BCUTw-1l', 'GATS7c', 'AATS8s', 'GATS4c', 'GATS8i', 'GATS6p']

new_descriptors_selected = descriptors[features]

# Check for NaN or infinite values before scaling
if new_descriptors_selected.isnull().values.any():
    new_descriptors_selected = new_descriptors_selected.dropna()
    print("Warning: NaN values detected in descriptors and have been deleted")

# Load your trained scaler
scaler_ada = joblib.load('scaler.pkl')

# Scale the new descriptors using the loaded scaler
new_descriptors_scaled = scaler_ada.transform(new_descriptors_selected)

# Load your trained model
model_ada = joblib.load('working_model2.pkl')

# Make predictions using the scaled descriptors
predictions_pIC50 = model_ada.predict(new_descriptors_scaled)

# Convert predictions from pIC50 to nM
predictions_nM = 10 ** (9 - predictions_pIC50)

# Classify as active or not active based on the threshold of 1400 nM
activity_status = ["active" if pred < 1400 else "not active" for pred in predictions_nM]

# Display predictions and activity status
for pred, status in zip(predictions_pIC50, activity_status):
    print(f"Predicted activity (nM): {pred:.2f}, Status: {status}")


# for pred, status in zip(predictions_pIC50, activity_status):
#     print(f"Predicted activity (nM): {pred:.2f}, Status: {status}")

Shape of new descriptors: (2, 2326)
The 'Name' column has been dropped.
Predicted activity (nM): 8.96, Status: active
Predicted activity (nM): 8.96, Status: active
Wall time: 5.68 s
