In [1]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
import pubchempy as pcp
import numpy as np

In [2]:
def pIC50(input):
    pIC50 = []
    for molar in input['standard_value_norm']:
        # Check if molar is zero or negative
        if molar > 0:
            pIC50.append(-np.log10(molar))
        else:
            pIC50.append(np.nan)  # or use a small positive number if you prefer

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis=1)  # Use 'axis=1' for clarity
    return x
def norm_value(input):
    norm = []
    for i in input['standard_value']:
        if i > 100000000:
          i = 100000000
        norm.append(i)
    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)
    return x

In [3]:
# Target search for coronavirus
protein='Butyrylcholinesterase'
target = new_client.target
target_query = target.search(protein)
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Butyrylcholinesterase,25.0,False,CHEMBL1914,"[{'accession': 'P06276', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Equus caballus,Butyrylcholinesterase,25.0,False,CHEMBL5077,"[{'accession': 'Q9N1N9', 'component_descriptio...",SINGLE PROTEIN,9796
2,[],Equus caballus,Cholinesterase,25.0,False,CHEMBL5763,"[{'accession': 'P81908', 'component_descriptio...",SINGLE PROTEIN,9796
3,[],Homo sapiens,Cholinesterases; ACHE & BCHE,25.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
4,[],Canis lupus familiaris,Cholinesterase,25.0,False,CHEMBL4630814,"[{'accession': 'P32750', 'component_descriptio...",SINGLE PROTEIN,9615
5,[],Mus musculus,Butyrylcholinesterase,18.0,False,CHEMBL2528,"[{'accession': 'Q03311', 'component_descriptio...",SINGLE PROTEIN,10090
6,[],Rattus norvegicus,Butyrylcholinesterase,18.0,False,CHEMBL3403,"[{'accession': 'Q9JKC1', 'component_descriptio...",SINGLE PROTEIN,10116
7,[],Rattus norvegicus,Acetylcholinesterase and butyrylcholinesterase...,10.0,False,CHEMBL2111475,"[{'accession': 'P37136', 'component_descriptio...",SELECTIVITY GROUP,10116


In [4]:
selected_target = targets.target_chembl_id[0]
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
#df.to_csv(protein+'_raw.csv', index=False)
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]
df2_nr = df2.drop_duplicates(['canonical_smiles'])
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]
df3

  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,920.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,900.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,1000.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,200.0
...,...,...,...
5071,CHEMBL1235966,COc1cc2c(cc1O)[C@@H]1Cc3ccc(OC)c(O)c3CN1CC2,61300.0
5072,CHEMBL5420095,COc1cc2c(c(O)c1OC)CC[N@+]1(C)Cc3c(ccc(O)c3OC)C...,300000.0
5073,CHEMBL5425587,COc1cc2c(c(O)c1OC)CC[n+]1cc3c(OC)c(O)ccc3cc1-2,300000.0
5074,CHEMBL5434024,COc1cc2c(cc1O)CC[n+]1cc3c(OC)c(O)ccc3cc1-2,118800.0


In [5]:
df_no_smiles = df3.drop(columns='canonical_smiles')
smilesx = []
for i in df3.canonical_smiles.tolist():
  cpd = str(i).split('.')
  cpd_longest = max(cpd, key = len)
  smilesx.append(cpd_longest)
smilesx = pd.Series(smilesx, name = 'canonical_smiles')
df_clean_smiles = pd.concat([df_no_smiles.reset_index(drop=True),smilesx], axis=1)
df_clean_smiles['standard_value'] = df_clean_smiles['standard_value'].apply(lambda x:float(x))
df_norm = norm_value(df_clean_smiles)
df_final = pIC50(df_norm)
df_final

Unnamed: 0,molecule_chembl_id,canonical_smiles,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,-2.963788
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,-2.954243
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,-4.698970
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,-3.000000
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,-2.301030
...,...,...,...
4061,CHEMBL1235966,COc1cc2c(cc1O)[C@@H]1Cc3ccc(OC)c(O)c3CN1CC2,-4.787460
4062,CHEMBL5420095,COc1cc2c(c(O)c1OC)CC[N@+]1(C)Cc3c(ccc(O)c3OC)C...,-5.477121
4063,CHEMBL5425587,COc1cc2c(c(O)c1OC)CC[n+]1cc3c(OC)c(O)ccc3cc1-2,-5.477121
4064,CHEMBL5434024,COc1cc2c(cc1O)CC[n+]1cc3c(OC)c(O)ccc3cc1-2,-5.074816


In [6]:
smiles=[k for k in df_final.canonical_smiles]
p50=[k for k in df_final.pIC50]

In [None]:
from datetime import datetime
start_time = datetime.now()
pubchem_compound =[];details=[]
i=0
for sm in smiles:
    #print(i)
    details.append([sm,p50[i]])
    i=i+1
    pc=pcp.get_compounds(sm, 'smiles')[0]
    pubchem_compound.append(pc)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
pcomp=[k for k in pubchem_compound if k.cid!=None]
fingerprint = pd.DataFrame([np.array([int(bit) for bit in pc.cactvs_fingerprint]) for pc in pcomp])
fingerprint.columns=['pubchemfp'+str(k) for k in range(len(fingerprint.columns))]
fingerprint=pd.concat([pd.DataFrame(details,columns=['smiles','pIC50']),fingerprint],axis=1)
fingerprint

In [9]:
filename='descriptor_'+protein+'.csv'

#Save Fingerprint details and pIC50

fingerprint.to_csv(filename,index=False)