## Installs and imports

In [1]:
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests-cache~=0.7.0
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting easydict
  Downloading easydict-1.10.tar.gz (6.4 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting urllib3
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.18.4
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting certifi>=2017.4.17
  Downloading certifi-2022.12.7-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting scipy<1.9
  Downloading scipy-1.8.1-cp310-cp310-macosx_12_0_universal2.macosx_10_9_x86_64.whl (55.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting rdkit
  Downloading rdkit-2022.9.5-cp310-cp310-macosx_10_9_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scipy, rdkit, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.9.1
    Uninstalling scipy-1.9.1:
      Successfully uninstalled scipy-1.9.1
Successfully installed deepchem-2.7.1 rdkit-2022.9.5 scipy-1.8.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m

In [4]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.65.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd

from rdkit.Chem import PandasTools

from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Retrieve data from ChEMBL (this takes many hours to run)
Adapted from: https://github.com/chembl/notebooks/blob/main/ChEMBL_webresource_client_examples.ipynb

In [6]:
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity
type(targets_api)

chembl_webresource_client.query_set.QuerySet

## Get bioactivity data (no structures)

In [7]:
bioactivities = bioactivities_api.filter(
    target_chembl_id='CHEMBL395', assay_chembl_id ="CHEMBL1794553"
).only(
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)
len(bioactivities)

58459

In [10]:
bioactivities_df = pd.DataFrame.from_records(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df.head()

DataFrame shape: (58460, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,6961561,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1315471,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
1,6958176,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1511566,,nM,2818.4,CHEMBL395,Homo sapiens,Potency,uM,2.8184
2,6958177,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1904108,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
3,6958178,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1369478,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
4,6958179,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1200833,,nM,12589.3,CHEMBL395,Homo sapiens,Potency,uM,12.5893


In [11]:
bioactivities_df.to_csv("HepG2_Bioactives_df.csv")

In [21]:
bioactivities_df = pd.read_csv("HepG2_Bioactives_df.csv")

In [22]:
bioactivities_df.head()

Unnamed: 0.1,Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,0,6961561,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1315471,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
1,1,6958176,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1511566,,nM,2818.4,CHEMBL395,Homo sapiens,Potency,uM,2.8184
2,2,6958177,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1904108,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
3,3,6958178,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1369478,,nM,35481.3,CHEMBL395,Homo sapiens,Potency,uM,35.4813
4,4,6958179,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1200833,,nM,12589.3,CHEMBL395,Homo sapiens,Potency,uM,12.5893


In [23]:
bioactivities_df["standard_units"].unique()

array(['nM'], dtype=object)

In [24]:
bioactivities_df.drop(["units", "value"], axis=1, inplace=True)
bioactivities_df.drop_duplicates("molecule_chembl_id", keep="first", inplace=True)
print(f"DataFrame shape: {bioactivities_df.shape}")

DataFrame shape: (58172, 12)


In [25]:
bioactivities_df.reset_index(drop=True, inplace=True)
bioactivities_df.head()

Unnamed: 0.1,Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type
0,0,6961561,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1315471,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
1,1,6958176,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1511566,,nM,2818.4,CHEMBL395,Homo sapiens,Potency
2,2,6958177,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1904108,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
3,3,6958178,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1369478,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
4,4,6958179,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1200833,,nM,12589.3,CHEMBL395,Homo sapiens,Potency


In [26]:
bioactivities_df.rename(
    columns={"standard_value": "IC50", "standard_units": "units"}, inplace=True
)
bioactivities_df.head()

Unnamed: 0.1,Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,units,IC50,target_chembl_id,target_organism,type
0,0,6961561,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1315471,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
1,1,6958176,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1511566,,nM,2818.4,CHEMBL395,Homo sapiens,Potency
2,2,6958177,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1904108,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
3,3,6958178,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1369478,,nM,35481.3,CHEMBL395,Homo sapiens,Potency
4,4,6958179,CHEMBL1794553,PUBCHEM_BIOASSAY: qHTS for Inhibitors of TGF-b...,F,CHEMBL1200833,,nM,12589.3,CHEMBL395,Homo sapiens,Potency


In [27]:
bioactivities_df.to_csv("HepG2_Bioactives_df_partialCleaning.csv")

## Get compound data

In [28]:
compounds_provider = compounds_api.filter(
    molecule_chembl_id__in=list(bioactivities_df["molecule_chembl_id"])
).only("molecule_chembl_id", "molecule_structures")

In [30]:
len(compounds_provider)

58172

In [37]:
compounds = list(tqdm(compounds_provider))
compounds_df = pd.DataFrame.from_records(
    compounds,
)
print(f"DataFrame shape: {compounds_df.shape}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 58172/58172 [2:42:08<00:00,  5.98it/s]

DataFrame shape: (58172, 2)





In [38]:
compounds_df.to_csv("HepG2_compounds_df.csv")

In [39]:
compounds_df.dropna(axis=0, how="any", inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

DataFrame shape: (58135, 2)


In [40]:
compounds_df.drop_duplicates("molecule_chembl_id", keep="first", inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

DataFrame shape: (58135, 2)


In [41]:
compounds_df.iloc[0].molecule_structures.keys()

dict_keys(['canonical_smiles', 'molfile', 'standard_inchi', 'standard_inchi_key'])

In [42]:
canonical_smiles = []

for i, compounds in compounds_df.iterrows():
    try:
        canonical_smiles.append(compounds["molecule_structures"]["canonical_smiles"])
    except KeyError:
        canonical_smiles.append(None)

compounds_df["smiles"] = canonical_smiles
compounds_df.drop("molecule_structures", axis=1, inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

DataFrame shape: (58135, 2)


In [43]:
compounds_df.dropna(axis=0, how="any", inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

DataFrame shape: (58135, 2)


In [44]:
compounds_df.to_csv("hepg2_compoundsdf_clean.csv")

## Merge compound and bioactivity data

In [45]:
output_df = pd.merge(
    bioactivities_df[["molecule_chembl_id", "IC50", "units"]],
    compounds_df,
    on="molecule_chembl_id",
)

# Reset row indices
output_df.reset_index(drop=True, inplace=True)

print(f"Dataset with {output_df.shape[0]} entries.")

Dataset with 58135 entries.


In [46]:
output_df.head()

Unnamed: 0,molecule_chembl_id,IC50,units,smiles
0,CHEMBL1315471,35481.3,nM,CCN(CC(=O)NCc1cccs1)S(=O)(=O)c1cc(NC(C)=O)ccc1OC
1,CHEMBL1511566,2818.4,nM,O=C(CN(c1ccc(F)cc1)S(=O)(=O)c1ccc2c(c1)OCCO2)N...
2,CHEMBL1904108,35481.3,nM,O=C(CCN1C(=O)/C(=C\c2ccccc2F)SC1=S)N1CCOCC1
3,CHEMBL1369478,35481.3,nM,COc1cc2c(cc1OC)C(c1ccc(-c3ccc(Cl)c(Cl)c3)o1)=NCC2
4,CHEMBL1200833,12589.3,nM,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c...


In [47]:
output_df.to_csv("hepg2_output_df.csv")