In [15]:
%load_ext autoreload
%autoreload 2

import os
import sys
import ast
import random
import pickle

# Put the parent of the current working directory on the import path.
# Presumably that parent is the project root containing the `src` package
# imported further down — confirm.
DIR = os.path.dirname(os.getcwd())
sys.path.append(DIR)

import math
import numpy as np
import pandas as pd

from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client

In [38]:
from src.utils.chembl_data_processing import get_bioactivity_compound_data

In [3]:
# Parent of the current working directory; the datasets live in its "data" folder.
DIR = os.path.split(os.getcwd())[0]
DATA_PATH = os.path.join(DIR, "data")
DATA_PATH  # echo the resolved path as the cell output

'/Users/karinazadorozhny/Desktop/Projects/MPN_Project/molecular-message-passing/data'

Create clients to access ChEMBL data

In [4]:
# One client handle per ChEMBL resource used in this notebook:
# targets, molecules (compounds) and activities (bioactivities).
targets_api, compounds_api, bioactivities_api = (
    new_client.target,
    new_client.molecule,
    new_client.activity,
)

### Retrieve data for target protein: PARP1

In [5]:
# Target of interest: Poly [ADP-ribose] polymerase 1 (PARP1).
# `uniprot_id` is the accession used to look the target up in ChEMBL.
TARGET_NAME, uniprot_id = "PARP1", "P09874"

In [6]:
# Look up every ChEMBL target whose components include the UniProt accession.
# The client hands back a lazy query set, so materialise it straight into a
# pandas DataFrame for inspection.
targets = pd.DataFrame.from_records(
    targets_api.get(target_components__accession=uniprot_id)
)
targets

Unnamed: 0,cross_references,organism,pref_name,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P09874', 'xref_name': None, 'xre...",Homo sapiens,Poly [ADP-ribose] polymerase-1,False,CHEMBL3105,"[{'accession': 'P09874', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'P09874', 'xref_name': None, 'xre...",Homo sapiens,Poly [ADP-ribose] polymerase-1,False,CHEMBL3105,"[{'accession': 'P09874', 'component_descriptio...",SINGLE PROTEIN,9606
2,[],Homo sapiens,"PARP 1, 2 and 3",False,CHEMBL3390820,"[{'accession': 'P09874', 'component_descriptio...",PROTEIN FAMILY,9606


In [7]:
# Select the correct target from the retrieved ones.
# The lookup also returns a PROTEIN FAMILY entry for the same accession, so
# pick the single-protein target explicitly instead of relying on row order.
single_protein_targets = targets[targets["target_type"] == "SINGLE PROTEIN"]
if single_protein_targets.empty:
    raise ValueError(
        f"No SINGLE PROTEIN target found for UniProt accession {uniprot_id}"
    )
# Duplicate rows for the same target may be present; the first match is enough.
target = single_protein_targets.iloc[0]
target

cross_references      [{'xref_id': 'P09874', 'xref_name': None, 'xre...
organism                                                   Homo sapiens
pref_name                                Poly [ADP-ribose] polymerase-1
species_group_flag                                                False
target_chembl_id                                             CHEMBL3105
target_components     [{'accession': 'P09874', 'component_descriptio...
target_type                                              SINGLE PROTEIN
tax_id                                                             9606
Name: 0, dtype: object

### Get Bioactivity Data

Fetch the bioactivity data and filter it to only consider

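* human proteins (implied by the selected target, which is a *Homo sapiens* protein — the query itself has no organism filter),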
* bioactivity type IC50, 
* exact measurements (relation `'='`), and
* binding data (assay type `'B'`).

In [8]:
# Fields requested from the activity resource via `.only()`.
bioactivity_fields = [
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
]

# IC50 measurements from binding assays ("B") with an exact ("=") relation
# for the selected target.
bioactivities = bioactivities_api.filter(
    target_chembl_id=target.target_chembl_id,
    relation="=",
    assay_type="B",
    type="IC50",
).only(*bioactivity_fields)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")
# NBVAL_CHECK_OUTPUT

Length and type of bioactivities object: 2300, <class 'chembl_webresource_client.query_set.QuerySet'>


#### Preprocess and filter bioactivity data

1. Convert `standard_value`'s datatype from `object` to `float`
2. Delete entries with missing values
3. Keep only entries with `standard_units == "nM"`
4. Delete duplicate molecules
5. Reset `DataFrame` index

In [10]:
# NOTE(review): neither `bioactivities_df` nor
# `preprocess_compound_activity_dataset` is defined or imported in any cell
# visible above (the execution count jumps from 8 to 10). Confirm where they
# come from so the notebook survives a fresh "Restart & Run All".
# Clean the bioactivity table: cast `standard_value` to float and keep nM entries.
bioactivities_df = preprocess_compound_activity_dataset(
    df=bioactivities_df,
    columns_to_float=["standard_value"],
    standard_unit="nM",
)

Initial number of samples: 2301 

	 Entries with NaNs: 0.
	 New number of samples: 2301 

	 Entries with non-standard units: 2
	 New number of samples: 2299

	 Entries duplicates: 247
	 New number of samples: 2052

Final number of samples: 2052 



### Get Compound Data

In [11]:
# Ask the molecule resource for the structures of the compounds that remain
# in the cleaned bioactivity table.
molecule_ids = list(bioactivities_df["molecule_chembl_id"])
compounds = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

In [24]:
# Turn the compounds query into a DataFrame.
# NOTE(review): `retrieve_or_create_dataframe_from_query` is not imported in
# any visible cell; judging by its name and the `data_path` argument it
# presumably caches the result on disk — confirm.
compounds_df = retrieve_or_create_dataframe_from_query(
    query=compounds,
    query_name="compounds",
    data_path=DATA_PATH,
    target_name=TARGET_NAME,
)

compounds_df.head()

compounds shape: (2053, 2)


Unnamed: 0,molecule_chembl_id,molecule_structures
0,CHEMBL267373,"{'canonical_smiles': 'NC(=O)c1ccccc1', 'molfil..."
1,CHEMBL267373,"{'canonical_smiles': 'NC(=O)c1ccccc1', 'molfil..."
2,CHEMBL266540,"{'canonical_smiles': 'O=c1[nH]cnc2ccccc12', 'm..."
3,CHEMBL28,{'canonical_smiles': 'O=c1cc(-c2ccc(O)cc2)oc2c...
4,CHEMBL16861,{'canonical_smiles': 'Cc1ccc(-c2cc(=O)c3ccccc3...


In [25]:
# Same cleaning helper as for the bioactivities, here with default options only.
# NOTE(review): the input frame is overwritten, so the raw query result is no
# longer available after this cell runs.
compounds_df = preprocess_compound_activity_dataset(df=compounds_df)

Initial number of samples: 2053 

	 Entries with NaNs: 0.
	 New number of samples: 2053 

	 Entries duplicates: 1
	 New number of samples: 2052

Final number of samples: 2052 



#### Get molecules with canonical SMILES

In [26]:
# Preview the cleaned compounds; `molecule_structures` still holds the nested
# structure record (canonical SMILES, molfile, ...).
compounds_df.head()

Unnamed: 0,molecule_chembl_id,molecule_structures
0,CHEMBL267373,"{'canonical_smiles': 'NC(=O)c1ccccc1', 'molfil..."
1,CHEMBL266540,"{'canonical_smiles': 'O=c1[nH]cnc2ccccc12', 'm..."
2,CHEMBL28,{'canonical_smiles': 'O=c1cc(-c2ccc(O)cc2)oc2c...
3,CHEMBL16861,{'canonical_smiles': 'Cc1ccc(-c2cc(=O)c3ccccc3...
4,CHEMBL275638,{'canonical_smiles': 'O=c1cc(-c2ccccc2)oc2cccc...


### Merge bioactivities and compounds data

In [27]:
# Swap the nested `molecule_structures` record for a plain `canonical_smiles`
# column (see the preview below).
# NOTE(review): `extract_smiles_from_molecular_representation` is not imported
# in any visible cell — confirm its origin.
compounds_df = extract_smiles_from_molecular_representation(compounds_df)
compounds_df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles
0,CHEMBL267373,NC(=O)c1ccccc1
1,CHEMBL266540,O=c1[nH]cnc2ccccc12
2,CHEMBL28,O=c1cc(-c2ccc(O)cc2)oc2cc(O)cc(O)c12
3,CHEMBL16861,Cc1ccc(-c2cc(=O)c3ccccc3o2)cc1
4,CHEMBL275638,O=c1cc(-c2ccccc2)oc2ccccc12


Merge the values of interest from the bioactivity and compound datasets into a single DataFrame, `merged`, based on the compounds' ChEMBL IDs (`molecule_chembl_id`), keeping the following columns:

1. ChEMBL IDs: `molecule_chembl_id`
2. SMILES: `canonical_smiles`
3. Units: `standard_units`
4. IC50: `standard_value`

In [44]:
# Run the full pipeline (fetch, clean, merge) in one call, starting from the
# two lazy queries built earlier.
# NOTE(review): `bioacitivity_query` looks misspelled, but it has to match the
# helper's parameter name — fix it in the helper first if it is to be renamed.
merged = get_bioactivity_compound_data(
    bioacitivity_query=bioactivities,
    compound_query=compounds,
    data_path=DATA_PATH,
    target_name=TARGET_NAME,
)

1. Get and process bioactivity dataset

	Initial number of samples: 2301 

		 Entries with NaNs: 0.
		 New number of samples: 2301 

		 Entries with non-standard units: 2
		 New number of samples: 2299

		 Entries duplicates: 247
		 New number of samples: 2052

	Number of samples: 2052 

2. Get and process compounds dataset

	Initial number of samples: 2053 

		 Entries with NaNs: 0.
		 New number of samples: 2053 

		 Entries duplicates: 1
		 New number of samples: 2052

	Number of samples: 2052 

3. Merge bioactivity and compound dataset

	Final number of samples:  2052


In [45]:
# Show the merged dataset (pandas elides the middle rows in the rendered table).
merged

Unnamed: 0,molecule_chembl_id,standard_value,standard_units,canonical_smiles
0,CHEMBL108702,20.00,nM,Nc1cccc(-c2ccc(C(=O)CNC(=O)CCn3c4c(c(=O)[nH]c3...
1,CHEMBL418816,3000.00,nM,O=C(CCn1c2c(c(=O)[nH]c1=O)CCC2)NCC(=O)N1CCN(c2...
2,CHEMBL108968,30.00,nM,O=C(CCn1c2c(c(=O)[nH]c1=O)CSCC2)NCC(=O)c1ccc(O...
3,CHEMBL430707,35.00,nM,O=C(CCn1c2c(c(=O)[nH]c1=O)CSCC2)NCC(=O)N1CCN(c...
4,CHEMBL321638,15.00,nM,COc1ccc(-c2ccc(C(=O)CNC(=O)CCn3c4c(c(=O)[nH]c3...
...,...,...,...,...
2047,CHEMBL4638701,89.60,nM,O=c1[nH]c(SCCN2CCN(CCSc3nc4ccccc4c(=O)[nH]3)CC...
2048,CHEMBL4636062,4.23,nM,Nc1ccccc1NC(=O)CCCCCNC(=O)c1cc(Cc2n[nH]c(=O)c3...
2049,CHEMBL4639985,1.94,nM,Nc1ccccc1NC(=O)C1CCN(C(=O)c2cc(Cc3n[nH]c(=O)c4...
2050,CHEMBL4642383,1.81,nM,Nc1ccc(F)cc1NC(=O)C1CCN(C(=O)c2cc(Cc3n[nH]c(=O...
