In [None]:
! pip install chembl_webresource_client
! pip install rdkit-pypi

In [None]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

In [None]:
# `_dh` is IPython's directory history; its last entry is used as the notebook folder.
HERE = Path(_dh[-1])
# Folder for data files, located next to the notebook.
DATA = HERE.joinpath("data")

In [None]:
# One ChEMBL web-resource handle per entity type queried in this notebook.
targets_api, compounds_api, bioactivities_api = (
    new_client.target,
    new_client.molecule,
    new_client.activity,
)

In [None]:
# Show which class the client uses for the target resource handle.
type(targets_api)

In [None]:
# UniProt accession used below to look up the matching ChEMBL target(s).
uniprot_id = "Q06187"

In [None]:
# Look up targets by UniProt accession, requesting only the fields used later.
target_fields = ("target_chembl_id", "organism", "pref_name", "target_type")
targets = targets_api.get(target_components__accession=uniprot_id).only(*target_fields)
print(f'The type of the targets is "{type(targets)}"')

In [None]:
# Turn the query result into a DataFrame. Note that the name `targets` is
# rebound here from the API result object to the resulting table.
targets = pd.DataFrame.from_records(targets)
targets

In [None]:
# Pick the first row of the target table.
# NOTE(review): confirm the first hit is the intended target entry.
target = targets.iloc[0]
target

In [None]:
# ChEMBL ID of the selected target; used to filter bioactivities below.
chembl_id = target.target_chembl_id
print(f"The target ChEMBL ID is {chembl_id}")

In [None]:
# Fields requested for every bioactivity record.
bioactivity_fields = (
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)
# Restrict to IC50 measurements with an exact ("=") relation and assay type "B"
# for the selected target.
bioactivity_query = dict(
    target_chembl_id=chembl_id, type="IC50", relation="=", assay_type="B"
)
bioactivities = bioactivities_api.filter(**bioactivity_query).only(*bioactivity_fields)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")

In [None]:
# Inspect a single record to see which fields each bioactivity entry carries.
print(f"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}")
bioactivities[0]

In [None]:
# Load the fetched bioactivity records into a DataFrame.
bioactivities_df = pd.DataFrame.from_dict(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
bioactivities_df.drop(["units", "value"], axis=1, inplace=True)
bioactivities_df.head()

In [None]:
bioactivities_df["units"].unique()

In [None]:
# Display the bioactivity table as it currently stands.
bioactivities_df

In [None]:
# Check the column dtypes before converting `standard_value` to float.
bioactivities_df.dtypes

In [None]:
# Cast `standard_value` to float64 so it can be used numerically later on.
bioactivities_df = bioactivities_df.astype({"standard_value": "float64"})
bioactivities_df.dtypes

In [None]:
# Remove every row that has a missing value in any column.
bioactivities_df = bioactivities_df.dropna(axis=0, how="any")
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Report which units occur and how many rows are not given in nM.
observed_units = bioactivities_df["standard_units"].unique()
non_nm_count = (bioactivities_df["standard_units"] != "nM").sum()
print(f"Units in downloaded data: {observed_units}")
print(f"Number of non-nM entries:    {non_nm_count}")

In [None]:
# Keep only the measurements reported in nM.
is_nanomolar = bioactivities_df["standard_units"] == "nM"
bioactivities_df = bioactivities_df[is_nanomolar]
remaining_units = bioactivities_df["standard_units"].unique()
print(f"Units after filtering: {remaining_units}")

In [None]:
# Table size after the unit filter.
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Keep a single row per molecule: the first one encountered.
bioactivities_df = bioactivities_df.drop_duplicates("molecule_chembl_id", keep="first")
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Renumber the rows 0..n-1 after the filtering steps, discarding the old index.
bioactivities_df = bioactivities_df.reset_index(drop=True)
bioactivities_df.head(10)

In [None]:
# Use shorter column names for the remainder of the notebook.
column_renames = {"standard_value": "IC50", "standard_units": "units"}
bioactivities_df = bioactivities_df.rename(columns=column_renames)
bioactivities_df.head(10)

In [None]:
# Final size of the cleaned bioactivity table.
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Query the molecule resource for every compound in the bioactivity table,
# requesting only the ID and the structure representations.
molecule_ids = bioactivities_df["molecule_chembl_id"].tolist()
compounds_provider = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

In [None]:
# Iterate over the query result (with a progress bar) and collect the records in a list.
compounds = list(tqdm(compounds_provider))

In [None]:
# Build the compound table from the collected records.
compounds_df = pd.DataFrame.from_records(compounds)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Preview the first rows of the compound table.
compounds_df.head()

In [None]:
# Discard compounds with a missing value in any column.
compounds_df = compounds_df.dropna(axis=0, how="any")
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Keep one row per molecule ID: the first occurrence.
compounds_df = compounds_df.drop_duplicates("molecule_chembl_id", keep="first")
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# List which structure representations the first compound provides.
compounds_df.iloc[0].molecule_structures.keys()

In [None]:
# Extract the canonical SMILES from each compound's structures mapping.
# Entries without a "canonical_smiles" key yield None. A comprehension is used
# so that no loop variable overwrites the `compounds` record list defined
# earlier (which would break re-running the cell that builds `compounds_df`).
canonical_smiles = [
    structures.get("canonical_smiles")
    for structures in compounds_df["molecule_structures"]
]

compounds_df["smiles"] = canonical_smiles
# The structures mapping is no longer needed once the SMILES are extracted.
compounds_df.drop("molecule_structures", axis=1, inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Remove compounds for which no SMILES string was found.
compounds_df = compounds_df.dropna(axis=0, how="any")
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Row count and column names of the cleaned bioactivity table, ahead of the merge.
print(f"Bioactivities filtered: {bioactivities_df.shape[0]}")
bioactivities_df.columns

In [None]:
# Row count and column names of the cleaned compound table, ahead of the merge.
print(f"Compounds filtered: {compounds_df.shape[0]}")
compounds_df.columns

In [None]:
# Join activity values with compound SMILES on the molecule ID (default inner
# join), then renumber the rows from zero.
activity_columns = ["molecule_chembl_id", "IC50", "units"]
output_df = (
    bioactivities_df[activity_columns]
    .merge(compounds_df, on="molecule_chembl_id")
    .reset_index(drop=True)
)

print(f"Dataset with {output_df.shape[0]} entries.")

In [None]:
# Check the column dtypes of the merged table.
output_df.dtypes

In [None]:
# Preview the first ten merged rows.
output_df.head(10)

In [None]:
import math

def convert_ic50_to_pic50(IC50_value):
    """Convert an IC50 value into a pIC50 value.

    The result is computed as ``9 - log10(IC50_value)``.

    Parameters
    ----------
    IC50_value : float
        IC50 value to convert. The notebook passes values it has filtered to nM.

    Returns
    -------
    float or None
        The pIC50 value, or ``None`` when ``IC50_value`` is not greater than
        zero (the logarithm is undefined there).
    """
    # Guard clause: anything that is not strictly positive has no pIC50.
    if not IC50_value > 0:
        return None
    return 9 - math.log10(IC50_value)
# Compute pIC50 for every row. Applying the converter to the IC50 column
# (instead of row-wise over the whole frame) avoids building a Series per row
# and always yields a Series, including when the table is empty.
output_df["pIC50"] = output_df["IC50"].apply(convert_ic50_to_pic50)

In [None]:
# Preview the table with the new pIC50 column.
output_df.head(10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the pIC50 values, drawn on an explicit figure/axes pair.
sns.set_style("white")
sns.set_palette("Set2")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(output_df["pIC50"], bins=10, kde=False, ax=ax)
ax.set_title("pIC50", fontsize=16)
ax.set_xlabel("pIC50", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.tick_params(axis="both", labelsize=12)
ax.grid(False)
sns.despine(ax=ax)
# The relative path means the image is written to the current working directory.
fig.savefig("pIC50_histogram.png", dpi=600)
plt.show()



In [None]:
# Label each compound by its IC50 value: >= 10000 is "inactive",
# <= 1000 is "active", anything in between is "intermediate".
bioactivity_threshold = [
    "inactive"
    if float(ic50) >= 10000
    else "active"
    if float(ic50) <= 1000
    else "intermediate"
    for ic50 in output_df.IC50
]

In [None]:
# Attach the activity labels as a `class` column; both objects carry the same
# default 0..n-1 index, so the rows line up one-to-one.
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df2 = output_df.join(bioactivity_class)
df2

In [None]:
# Write the labelled dataset to CSV. The Colab `sample_data` folder is used
# when it exists (the original location); otherwise the file goes into the
# notebook's own data folder, so the cell also runs outside Colab.
output_dir = Path("/content/sample_data")
if not output_dir.is_dir():
    output_dir = DATA
    output_dir.mkdir(parents=True, exist_ok=True)
df2.to_csv(output_dir / "output_file_first_step.csv")
df2.head(10)

In [None]:
# Size of the exported, labelled dataset.
print(f"DataFrame shape: {df2.shape}")