In [1]:
# Goals of notebook:
## 1. Access ChEMBL database
## 2. Shortlist active compounds

In [2]:
# !pip install chembl_webresource_client
from chembl_webresource_client.new_client import new_client
import pandas as pd
import re
from tqdm import tqdm

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version


In [3]:
# Handles for the ChEMBL web-service resources used in this notebook
activity, assay, molecule, target = (
    new_client.activity,
    new_client.assay,
    new_client.molecule,
    new_client.target,
)

In [4]:
# Query criteria: human single-protein targets with a KRAS synonym
kras_query = {
    "target_synonym__icontains": "KRAS",
    "target_type": "SINGLE PROTEIN",
    "organism": "Homo sapiens",
}
kras_targets = target.filter(**kras_query)

# ChEMBL identifiers of every matching target
kras_target_ids = [hit["target_chembl_id"] for hit in kras_targets]

print(f"Found {len(kras_target_ids)} KRAS targets")

Found 1 KRAS targets


In [5]:
# Convert the target records to a DataFrame for inspection


def _first_accession(record):
    """Return the accession of the record's first target component, or None.

    A missing, None, or empty "target_components" entry yields None instead
    of raising, so a sparse record cannot break the table.
    """
    components = record.get("target_components") or [{}]
    return components[0].get("accession")


TARGET_COLUMNS = [
    "target_chembl_id",
    "pref_name",
    "organism",
    "target_type",
    "uniprot_accessions",
]

df_targets = pd.DataFrame(
    [
        {
            "target_chembl_id": t["target_chembl_id"],
            "pref_name": t["pref_name"],
            "organism": t["organism"],
            "target_type": t["target_type"],
            "uniprot_accessions": _first_accession(t),
        }
        for t in kras_targets
    ],
    # Explicit columns keep the schema even when no target matched
    columns=TARGET_COLUMNS,
)

df_targets.head()

Unnamed: 0,target_chembl_id,pref_name,organism,target_type,uniprot_accessions
0,CHEMBL2189121,GTPase KRas,Homo sapiens,SINGLE PROTEIN,P01116


In [6]:
# Keep assays on the KRAS target(s) whose description mentions the G12D
# mutation AND whose confidence score is >= MIN_CONFIDENCE_SCORE.
MIN_CONFIDENCE_SCORE = 8

candidate_assays = []

for tid in kras_target_ids:
    for a in assay.filter(target_chembl_id=tid):
        # Description may be missing/None; treat that as "no mention of G12D"
        desc = (a.get("description") or "").lower()
        if "g12d" not in desc:
            continue

        # A record without a confidence score cannot satisfy the threshold
        score = a.get("confidence_score")
        if score is None or int(score) < MIN_CONFIDENCE_SCORE:
            continue

        candidate_assays.append({
            "assay_chembl_id": a["assay_chembl_id"],
            "assay_type": a.get("assay_type"),
            "description": a.get("description")
        })

print(f"Candidate G12D assays: {len(candidate_assays)}")

Candidate G12D assays: 85


In [7]:
# Convert the candidate assays to a DataFrame for inspection.
# The records already carry exactly these keys, so they are passed straight
# through; naming the columns keeps the schema even if no assay matched.
ASSAY_COLUMNS = ["assay_chembl_id", "assay_type", "description"]
df_assays = pd.DataFrame(candidate_assays, columns=ASSAY_COLUMNS)

# Preview up to five random rows. The fixed seed makes the preview
# reproducible, and capping n avoids a ValueError with fewer than five rows.
df_assays.sample(n=min(5, len(df_assays)), random_state=0)

Unnamed: 0,assay_chembl_id,assay_type,description
49,CHEMBL5161453,B,Inhibition of KRAS G12D mutant downstream sign...
75,CHEMBL5553095,B,Binding affinity to biotinylated KRAS G12D mut...
50,CHEMBL5217977,B,Binding affinity to KRAS G12D mutant (unknown ...
1,CHEMBL2399317,B,Binding affinity to KRAS G12D mutant (unknown ...
78,CHEMBL5737727,B,Ras GTP Binding Domain Inhibition Assay: The f...


In [8]:
# Distinct assay-type codes among the candidates (all appear to be binding, 'B')
assay_types = pd.unique(df_assays["assay_type"])
print(assay_types)

['B']


In [9]:
# Distinct assay descriptions, and how many there are
descriptions = pd.unique(df_assays["description"])
descriptions.shape

(83,)

In [10]:
# Read through every description: can cell-based assays, non-allosteric-site
# binders and covalent inhibitors be filtered out from the text alone?
for description_text in descriptions:
    print(description_text)

Binding affinity to K-Ras G12D mutant-GDP complex
Binding affinity to KRAS G12D mutant (unknown origin)
Inhibition of biotinylated K-Ras G12D mutant (unknown origin) assessed as reduction in SOS1 (564 to 1049 residues)-mediated BODIPY-GDP-GTP exchange after 1 hr by TR-FRET assay
Inhibition of biotinylated K-Ras G12D mutant (unknown origin) assessed as reduction in SOS1 (564 to 1049 residues)-mediated BODIPY-GDP-GTP exchange after 1 hr in presence of DTT by TR-FRET assay
Binding affinity to human N-terminal His/AVi-tagged biotinylated GTPase KRas G12D mutant (1 to 169 residues) expressed in Escherichia coli BL21 (DE3) in presence of GDP by SPR assay
Binding affinity to human N-terminal His/AVi-tagged biotinylated GTPase KRas G12D mutant (1 to 169 residues) expressed in Escherichia coli BL21 (DE3) in presence of GTP by SPR assay
Binding affinity to human N-terminal His/AVi-tagged biotinylated GTPase KRas G12D mutant (1 to 169 residues) expressed in Escherichia coli BL21 (DE3) assessed as

In [None]:
# Assays to leave OUT
## Cellular assays

In [None]:
# 1. Substrings that mark an assay description for exclusion.
#    They are matched literally and case-insensitively.
exclusion_keywords = [
    'cell'
]

# 2. Build one alternation pattern; escaping keeps keywords that contain
#    regex metacharacters (e.g. '(' or '+') from being misinterpreted.
pattern = '|'.join(re.escape(keyword) for keyword in exclusion_keywords)

# 3. Drop every row of df_assays whose 'description' matches the pattern.
#    An empty keyword list would yield an empty pattern that matches every
#    row, so in that case nothing is excluded.
if exclusion_keywords:
    is_excluded = df_assays['description'].str.contains(pattern, case=False, na=False)
else:
    is_excluded = pd.Series(False, index=df_assays.index)

# NOTE(review): the bare substring 'cell' also hits descriptions that only
# mention an expression host (e.g. "expressed in Escherichia coli ... cells");
# confirm whether such assays should really be dropped.
descriptions_filtered = df_assays[~is_excluded].copy()

descriptions_filtered.shape

(65, 3)

In [25]:
# Sanity check: list the assays that the exclusion pattern removes
descriptions_filtered_test = df_assays[df_assays['description'].str.contains(pattern, case=False, na=False)].copy()

# Print the shape explicitly; a bare expression in the middle of a cell
# is evaluated but never displayed.
print(descriptions_filtered_test.shape)
for d in descriptions_filtered_test['description']:
    print(d)

Inhibition of KRAS G12D mutant in human AGS cells assessed as reduction in ERK phosphorylation measured after 3 hrs by Western blot analysis
Binding affinity to 15N-labelled GDP bound KRas G12D mutant ( 1 to 169 residues) (unknown origin) expressed in Escherichia coli Rosetta 2 (DE3) cells by 1H/15N- HSQC spectroscopy
Inhibition of KRas G12D mutant (unknown origin) expressed in HEK293T cells coexpressing GFP-fused CRAF RBD assessed as reduction in Ras-Raf interaction using coelenterazine 400a as substrate incubated for 20 hrs by BRET2 assay
Inhibition of KRas G12D mutant (unknown origin) expressed in HEK293T cells coexpressing GFP-fused CRAF RBD assessed as reduction in Ras-Raf interaction using coelenterazine 400a as substrate up to 15 uM incubated for 20 hrs by BRET2 assay
Inhibition of KRAS G12D mutant in human AGS cells assessed as reduction in ERK phosphorylation measured after 3 hrs by In-Cell Western assay
Inhibition of KRAS in human ASPC1 cells harboring KRAS G12D mutant assess

In [None]:
# 4. Optional: tag each retained assay as 'Cellular' or 'Biochemical' for
#    easier analysis later. The tag goes into a new 'assay_format' column so
#    the existing 'assay_type' column is not overwritten.
CELLULAR_MARKERS = ['CELL', 'AGS', 'ASPC1', 'PANC-1', 'GP2D']


def _assay_format(description):
    """Return 'Cellular' if the upper-cased description contains any marker, else 'Biochemical'.

    Non-string values (None/NaN) are treated as an empty description.
    """
    text = description.upper() if isinstance(description, str) else ''
    return 'Cellular' if any(marker in text for marker in CELLULAR_MARKERS) else 'Biochemical'


# Build a new frame rather than mutating descriptions_filtered in place,
# so re-running this cell is idempotent.
descriptions_tagged = descriptions_filtered.assign(
    assay_format=descriptions_filtered['description'].map(_assay_format)
)

print(f"Original assays: {len(df_assays)}")
print(f"Cleaned assays: {len(descriptions_filtered)}")

In [None]:
# Activity records for every candidate G12D assay, restricted to the fields
# the later cells read.
# NOTE(review): this queries all of df_assays, not descriptions_filtered,
# so the cellular-assay exclusion is not applied here - confirm that is intended.
# NOTE(review): 'confidence_score' is kept from the original field list;
# confirm the activity endpoint actually exposes it.
ACTIVITY_FIELDS = [
    'molecule_chembl_id',
    'target_chembl_id',
    'standard_type',
    'standard_value',
    'standard_units',
    'standard_relation',
    # Raw (non-standardised) columns that later cells read
    'type',
    'units',
    'relation',
    'assay_type',
    'confidence_score',
    'canonical_smiles',
    'data_validity_comment',
]

activities_query = activity.filter(
    assay_chembl_id__in=df_assays["assay_chembl_id"].unique().tolist()
).only(*ACTIVITY_FIELDS)

In [None]:
# Ask the server how many records the query matches
total_count = len(activities_query)
print(f'Total records found on server: {total_count}')

# Pull every record, showing a progress bar while iterating
activity_list = list(
    tqdm(activities_query, total=total_count, desc="Fetching ChEMBL Data")
)

# Tabulate the records and save the raw download as CSV
csv_filename = "KRAS_G12D_chemBL_raw.csv"
df_activities = pd.DataFrame(activity_list)
df_activities.to_csv(csv_filename, index=False)

print('\nDownload complete!')
print(f'Total records saved: {len(df_activities)}')
if not df_activities.empty:
    measurement_types = df_activities["standard_type"].unique()
    print(f'Types of measurements found: {measurement_types}')

In [None]:
# How often each (standardised measurement type, unit) pair occurs
df_activities.value_counts(subset=['standard_type', 'standard_units'])

In [None]:
# How often each (raw measurement type, raw unit) pair occurs
df_activities.value_counts(subset=['type', 'units'])

In [None]:
# Shortlist the actives: exact ("=") Kd / IC50 measurements with a numeric value.
# TODO: further filtering of the active list is still needed here.

accepted_types = ["Kd", "IC50"]

# The activity query requests 'standard_relation'; fall back to the raw
# 'relation' column only if the standardised one is absent.
relation_col = "standard_relation" if "standard_relation" in df_activities.columns else "relation"

is_exact = df_activities[relation_col] == "="
is_accepted_type = df_activities["standard_type"].isin(accepted_types)

df_filtered = (
    df_activities[is_exact & is_accepted_type]
    # assign() returns a new frame, avoiding SettingWithCopyWarning on a slice;
    # missing and non-numeric values both become NaN
    .assign(standard_value=lambda frame: pd.to_numeric(frame["standard_value"], errors="coerce"))
    # keep only rows that have a usable numeric value
    .dropna(subset=["standard_value"])
)

In [None]:
# Count the distinct molecules left after filtering (the known actives)
n_active_molecules = df_filtered["molecule_chembl_id"].nunique()
print(n_active_molecules)

In [None]:
# Only IC50 values reported in nM are used for the plots.
# The unit is checked explicitly so values on another scale cannot slip
# into a frame (and axis) labelled nM.

is_ic50 = df_filtered["standard_type"] == "IC50"
is_nanomolar = df_filtered["standard_units"] == "nM"

df_IC50_nM = df_filtered[is_ic50 & is_nanomolar]
print(df_IC50_nM["molecule_chembl_id"].nunique())

In [None]:
# ============================================================
# ACTIVITY DISTRIBUTION VISUALIZATION
# ============================================================
# Two-panel figure of the IC50 values in df_IC50_nM: a histogram with the
# activity threshold and the mean marked, and a box plot. The figure is
# saved under figures/ and then shown.

from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

ACTIVITY_THRESHOLD_NM = 1000.0  # position of the 'Activity threshold' line
FIGURE_PATH = Path('figures') / 'activity_distribution.png'

ic50_values = df_IC50_nM["standard_value"]
ic50_mean = ic50_values.mean()  # computed once, used for both the line and its label

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Histogram
ax1 = axes[0]
ax1.hist(ic50_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
ax1.axvline(x=ACTIVITY_THRESHOLD_NM, color='red', linestyle='--', linewidth=2, label='Activity threshold')
ax1.axvline(x=ic50_mean, color='green', linestyle='-', linewidth=2, label=f'Mean: {ic50_mean:.2f}')
ax1.set_xlabel('IC50 (nM)', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Activity Distribution', fontsize=13, fontweight='bold')
ax1.legend()

# Box plot
ax2 = axes[1]
bp = ax2.boxplot(ic50_values, patch_artist=True)
bp['boxes'][0].set_facecolor('lightblue')
ax2.set_ylabel('IC50 (nM)', fontsize=12)
ax2.set_title('Activity Box Plot', fontsize=13, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories, so make sure figures/ exists
FIGURE_PATH.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(FIGURE_PATH, dpi=300, bbox_inches='tight')
plt.show()