This script scrapes the IUPHAR/BPS webpage for targets, producing a dataset comprised of the agonists, antagonists, and allosteric modulators associated with that target (including their SMILES strings)

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
def parse_table(soup, interact_type):
    table = soup.find("table", id=interact_type)

    if not table or not table.find_all("tr"):
        return pd.DataFrame(columns=['Compound', 'Species', 'Action', 'Value', 'Parameter'])
    
    rows = []
    for tr in table.find_all("tr")[1:]:  # Skip the header row
        cells = tr.find_all("td")
        if not cells:
            continue

        row_data = [td.text.strip() for td in cells]
        if len(row_data) < 15:
            continue

        compound_name = row_data[0]
        species = row_data[11]
        action = row_data[12]
        value = row_data[13]
        param = row_data[14]

        if species == "Hs":
            # Find SMILES string
            compound_url = "https://www.guidetopharmacology.org/GRAC/" + cells[0].find("a")["href"]
            smiles = extract_smiles(compound_url)
            rows.append([compound_name, smiles, species, action, value, param])


    df = pd.DataFrame(rows, columns=['Compound', 'Smiles', 'Species', 'Action', 'Value', 'Parameter'])
    return df

def extract_smiles(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find the second tr with class "tablesorter-childRow info"
    # CHANGE TO FINDING TR TITLED "SMILES / InChI / InChIKey"
    trs = soup.find_all("tr", class_="tablesorter-childRow info")
    if len(trs) < 2:
        print("No SMILES found for: ", url)
        return None
    
    tr = trs[1]
    smiles_row = tr.find("tr")
    smiles = smiles_row.find_all("td")[1].text.strip()

    return smiles


In [None]:
urls = {"dopamine_d1" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=214&familyId=20&familyType=GPCR",
        "dopamine_d2" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=215&familyId=20&familyType=GPCR",
        "dopamine_d3" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=216&familyId=20&familyType=GPCR",
        "dopamine_d4" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=217&familyId=20&familyType=GPCR",
        "dopamine_d5" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=218&familyId=20&familyType=GPCR",
        "serotonin_5ht1a" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=1&familyId=1&familyType=GPCR",
        "serotonin_5ht1b" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=2&familyId=1&familyType=GPCR",
        "serotonin_5ht1d" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=3&familyId=1&familyType=GPCR",
        "serotonin_5ht1e" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=4&familyId=1&familyType=GPCR",
        "serotonin_5ht1f" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=5&familyId=1&familyType=GPCR",
        "serotonin_5ht2a" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=6&familyId=1&familyType=GPCR",
        "serotonin_5ht2b" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=7&familyId=1&familyType=GPCR",
        "serotonin_5ht2c" : "https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=8&familyId=1&familyType=GPCR"
        }

data = pd.DataFrame()
for name, url in urls.items():
    print("Procesing", name)

    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    
    for interact_type in ["agonists", "antagonists", "allosterics"]:
        df = parse_table(soup, interact_type)
        df.insert(0, "Target", name)
        df.insert(1, "Type", interact_type)
        data = pd.concat([data, df], ignore_index=True)
    
    time.sleep(1)
    
data


No SMILES found for:  https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=937
No SMILES found for:  https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=937
No SMILES found for:  https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=137
No SMILES found for:  https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=137


Unnamed: 0,Target,Type,Compound,Smiles,Species,Action,Value,Parameter
0,dopamine_d1,agonists,A77636,NCC1OC(Cc2c1ccc(c2O)O)C12CC3CC(C2)CC(C1)C3,Hs,Full agonist,8.7,pKi
1,dopamine_d1,agonists,SKF-75670,CN1CCc2c(C(C1)c1ccccc1)cc(c(c2)O)O,Rn,Full agonist,8.7,pKi
2,dopamine_d1,agonists,SKF-81297,Oc1c(O)cc2c(c1Cl)CCNCC2c1ccccc1,Rn,Full agonist,8.7,pKi
3,dopamine_d1,agonists,tavapadon,CC1=C(C=CC(=C1)OC2=NC=CC=C2C(F)(F)F)C3=C(C)C(=...,Hs,Partial agonist,8.1,pKi
4,dopamine_d1,agonists,dihydrexidine,Oc1cc2CCC3C(c2cc1O)c1ccccc1CN3,Rn,Full agonist,8.0,pKi
...,...,...,...,...,...,...,...,...
1148,serotonin_5ht2c,antagonists,spiperone,Fc1ccc(cc1)C(=O)CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1,Hs,Antagonist,5.6 – 6.2,pKi
1149,serotonin_5ht2c,antagonists,spiramide,Fc1ccc(cc1)OCCCN1CCC2(CC1)C(=O)NCN2c1ccccc1,Hs,Antagonist,5.8,pKi
1150,serotonin_5ht2c,antagonists,SB 204741,O=C(Nc1snc(c1)C)Nc1ccc2c(c1)ccn2C,Hs,Antagonist,5.6,pKi
1151,serotonin_5ht2c,antagonists,AC-90179,COc1ccc(cc1)CC(=O)N(C1CCN(CC1)C)Cc1ccc(cc1)C,Hs,Inverse agonist,5.5,pKi


In [114]:
data.to_csv("../data/processed/iuphar_notgaba.csv", index=False)


In [15]:
import random
random.seed(42)

# Read B3DB data from 'A curated diverse molecular database of blood-brain barrier permeability with chemical descriptors'
data = pd.read_csv("../data/processed/B3DB_classification.tsv", delimiter="\t")
data = data[data['BBB+/BBB-'] == 'BBB+']
data.reset_index(drop=True, inplace=True)
data = data[['compound_name', 'SMILES']]
data['Interaction'] = 0

# Read labeled data from scraped pages
prev = pd.read_csv("../data/processed/iuphar_labeled.csv")
prev = prev[prev['Interaction'] == 1]
prev.reset_index(drop=True, inplace=True)


# Remove rows that correspond to molecules known to interact with GABA
data = data[~data['SMILES'].isin(prev['Smiles'])]
data.reset_index(drop=True, inplace=True)

# Sample n negative examples and created a diff labeled dataset
n = 1000
sampled_data = data.sample(n=n, random_state=42)
sampled_data.columns = ['Compound', 'Smiles', 'Interaction']
merged_data = pd.concat([sampled_data, prev], ignore_index=True)
merged_data.to_csv("../data/processed/iuphar_labeled2.csv", index=False)