# PKIS1 dataset creation

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root so that repo-level packages (e.g. `utils`) import cleanly.
# `"__file__"` is a plain string here, not the module attribute, so the path is resolved
# against the current working directory; `parents[2]` is then two directories above it.
# NOTE(review): this is not idempotent — after the `os.chdir` below, re-running the cell
# without restarting the kernel resolves `root` two levels further up.
root = pathlib.Path("__file__").absolute().parents[2]
# Make `root` the working directory for the rest of the notebook.
os.chdir(root)
# Put `root` first on the import path so the import below can find `utils`.
sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Organization slug and base name of the dataset handled in this notebook
org = "polaris"
data_name = "drewry2014_pkis1_subset"

# Hub owner object for that organization
owner = HubOwner(slug=org, type="organization")

# Recipe folder under the repo root, and the matching GCS prefix
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

owner

HubOwner(slug='polaris', external_id=None, type='organization')

In [3]:
# Sub-folders of the GCS prefix for benchmarks, datasets and figures
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [4]:
# Curated CSV stored under the dataset's GCS prefix
PATH = "/".join([gcp_root, "data", "curation", f"{data_name}_curated.csv"])
table = pd.read_csv(PATH)

# Show the available columns
table.columns

Index(['Smiles', 'EGFR_L858R', 'EGFR', 'KIT_T6701', 'KIT_V560G', 'KIT',
       'RET_V804L', 'RET_Y791F', 'RET', 'MOL_smiles', 'MOL_molhash_id',
       'MOL_molhash_id_no_stereo', 'MOL_num_stereoisomers',
       'MOL_num_undefined_stereoisomers', 'MOL_num_defined_stereo_center',
       'MOL_num_undefined_stereo_center', 'MOL_num_stereo_center',
       'MOL_undefined_E_D', 'MOL_undefined_E/Z', 'CLS_EGFR', 'CLS_EGFR_L858R',
       'CLS_KIT', 'CLS_KIT_T6701', 'CLS_KIT_V560G', 'CLS_RET', 'CLS_RET_V804L',
       'CLS_RET_Y791F', 'OUTLIER_EGFR_L858R', 'OUTLIER_EGFR',
       'OUTLIER_KIT_T6701', 'OUTLIER_KIT_V560G', 'OUTLIER_KIT',
       'OUTLIER_RET_V804L', 'OUTLIER_RET_Y791F', 'OUTLIER_RET',
       'AC_CLS_EGFR_L858R', 'AC_CLS_EGFR', 'AC_CLS_KIT_T6701',
       'AC_CLS_KIT_V560G', 'AC_CLS_KIT', 'AC_CLS_RET_V804L',
       'AC_CLS_RET_Y791F', 'AC_CLS_RET'],
      dtype='object')

## Specify the metadata for the data columns

### Create Dataset

In [5]:
# Readout columns: each kinase target followed by / alongside its mutant variants
data_cols = [
    "EGFR_L858R", "EGFR",
    "KIT_T6701", "KIT_V560G", "KIT",
    "RET_V804L", "RET_Y791F", "RET",
]

# Per-column threshold used for the binarized labels:
# 70 for the RET columns, 75 for every other column.
threshold_dict = {
    name: (70 if name.startswith("RET") else 75) for name in data_cols
}

#### Create annotations for data columns

In [6]:
import re

# Build the column metadata:
#   mutant_anno      maps each readout column to its ColumnAnnotation
#   mutant_cls_anno  maps each "CLS_<column>" label column to its ColumnAnnotation
mutant_anno = {}
mutant_cls_anno = {}

for col in data_cols:
    # Column names are "<TARGET>" for the wild type and "<TARGET>_<MUTATION>"
    # for a mutant, e.g. "EGFR" vs. "EGFR_L858R".
    tar = col.split("_")[0]

    # The mutation label is whatever follows the first underscore. The previous
    # pattern expected parentheses ("_(L858R)"), which none of the column names
    # contain, so every column ended up annotated as "with mutation NaN".
    mut_var = re.findall(r"^[^_]+_(\S+)$", col)
    mut_var = mut_var[0] if mut_var else None  # None marks the wild type

    if mut_var is None:
        anno_desc = f"Percentage of inhibition on {tar} wild type"
        cls_desc = f"Binarized label based on the percentage of inhibition on {tar} wild type"
    else:
        anno_desc = f"Percentage of inhibition on {tar} with mutation {mut_var}"
        cls_desc = f"Binarized label based on the percentage of inhibition on {tar} {mut_var}"

    # Annotation of the raw readout column.
    anno = ColumnAnnotation(
        description=anno_desc,
        user_attributes={
            "unit": "%",
            "concentration": "1uM",
            "organism": "Human",
            "objective": "Higher value",
            "source": "PKIS1",
            # Wild-type columns keep the "NaN" placeholder string.
            "mutation": "NaN" if mut_var is None else mut_var,
            "target": tar,
        },
    )

    # Annotation of the binarized label column; `description=` is used for both
    # wild type and mutants (the wild-type case previously passed `protocol=`).
    cls_anno = ColumnAnnotation(
        description=cls_desc,
        user_attributes={
            "thresholds": f"Greater than {threshold_dict[col]}",
            "label_order": "ascending",
            "ref_col": col,
        },
    )

    mutant_anno[col] = anno
    mutant_cls_anno[f"CLS_{col}"] = cls_anno

In [7]:
# Start with the molecule identifier columns ...
pkis_annotations = {
    "MOL_molhash_id": ColumnAnnotation(description="Molecular hash ID. See <datamol.mol.hash_mol>"),
    "MOL_smiles": ColumnAnnotation(modality="molecule", description="Molecule SMILES string"),
}
# ... then append the readout columns, followed by their binarized label columns.
pkis_annotations.update(mutant_anno)
pkis_annotations.update(mutant_cls_anno)

In [8]:
# Versioned dataset name
version = "v3"
dataset_name = "drewry2014_pkis1_subset-" + version

# Assemble the Dataset from the annotated columns only (annotation keys select
# the columns of `table`) together with its descriptive metadata.
dataset = Dataset(
    name=dataset_name,
    owner=owner,
    table=table[list(pkis_annotations)],
    annotations=pkis_annotations,
    description=(
        "A subset of PKIS dataset only including EGFR, RET, KIT kinases. "
        "PKIS is a data set of 367 small-molecule ATP-competitive kinase inhibitors "
        "that was screened by the set in activity assays with 224 recombinant kinases "
        "and 24 G protein-coupled receptors and in cellular assays of cancer cell "
        "proliferation and angiogenesis."
    ),
    readme=load_readme("org-Polaris/drewry2014_pkis1_subset/pkis1_subset_readme.md"),
    source="https://pubmed.ncbi.nlm.nih.gov/24283969/",
    curation_reference="https://github.com/polaris-hub/polaris-recipes/org-Polaris/drewry2014_pkis1_subset/01_pkis1_kinase_data_curation.ipynb",
    license="CC-BY-4.0",
    tags=["Kinase", "HitDiscovery", "Selectivity"],
    user_attributes={"year": "2014"},
)

In [9]:
# Write the dataset under its versioned folder on GCP and show the resulting path
SAVE_DIR = "/".join([DATASET_DIR, dataset_name])
dataset_path = dataset.to_json(SAVE_DIR)

dataset_path

'gs://polaris-public/polaris-recipes/org-polaris/drewry2014_pkis1_subset/datasets/drewry2014_pkis1_subset-v3/dataset.json'

In [10]:
# Upload the dataset to the Polaris Hub.
from polaris.hub.client import PolarisHubClient
client = PolarisHubClient()
# Authenticate first; in the recorded run this opened a browser window for an
# interactive login, so this cell cannot run unattended.
client.login()

# Upload as a private dataset under `owner`; being the last expression of the
# cell, the call's return value is displayed as the cell output.
client.upload_dataset(dataset=dataset, access="private", owner=owner)

[32m2024-07-30 12:13:29.686[0m | [1mINFO    [0m | [36mpolaris.hub.external_auth_client[0m:[36minteractive_login[0m:[36m135[0m - [1mYour browser has been opened to visit:
https://clerk.polarishub.io/oauth/authorize?response_type=code&client_id=agQP2xVM6JqMHvGc&redirect_uri=https%3A%2F%2Fpolarishub.io%2Foauth2%2Fcallback&scope=profile+email&state=ENcF32TILoo8kejdjDCpMB2pcDiNVa&code_challenge=jcYknU_5iUp1c1G1lkOMP-S8N4P_FTVhME9dV3yMC_U&code_challenge_method=S256
[0m
[32m2024-07-30 12:14:16.146[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.external_auth_client[0m:[36minteractive_login[0m:[36m146[0m - [32m[1mSuccessfully authenticated to the Polaris Hub as `lu@valencediscovery.com`! 🎉[0m
[32m2024-07-30 12:14:16.807[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m260[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m


✅ SUCCESS: [1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/polaris/drewry2014_pkis1_subset-v3[0m
 


  self._color = self._set_color(value) if value else value


{'id': 'mody8UPVaDhRcmv8zcGo9',
 'createdAt': '2024-07-30T16:14:17.020Z',
 'deletedAt': None,
 'name': 'drewry2014_pkis1_subset-v3',
 'slug': 'drewry2014-pkis1-subset-v3',
 'description': 'A subset of PKIS dataset only including EGFR, RET, KIT kinases. PKIS is a data set of 367 small-molecule ATP-competitive kinase inhibitors that was screened by the set in activity assays with 224 recombinant kinases and 24 G protein-coupled receptors and in cellular assays of cancer cell proliferation and angiogenesis.',
 'tags': ['Kinase', 'HitDiscovery', 'Selectivity'],
 'userAttributes': {'year': '2014'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': '0.7.9',
 'readme': '![kinase](https://storage.googleapis.com/polaris-public/icons/icons8-fox-60-kinases.png)\n\n## Background:\n Kinases play a crucial role in cellular signalling, making them important targets for drug development. Dysregulation of kinases is frequently implicated in diseases like cancer, inflammation, and neurodeg