Note: this notebook is set up to run in the environment named `polaris_datasets`, which is defined in `env.yml`.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris)
The first step in creating a benchmark is to set up a standard dataset that gives access to the curated data (the curation is demonstrated in `01_D3R_Cathepsin_C25S_data_curation.ipynb`) together with all necessary information about the dataset, such as the data source, descriptions of the endpoints, units, etc.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm
import numpy as np

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root so that the shared `utils` package is importable and
# repo-relative paths resolve from there.
#
# The search starts at the current working directory and walks upwards until it finds
# the directory that holds `utils/docs_utils`. This keeps the cell idempotent: the
# previous `pathlib.Path("__file__").absolute().parents[2]` was relative to the cwd,
# so re-running the cell after the `os.chdir(root)` below climbed two more levels and
# pointed `root` outside the repository.
_cwd = pathlib.Path.cwd()
root = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "utils" / "docs_utils.py").is_file()
        or (candidate / "utils" / "docs_utils").is_dir()
    ),
    None,
)
if root is None:
    # Nothing found: fall back to the original heuristic, i.e. two levels above the
    # working directory (equivalent to `Path("__file__").absolute().parents[2]`).
    root = _cwd.parents[1]

# Work from the repository root and make its top-level packages importable.
os.chdir(root)
sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Organization slug and dataset identifier used to build every path below
org = "polaris"
data_name = "d3r_cathepsin_c25s"

# Recipe folder under the repository root, and its counterpart in the GCP bucket
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

# The dataset is owned by the organization rather than by an individual user
owner = HubOwner(type="organization", slug=org)
owner

HubOwner(slug='polaris', external_id=None, type='organization')

In [3]:
# Sub-folders of the GCP recipe root, one per artefact type
DATASET_DIR = f"{gcp_root}/datasets"
BENCHMARK_DIR = f"{gcp_root}/benchmarks"
FIGURE_DIR = f"{gcp_root}/figures"

## Load existing data

In [4]:
# Curated CSV stored under the GCP recipe root
PATH = f"{gcp_root}/data/curation/{data_name}_curated.csv"

# Read it into a DataFrame and show which columns are available
table = pd.read_csv(filepath_or_buffer=PATH)
table.columns

Index(['CMPD_ID', 'SMILES', 'AFFINITY', 'MOL_smiles', 'MOL_molhash_id',
       'MOL_molhash_id_no_stereo', 'MOL_num_stereoisomers',
       'MOL_num_undefined_stereoisomers', 'MOL_num_defined_stereo_center',
       'MOL_num_undefined_stereo_center', 'MOL_num_stereo_center',
       'MOL_undefined_E_D', 'MOL_undefined_E/Z', 'OUTLIER_AFFINITY',
       'AC_AFFINITY'],
      dtype='object')

### Below we specify the metadata of the data columns

In [5]:
# Column-level metadata: one entry per column that will be kept in the dataset
annotations = {}

# Molecular structure column
annotations["MOL_smiles"] = ColumnAnnotation(
    modality="molecule",
    description="Molecule SMILES string",
)

# Measured endpoint column
annotations["AFFINITY"] = ColumnAnnotation(
    description="Affinity against a C25S Cathepsin S mutant.",
)

### Define `Dataset` object

In [6]:
# Version tag appended to the dataset identifier to form the dataset name
version = "v1"

dataset_name = f"{data_name}-{version}"

In [7]:
dataset = Dataset(
    # The table is the core data-structure required to construct a dataset.
    # Only the annotated columns are kept; an explicit list is used as the column
    # selector instead of a `dict_keys` view.
    table=table[list(annotations)],
    # Additional meta-data on the dataset level.
    name=dataset_name,
    description="Drug Design Data Resource Grand Challenge 4 Dataset: CathepsinS",
    source="https://drugdesigndata.org/about/datasets/2028",
    annotations=annotations,
    tags=["Affinity"],
    owner=owner,
    license="CC-BY-SA-4.0",
    user_attributes={"year": "2020"},
    # Path is relative to the repository root, which is the working directory here.
    readme=load_readme("org-Polaris/d3r_cathepsin_c25s/D3R_Cathepsin_C25S_readme.md"),
    # GitHub file URLs need a `/blob/<ref>/` segment between the repository and the
    # file path; without it the link does not resolve.
    # NOTE(review): assumes the default branch is `main` - confirm.
    curation_reference="https://github.com/polaris-hub/polaris-recipes/blob/main/org-Polaris/d3r_cathepsin_c25s/01_D3R_Cathepsin_C25S_data_curation.ipynb",
)

In [8]:
# save the dataset to GCP
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

[32m2024-07-15 17:18:14.642[0m | [1mINFO    [0m | [36mpolaris._mixins[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


'gs://polaris-public/polaris-recipes/org-polaris/d3r_cathepsin_c25s/datasets/d3r_cathepsin_c25s-v1/dataset.json'

In [9]:
from polaris.hub.client import PolarisHubClient

# Create a hub client and log in
client = PolarisHubClient()
client.login()

# Upload the dataset as private under the organization owner;
# the call's return value is displayed as the cell output.
client.upload_dataset(
    dataset=dataset,
    access="private",
    owner=owner,
)

[32m2024-07-15 17:18:16.154[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m285[0m - [1mYou are already logged in to the Polaris Hub as  (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.[0m
[32m2024-07-15 17:18:18.036[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m631[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/polaris/d3r_cathepsin_c25s-v1[0m


{'id': 'mTX7sTqsiPGntpNcllEvf',
 'createdAt': '2024-07-15T21:18:16.755Z',
 'deletedAt': None,
 'name': 'd3r-cathepsin-c25s-v1',
 'slug': 'd3r-cathepsin-c25s-v1',
 'description': 'Drug Design Data Resource Grand Challenge 4 Dataset: CathepsinS',
 'tags': ['Affinity'],
 'userAttributes': {'year': '2020'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\nThe Drug Design Data Resource (D3R) aims to advance the technology of computer-aided drug discovery through the interchange of high quality protein-ligand datasets and workflows, and by holding community-wide, blinded prediction challenges. The D3R project is based at the University of California San Diego (UCSD), where it is co-directed by Drs. Rommie Amaro and Michael Gilson. An additional D3R component, focused on determining, validating and archiving protein-ligand co-crystal structures, is hosted at Rutgers the State University of New Jersey and led by Dr. Stephen K. Burley, who is Dir