# Dataset creation for Drugs75K

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

import platformdirs

from polaris.utils.types import HubOwner
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.dataset import DatasetFactory, create_dataset_from_file
from polaris.dataset.converters import SDFConverter

# The string "__file__" is a literal placeholder, not the module attribute, so
# the path is resolved against the current working directory. parents[3] is the
# directory three levels above that working directory, taken as the recipe root.
# NOTE(review): because the working directory is changed below, re-running this
# cell resolves against the new directory and moves `root` further up.
root = pathlib.Path("__file__").absolute().parents[3]
# Make the recipe root importable and use it as the working directory.
sys.path.insert(0, str(root))
os.chdir(root)

In [2]:
# Organization that owns the dataset and the recipe's relative location.
# NOTE(review): other paths in this notebook spell the folder "marcel/drugs-75k";
# confirm that "marcel/drug75k" is the intended value here.
org = "Polaris"
data_name = "marcel/drug75k"

# Local recipe directory and its mirror in the public GCS bucket.
dirname = dm.fs.join(root, "org-" + org, data_name)
gcp_root = "gs://polaris-public/polaris-recipes/org-{}/{}".format(org, data_name)

# Hub owner record; the slug is the lower-cased organization name.
owner = HubOwner(type="organization", slug=org.lower())
owner

HubOwner(slug='polaris', external_id=None, type='organization')

In [3]:
# Sub-folders of the recipe's GCS root, one per artifact type.
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subfolder}" for subfolder in ("benchmarks", "datasets", "figures")
)

## Create dataset

In [4]:
# Location of the raw SDF file to convert.
# Set the DRUGS_SDF_PATH environment variable to point at your own copy of the
# file; when it is unset, the original local path is used as the fallback.
PATH = os.environ.get("DRUGS_SDF_PATH", "/Users/lu.zhu/Downloads/Drugs/Drugs.sdf")

In [5]:
# cache directory 
SAVE_DIR = dm.fs.join(platformdirs.user_cache_dir(appname="polaris-recipes"), "drugs75k")

save_dst = dm.fs.join(SAVE_DIR, "data.zarr")
! rm -r {save_dst}

factory = DatasetFactory(zarr_root_path=save_dst)
factory.register_converter(
    "sdf",
    SDFConverter(
        smiles_column=None,
        mol_prop_as_cols=True,
        split=True,
        mol_column="conformer",
        n_jobs=-1,
        max_num_mols= 5000
    ),
)
# Process your SDF file
factory.add_from_file(PATH)
dataset = factory.build()

rm: /Users/lu.zhu/Library/Caches/polaris-recipes/drugs75k/data.zarr: No such file or directory


[32m2024-07-21 02:57:47.718[0m | [1mINFO    [0m | [36mpolaris.dataset.converters._sdf[0m:[36mconvert[0m:[36m89[0m - [1mNumber of SDFs: 5000[0m


  0%|          | 0/5000 [00:00<?, ?it/s]

[32m2024-07-21 02:58:42.851[0m | [1mINFO    [0m | [36mpolaris.dataset.converters._sdf[0m:[36mconvert[0m:[36m116[0m - [1mLoaded 5000 SDFs.[0m


In [7]:
# Preview the first rows of the dataset table (head() defaults to five rows).
dataset.table.head()

Unnamed: 0,ID,name,smiles,energy,ip,ea,chi,eta,omega,conformer
0,mol32001_0_2,mol32001,COC(=O)[C@@]1(Cc2ccc(OC)cc2)[C@H]2c3cc(C(=O)N(...,-63912.220498,2.877388,3.082858,2.980123,-0.102735,-43.223413,conformer#0
0,mol32001_0_5,mol32001,COC(=O)[C@@]1(Cc2ccc(OC)cc2)[C@H]2c3cc(C(=O)N(...,-63912.187483,2.967116,3.287155,3.127135,-0.16002,-30.555553,conformer#0
0,mol32001_0_18,mol32001,COC(=O)[C@@]1(Cc2ccc(OC)cc2)[C@H]2c3cc(C(=O)N(...,-63912.137821,3.385755,2.941067,3.163411,0.222344,22.503766,conformer#0
0,mol32001_0_3,mol32001,COC(=O)[C@@]1(Cc2ccc(OC)cc2)[C@H]2c3cc(C(=O)N(...,-63912.038996,2.85632,2.704596,2.780458,0.075862,50.953975,conformer#0
0,mol32001_0_25,mol32001,COC(=O)[C@@]1(Cc2ccc(OC)cc2)[C@H]2c3cc(C(=O)N(...,-63912.012402,2.932461,2.437161,2.684811,0.24765,14.553218,conformer#0


### Specify the metadata for the dataset columns

In [8]:
# Column-level metadata: identifier and structure columns are spelled out,
# the conformer-level property columns are generated from (name, text) pairs.
annotations = {
    "ID": ColumnAnnotation(description="Molecule conformer identifier"),
    "name": ColumnAnnotation(description="Molecule identifier"),
    "smiles": ColumnAnnotation(
        description="Molecule SMILES string",
        modality="molecule",
    ),
    "conformer": ColumnAnnotation(description="Conformer pointer to the zarr file."),
    **{
        column: ColumnAnnotation(description=text)
        for column, text in (
            ("energy", "Conformer-level property energy."),
            ("ip", "Conformer-level property ip ."),
            ("ea", "Conformer-level property ea."),
            ("chi", "Conformer-level property chi."),
            ("eta", "Conformer-level property eta."),
            ("omega", "Conformer-level property omega."),
        )
    },
}

In [12]:
# Attach dataset-level metadata before uploading.
# NOTE(review): the name is "drugs5k" while the description refers to Drugs-75K
# with 75,099 molecules; the conversion above was capped at 5000 molecules.
# Confirm that name and description describe the same artifact.
dataset.name = "drugs5k"
dataset.owner = owner
dataset.description = "Drugs-75K is a subset of the GEOM-Drugs dataset, which includes 75,099 molecules with at least 5 rotatable bonds."
# Column-level annotations defined in the previous cell.
dataset.annotations = annotations
dataset.tags = ["conformer", "3D"]
dataset.license = "CC-BY-4.0"
dataset.user_attributes = {"year": "2022"}
# The readme is currently not attached (disabled below).
# dataset.readme = str(load_readme("org-Polaris/marcel/drugs-75k/readme.md"))
dataset.source = "https://arxiv.org/abs/2310.00115"

In [38]:
# NOTE(review): `create_dataset_from_file` is already imported in the first cell;
# this import is redundant.
from polaris.dataset import create_dataset_from_file

# Because Polaris might restructure the Zarr archive,
# we need to specify a location to save the Zarr file to.
# NOTE(review): this drops the existing `dataset` object and rebinds the name to
# a newly created one. The metadata assigned in the previous cell was set on the
# old object and is presumably not carried over; confirm before uploading.
del dataset
dataset = create_dataset_from_file(save_dst, zarr_root_path=dm.fs.join(SAVE_DIR, "zarr", "processed.zarr"))


In [13]:
from polaris.hub.client import PolarisHubClient

# Create a Hub client with its default settings and authenticate;
# the log line below this cell reports whether the login succeeded.
client = PolarisHubClient()
client.login()

[32m2024-07-21 03:00:33.994[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m225[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m


In [14]:
# Upload the dataset under the organization defined at the top of the notebook.
# The owner slug is taken from `owner` instead of repeating the "polaris"
# literal, so the upload target cannot drift from the configured organization.
# `timeout=1000` is passed through to the client for this upload.
client.upload_dataset(dataset=dataset, owner=owner.slug, timeout=1000)

⠙ Uploading dataset... {'_cached': True, '_intrans': False, '_transaction': None, '_invalidated_caches_in_transaction': [], 'dircache': <fsspec.dircache.DirCache object at 0x2f9e12bd0>, '_fs_token_': '65eedb3cf242f19b0beb14c36c6adb6d', 'polaris_client': <polaris.hub.client.PolarisHubClient object at 0x177a43830>, 'default_timeout': 1000, 'prefix': 'dataset/polaris/drugs5k/', 'base_path': '/storage/dataset/polaris/drugs5k', 'storage_args': (), 'storage_options': {'polaris_client': <polaris.hub.client.PolarisHubClient object at 0x177a43830>, 'dataset_owner': HubOwner(slug='polaris', external_id=None, type=None), 'dataset_name': 'drugs5k'}}
⠼ Uploading dataset... 

[32m2024-07-21 03:00:53.152[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m602[0m - [1mCopying Zarr archive to the Hub. This may take a while.[0m


💥 ERROR: Failed to upload dataset. 


  self._color = self._set_color(value) if value else value


KeyboardInterrupt: 

In [36]:
# Try to open the dataset's Zarr archive through the Hub client in read mode.
# NOTE(review): the path is a hard-coded absolute location on one machine and
# differs from `save_dst` used earlier; the recorded output of this cell is an
# error, so confirm whether this cell is still needed.
client.open_zarr_file(
    path="/Users/lu.zhu/Documents/Codebase/ValenceLab/polaris-recipes/org-Polaris/marcel/drugs-75k/drugs75k_dataset/data.zarr",
    owner="polaris",
    name=dataset.name,
    mode="r",
)

PolarisHubError: Error opening Zarr store

In [17]:
import polaris as po

# Load the dataset from the Hub
# NOTE(review): this loads "polaris/drugs75k-test", while the dataset prepared
# above is named "drugs5k"; confirm which artifact is meant to be checked here.
dataset = po.load_dataset("polaris/drugs75k-test")

# Get information on the dataset size
dataset.size()

# Load a datapoint in memory (first row, first column)
dataset.get_data(
    row=dataset.rows[0],
    col=dataset.columns[0],
)

# Or, similarly:
dataset[dataset.rows[0], dataset.columns[0]]

# Get the first 10 rows in memory
dataset[:10]



PolarisHubError: Error opening Zarr store