Note: this notebook is meant to be run in the environment named `polaris_datasets`, which is defined in `env.yml`.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local helper modules are picked up.
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


def _find_repo_root(start: pathlib.Path) -> pathlib.Path:
    """Return the closest directory at or above ``start`` that holds ``utils/docs_utils``.

    Searching for the helper module imported below (instead of climbing a fixed
    number of parents from the working directory) keeps this cell idempotent:
    the cell changes the working directory to the root, so a fixed-depth climb
    would land three levels too high when the cell is run a second time.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory (``start`` itself or one of its ancestors) that
        contains ``utils/docs_utils.py`` or a ``utils/docs_utils`` package.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains it.
    """
    for candidate in (start, *start.parents):
        helper = candidate / "utils" / "docs_utils"
        if helper.with_suffix(".py").is_file() or helper.is_dir():
            return candidate
    raise FileNotFoundError(
        f"Could not find 'utils/docs_utils' in {start} or any of its parent directories."
    )


root = _find_repo_root(pathlib.Path.cwd())
# Relative paths used later in the notebook are resolved against the repo root.
os.chdir(root)
# Keep the root first on sys.path without stacking duplicates on re-runs.
if not sys.path or sys.path[0] != str(root):
    sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Organization and dataset identifiers used throughout the notebook.
org = "Graphium"
data_name = "l1000_mcf7"

# Local folder for this dataset and the matching prefix in the public bucket.
dirname = dm.fs.join(root, "org-" + org, data_name)
gcp_root = "/".join(["gs://polaris-public/polaris-recipes", "org-" + org, data_name])

# The owner slug is the lower-cased organization name.
owner = HubOwner(type="organization", slug=org.lower())
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# Bucket sub-folders for benchmarks, datasets and figures, all under `gcp_root`.
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [10]:
# Read the raw LINCS L1000 MCF7 table from the bucket and list its columns.
PATH = "/".join((gcp_root, "data", "raw", "LINCS_L1000_MCF7_0-4.csv"))
table = pd.read_csv(filepath_or_buffer=PATH)
table.columns

Index(['full_id', 'pert_id', 'cell_iname', 'SMILES', 'inchi_key',
       'compound_aliases', 'geneID-10007', 'geneID-1001', 'geneID-10013',
       'geneID-10038',
       ...
       'geneID-9918', 'geneID-9924', 'geneID-9926', 'geneID-9928',
       'geneID-993', 'geneID-994', 'geneID-9943', 'geneID-9961', 'geneID-998',
       'geneID-9988'],
      dtype='object', length=984)

In [14]:
# Preview a single row; with several hundred columns the full table is too wide to display.
table.head(n=1)

Unnamed: 0,full_id,pert_id,cell_iname,SMILES,inchi_key,compound_aliases,geneID-10007,geneID-1001,geneID-10013,geneID-10038,...,geneID-9918,geneID-9924,geneID-9926,geneID-9928,geneID-993,geneID-994,geneID-9943,geneID-9961,geneID-998,geneID-9988
0,CRCGN004_MCF7_6H:BRD-K91960538-001-06-8:10,BRD-K91960538,MCF7,CN(C)[N+][O-],UMFJAHHVKNCGLG-UHFFFAOYSA-N,,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


### Specify the metadata for the data columns

In [15]:
# Column-level metadata. The `Dataset` cell further down selects only the
# annotated columns from `table`, so this mapping also decides which columns
# end up in the dataset.
# NOTE(review): `inchi_key` is present in the raw table but is not annotated
# here, so it is left out of the dataset — confirm this is intentional.
annotations = {
    "full_id": ColumnAnnotation(description="full_id"),
    "pert_id": ColumnAnnotation(description="pert_id"),
    "cell_iname": ColumnAnnotation(description="Cell line name"),
    # The SMILES column holds the molecular structures, hence the molecule modality.
    "SMILES": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    "compound_aliases": ColumnAnnotation(
        description="Molecule identifier",
    ),
    # The geneID-* columns hold per-gene readouts rather than molecules, so
    # they keep the default modality instead of being tagged as "molecule".
    **{
        col: ColumnAnnotation(description=col)
        for col in table.columns
        if col.startswith("geneID")
    },
}

### Define `Dataset` object

In [16]:
# Versioned dataset name of the form "<data_name>-<version>".
version = "v1"
dataset_name = "-".join((data_name, version))

In [18]:
# Restrict the table to the annotated columns, in the order they were declared.
annotated_table = table[list(annotations)]

# NOTE(review): the upload response saved further down in this notebook shows
# readme text about ZINC12K / ToyMix — check that this readme file actually
# describes the L1000 data before publishing.
readme_text = load_readme(f"org-Graphium/l1000/{data_name}/readme.md")

dataset = Dataset(
    # Core data structure of the dataset, together with its column-level metadata.
    table=annotated_table,
    annotations=annotations,
    # Dataset-level metadata.
    name=dataset_name,
    description="The LINCS L1000 is a database of high-throughput transcriptomics that screened more than 30,000 perturbations on a set of 978 landmark genes from human breast cancer cell line.",
    source="https://pubmed.ncbi.nlm.nih.gov/29195078/",
    tags=["LargeMix", "transcriptomics"],
    owner=owner,
    license="CC-BY-4.0",
    user_attributes={"year": "2017"},
    readme=readme_text,
)

In [19]:
# Write the dataset as JSON under the bucket's dataset folder and show the
# path returned by `to_json`.
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

[32m2024-07-17 00:13:31.759[0m | [1mINFO    [0m | [36mpolaris._mixins[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


'gs://polaris-public/polaris-recipes/org-Graphium/l1000_mcf7/datasets/l1000_mcf7-v1/dataset.json'

In [20]:
# NOTE: this import lives in this cell rather than in the first cell with the
# other imports.
from polaris.hub.client import PolarisHubClient

# Authenticate against the Polaris Hub. The logged output of this cell shows
# that an existing session is reused unless `overwrite=True` is passed.
# NOTE(review): the saved output of this cell includes the logged-in user's
# email address — consider clearing it before sharing the notebook.
client = PolarisHubClient()
client.login()

# Upload the dataset with private access under the organization owner; the
# value returned by the call is displayed as the cell output.
client.upload_dataset(dataset=dataset, access="private", owner=owner)

[32m2024-07-17 00:14:38.773[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m285[0m - [1mYou are already logged in to the Polaris Hub as  (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.[0m
[32m2024-07-17 00:14:45.091[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m631[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/graphium/l1000_mcf7-v1[0m


{'id': '7C7RxULp5PcqQkiNRdoKK',
 'createdAt': '2024-07-17T04:14:39.817Z',
 'deletedAt': None,
 'name': 'l1000-mcf7-v1',
 'slug': 'l1000-mcf7-v1',
 'description': 'The LINCS L1000 is a database of high-throughput transcriptomics that screened more than 30,000 perturbations on a set of 978 landmark genes from human breast cancer cell line.',
 'tags': ['LargeMix', 'transcriptomics'],
 'userAttributes': {'year': '2017'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\nZINC12K is a well-known dataset for researchers in GNN expressivity. We include it in our ToyMix since GNN expressivity is very important for performance on large-scale data. Hence, we hope that the performance on this task will correlate well with the performance when scaling.\n\n## Assay information\n\n\n## Description of readout:\n\n\n## Data resource\n\n',
 'ownerId': 'zMTB7lQiiukqEmLQF7EjT',
 'creatorId': 'NKnaHGybLqwSHcaMEHqfF',
 'state': 'upload_pending',
 'source': 'ht