This notebook shows how to retrieve all hubmap datasets and filter these down to only the datasets that have a secondary_analysis.h5ad file with CLID annotation

In [1]:
# !pip install requests pandas zarr

In [2]:
import requests
from csv import DictReader, excel_tab
from io import StringIO
import json
import requests

import pandas as pd
import zarr

In [3]:
## get all hubmap datasets
response = requests.get(
    'https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv'
)
metadata = list(DictReader(StringIO(response.text), dialect=excel_tab))
uuids = list(pd.DataFrame(metadata[1:])['uuid'])

In [4]:
len(uuids)

2414

In [5]:
# query the search api for files
search_api = 'https://search.api.hubmapconsortium.org/v3/portal/search'

# warning: the uuids need to be chunked, you can't add all >2000 uuids in one post request

hits = json.loads(
    requests.post(
        search_api,
        json={
            "size": 10000,  # To make sure the list is not truncted, set this high.
            "query": {"ids": {"values": uuids[0:500]}},
            "_source": ["files"],
        },  # Documents are large, so only request the fields we need.
    ).text
)["hits"]["hits"]

# filter for datasets that have at least one file
hits = [h for h in hits if len(h['_source']['files']) > 0]

# filter for datasets that have the secondary_analysis.h5ad file
hits_secondary = [h for h in hits if 'secondary_analysis.h5ad' in map(lambda d: d['rel_path'], h['_source']['files'])]
uuids_secondary = [h['_id'] for h in hits_secondary]

In [6]:
len(uuids_secondary)

72

In [7]:
# for each dataset, check if it has the 'predicted_CLID' as a column in the obs dataframe
# this takes about a minute for 75 entries
uuids_clid = []
for uuid in uuids_secondary:
    try: 
        # get the zarr_url for this dataset and file
        zarr_url = f'https://assets.hubmapconsortium.org/{uuid}/hubmap_ui/anndata-zarr/secondary_analysis.zarr'
        obs_root = zarr.open(zarr_url + "/obs")
        obs_columns = obs_root.attrs['column-order']
        if 'predicted_CLID' in obs_columns: 
            uuids_clid.append(uuid)
    except: 
        break

In [None]:
uuids_clid