# Download Kidney Datasets from HuBMAP

## Imports / functions

In [1]:
import json
import os
import requests
import warnings
from urllib.request import urlretrieve

In [2]:
def retrieve_files_remote(uuid, file_name, outdir='.'):
    '''
    For a given UUID and file name, retrieve this file and save it locally.

    The file is fetched from
    ``https://assets.hubmapconsortium.org/<uuid>/<file_name>`` and stored at
    ``<outdir>/<uuid>/<file_name>``. Missing intermediate directories are
    created. If the destination file already exists, nothing is downloaded.

    Parameters
    ----------
    uuid : str
        UUID of dataset
    file_name : str
        relative location of desired file (may contain '/'-separated
        subfolders).
    outdir : str, optional
        name of output folder. Default: '.'

    Returns
    -------
    int
        1 if the file was downloaded, 0 if it was already present locally.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises on failure is propagated;
    in that case no file is left at the destination path.
    '''
    url = 'https://assets.hubmapconsortium.org/' + uuid + '/' + file_name

    # file_name uses '/' as separator (it is also part of the URL); split it so
    # the local path is built with the platform's separator.
    destination = os.path.join(outdir, uuid, *file_name.split('/'))

    if os.path.exists(destination):
        return 0

    # unlike os.mkdir, os.makedirs creates directories recursively
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    # Download to a temporary sibling file first: if the transfer is
    # interrupted, a truncated file must not be mistaken for a finished
    # download by the existence check above on the next run.
    partial = destination + '.part'
    try:
        urlretrieve(url, partial)
        os.replace(partial, destination)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # this only removes leftovers from a failed download.
        if os.path.exists(partial):
            os.remove(partial)
    return 1

In [3]:
search_api = "https://search.api.hubmapconsortium.org/v3/portal/search"

def get_uuids():
    '''
    get all uuids for the kidney datasets we want to use

    Posts a single query to ``search_api`` that selects documents which

    * have a file whose relative path is exactly ``expr.h5ad``,
    * have neither a ``next_revision_uuid`` nor a ``sub_status`` field
      (presumably: not superseded by a newer revision -- confirm against the
      HuBMAP search index documentation),
    * have ``Kidney (Left)`` or ``Kidney (Right)`` as mapped organ of an
      origin sample.

    At most 10000 hits are requested.

    Returns
    -------
    list of str
        The ``uuid`` of every hit, in the order returned by the search API.

    Raises
    ------
    requests.HTTPError
        If the search API answers with an error status.
    '''
    query = {
        "size": 10000,
        "query": {
            "bool": {
                "must": [
                    {"term":{ "files.rel_path.keyword": "expr.h5ad" }}
                ],
                "must_not":[{"exists":{"field":"next_revision_uuid"}},{"exists":{"field":"sub_status"}}],
                "should":[
                    {"term":{"origin_samples.mapped_organ.keyword":"Kidney (Left)"}},
                    {"term":{"origin_samples.mapped_organ.keyword":"Kidney (Right)"}}
                ],
                "minimum_should_match" : 1,
            }
        },
        # only the uuid is needed, so do not transfer the full documents
        "_source": { "includes": ["uuid"] },
    }

    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the notebook forever.
    response = requests.post(search_api, json=query, timeout=60)
    # Fail with a clear HTTP error instead of a KeyError / JSON decode error
    # further down when the API rejects the request.
    response.raise_for_status()

    hits = response.json()["hits"]["hits"]
    return [ hit["_source"]["uuid"] for hit in hits ]

In [4]:
entity_api = "https://entity.api.hubmapconsortium.org/entities/"
ancestors_api = "https://entity.api.hubmapconsortium.org/ancestors/"

def save_dataset_metadata(uuid, outdir="."):
    '''
    Fetch the metadata of one dataset and save it as JSON next to its files.

    The entity record is requested from ``entity_api``, the response of
    ``ancestors_api`` for the same UUID is stored under the key
    ``"ancestors"``, and the combined record is written to
    ``<outdir>/<uuid>/dataset.json`` (indented by 2 spaces).

    Parameters
    ----------
    uuid : str
        UUID of dataset
    outdir : str, optional
        name of output folder. Default: '.'

    Returns
    -------
    dict
        The entity record including the added ``"ancestors"`` entry.

    Raises
    ------
    requests.HTTPError
        If either API answers with an error status.
    '''
    # requests never times out by default; bound the wait on both calls.
    entity_response = requests.get(entity_api + uuid, timeout=60)
    # Do not write an API error payload to disk as if it were metadata.
    entity_response.raise_for_status()
    dataset = entity_response.json()

    ancestors_response = requests.get(ancestors_api + uuid, timeout=60)
    ancestors_response.raise_for_status()
    dataset["ancestors"] = ancestors_response.json()

    # The dataset folder normally exists already from the file download, but
    # create it if needed so this function also works on its own.
    dataset_dir = os.path.join(outdir, uuid)
    os.makedirs(dataset_dir, exist_ok=True)

    with open(os.path.join(dataset_dir, 'dataset.json'), mode='w') as f:
        json.dump(dataset, f, indent=2)

    return dataset

In [5]:
# Query the search API once; the resulting list drives all later cells.
uuids = get_uuids()
dataset_count = len(uuids)
print(dataset_count, "kidney datasets with an h5ad file")

128 kidney datasets with an h5ad file


In [6]:
# Fetch both h5ad files of every dataset into ./datasets/<uuid>/.
# retrieve_files_remote returns 1 per new download and 0 for files that are
# already present, so the sum is the number of files fetched in this run.
files_downloaded = 0
for uuid in uuids:
    for h5ad_name in ("expr.h5ad", "secondary_analysis.h5ad"):
        files_downloaded = files_downloaded + retrieve_files_remote(uuid, h5ad_name, "datasets")
print(files_downloaded, "files downloaded")

256 files downloaded


In [7]:
# Store a dataset.json for every dataset and keep the records in memory.
datasets = []
for uuid in uuids:
    record = save_dataset_metadata(uuid, 'datasets')
    datasets.append(record)
print(len(datasets), "datasets' metadata saved")

128 datasets' metadata saved
