## List of Supported Reference Organs

In [1]:
import requests

# HRA API endpoint for reference organs
REFERENCE_ORGANS_URL = "https://apps.humanatlas.io/api/v1/reference-organs"

# Get all reference organs.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting an error
# payload flow into the parsing below.
response = requests.get(REFERENCE_ORGANS_URL, timeout=60)
response.raise_for_status()
organs = response.json()

# Collect the `representation_of` value of every reference organ
# (presumably the organ's UBERON IRI -- verify against the API response).
uberon_ids = [organ["representation_of"] for organ in organs]

print(f"{len(organs)} Supported Reference Organs")



73 Supported Reference Organs


## Ratio of Registered/Total for HuBMAP

In [8]:
import requests
import re
from datetime import datetime
import os
import csv


# --- User-provided HuBMAP API token ---
# The token is read from the HUBMAP_TOKEN environment variable so that the
# secret is never stored in the notebook or committed to version control.
# NOTE(review): a literal token used to be hardcoded on this line; treat that
# token as leaked and revoke it.
HUBMAP_TOKEN = os.environ.get("HUBMAP_TOKEN", "").strip()
if not HUBMAP_TOKEN:
    print("WARNING: HUBMAP_TOKEN is not set; export it before running this cell, "
          "otherwise authenticated HuBMAP requests are expected to fail.")

# --- Step 1: Extract code_to_uberon mapping from metadata.js ---
# The file location can be overridden with the HRA_METADATA_PATH environment
# variable; the default is the path this notebook was originally written for.
metadata_path = os.environ.get(
    "HRA_METADATA_PATH",
    "/Users/dequeue/Desktop/RUI.nosync/hra-registrations/scripts/metadata.js",
)
code_to_uberon = {}

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(metadata_path, "r", encoding="utf-8") as f:
    content = f.read()

# Each match is a (entry key, `code` field, `organ_id` field) triple.
# The entry key is what gets used as the organ code; the `code` field is
# captured by the pattern but deliberately ignored below.
matches = re.findall(r"(\w+):\s*{\s*code: '(\w+)',\s*label: '[^']+',\s*organ_id: '([^']+)'", content)
for code, _, uberon in matches:
    code_to_uberon[code] = uberon

# An empty mapping would make every later "supported" count silently zero,
# so fail loudly if the file did not have the expected shape.
if not code_to_uberon:
    raise ValueError(f"No organ entries could be parsed from {metadata_path}")

# --- Step 2: Get all reference organs and their UBERON IDs ---
REFERENCE_ORGANS_URL = "https://apps.humanatlas.io/api/v1/reference-organs"
# The timeout avoids hanging the kernel on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an
# error payload as if it were the organ list.
response = requests.get(REFERENCE_ORGANS_URL, timeout=60)
response.raise_for_status()
organs = response.json()

# Extract and normalize the UBERON IDs from the API
def iri_to_curie(iri):
    """Turn an OBO UBERON IRI into its ``UBERON:<id>`` CURIE form.

    Any value that is falsy or does not contain ``obo/UBERON_`` is handed
    back untouched.
    """
    is_uberon_iri = bool(iri) and "obo/UBERON_" in iri
    if not is_uberon_iri:
        return iri
    # The identifier is whatever follows the final underscore.
    local_id = iri.rsplit("_", 1)[-1]
    return f"UBERON:{local_id}"

# Normalized ids of every reference organ that declares a `representation_of`.
reference_uberon_ids = set()
for organ in organs:
    organ_iri = organ.get("representation_of")
    if organ_iri:
        reference_uberon_ids.add(iri_to_curie(organ_iri))

# --- Step 3: Filter supported codes to only those whose UBERON IDs are in the reference set ---
# Insertion order of code_to_uberon is kept.
filtered_supported_codes = []
for organ_code, organ_curie in code_to_uberon.items():
    if organ_curie in reference_uberon_ids:
        filtered_supported_codes.append(organ_code)

print(filtered_supported_codes)

SEARCH_API_URL = "https://search.api.hubmapconsortium.org/v3/search"
headers = {"Authorization": f"Bearer {HUBMAP_TOKEN}"}

# Both queries below are count-only ("size": 0). "track_total_hits": True asks
# the search backend for an exact count; by default Elasticsearch stops
# counting at 10,000, which would silently cap hits.total.value once the
# number of datasets grows past that.

# 1. Total count of all datasets
total_query = {
    "query": {
        "bool": {
            "filter": [
                {"match": {"entity_type.keyword": "Dataset"}}
            ]
        }
    },
    "size": 0,
    "track_total_hits": True
}

# 2. Count of datasets where organ matches filtered_supported_codes
organ_query = {
    "query": {
        "bool": {
            "filter": [
                {"match": {"entity_type.keyword": "Dataset"}},
                {"terms": {"origin_samples.organ.keyword": filtered_supported_codes}},
                {"match": {"origin_samples.sample_category.keyword": "organ"}}
            ]
        }
    },
    "size": 0,
    "track_total_hits": True
}

# Get total count.
# raise_for_status() turns an HTTP error (e.g. an expired token) into an
# immediate exception instead of a KeyError on 'hits' below.
total_datasets_hubmap = requests.post(SEARCH_API_URL, json=total_query, headers=headers, timeout=120)
total_datasets_hubmap.raise_for_status()
total_datasets_count_hubmap = total_datasets_hubmap.json()['hits']['total']['value']

# Get organ-matched count
supported_datasets_hubmap = requests.post(SEARCH_API_URL, json=organ_query, headers=headers, timeout=120)
supported_datasets_hubmap.raise_for_status()
supported_datasets_count_hubmap = supported_datasets_hubmap.json()['hits']['total']['value']

print(f"Total datasets: {total_datasets_count_hubmap}")
print(f"Supported datasets: {supported_datasets_count_hubmap}")

# 3. Count of datasets with organ in filtered_supported_codes AND rui_location present in any ancestor
# Count-only query ("size": 0). "track_total_hits": True requests an exact
# count rather than Elasticsearch's default, which stops counting at 10,000.
organ_with_rui_query = {
    "query": {
        "bool": {
            "filter": [
                {"match": {"entity_type.keyword": "Dataset"}},
                {"terms": {"origin_samples.organ.keyword": filtered_supported_codes}},
                {"match": {"origin_samples.sample_category.keyword": "organ"}},
                {"exists": {"field": "ancestors.rui_location"}}
            ]
        }
    },
    "size": 0,
    "track_total_hits": True
}

# Get count of datasets with rui_location in ancestors.
# Fail fast on an HTTP error instead of hitting a KeyError on 'hits'.
datasets_with_rui_hubmap = requests.post(SEARCH_API_URL, json=organ_with_rui_query, headers=headers, timeout=120)
datasets_with_rui_hubmap.raise_for_status()
datasets_with_rui_hubmap_count = datasets_with_rui_hubmap.json()['hits']['total']['value']

print(f"Registered datasets: {datasets_with_rui_hubmap_count}")

# 4. Find datasets with organ in filtered_supported_codes and NO rui_location in any ancestor
from collections import Counter

datasets_without_rui_query = {
    "query": {
        "bool": {
            "filter": [
                {"match": {"entity_type.keyword": "Dataset"}},
                {"terms": {"origin_samples.organ.keyword": filtered_supported_codes}},
                {"match": {"origin_samples.sample_category.keyword": "organ"}}
            ],
            "must_not": [
                {"exists": {"field": "ancestors.rui_location"}}
            ]
        }
    },
    "_source": ["ancestors"],
    # 10,000 is Elasticsearch's default maximum result window for one request,
    # so simply raising this number is not expected to work; going beyond it
    # needs pagination (e.g. search_after).
    "size": 10000,
    # Ask for an exact total so truncation can be detected below.
    "track_total_hits": True
}

datasets_without_rui_hubmap = requests.post(SEARCH_API_URL, json=datasets_without_rui_query, headers=headers, timeout=300)
# Fail fast on an HTTP error instead of hitting a KeyError on 'hits'.
datasets_without_rui_hubmap.raise_for_status()
unregistered_result = datasets_without_rui_hubmap.json()['hits']
datasets = unregistered_result['hits']

# Make silent truncation visible: the per-block counts computed from
# `datasets` are only complete if every matching dataset was returned.
unregistered_total = unregistered_result.get('total', {}).get('value', len(datasets))
if unregistered_total > len(datasets):
    print(f"WARNING: only {len(datasets)} of {unregistered_total} unregistered datasets were returned; "
          "results based on them are incomplete.")


# 5. Output the three main counts to CSV
# Layout: one row per metric and one column per run; the header row holds the
# run timestamps. The path can be overridden with the HUBMAP_COUNTS_CSV
# environment variable.
csv_path = os.environ.get(
    "HUBMAP_COUNTS_CSV",
    "/Users/dequeue/Desktop/RUI.nosync/hra-registrations/scripts/hubmap_counts.csv",
)
now = datetime.now().strftime("%Y-%m-%d %H:%M")

row_labels = ["Total Datasets", "Supported Datasets", "Registered Datasets"]

# Prepare the new column data
new_column = [total_datasets_count_hubmap, supported_datasets_count_hubmap, datasets_with_rui_hubmap_count]


def _append_counts_column(rows, timestamp, values, labels):
    """Return a new table equal to ``rows`` plus one more run column.

    Args:
        rows: Existing CSV rows (list of lists); may be empty.
        timestamp: Header cell for the new column.
        values: One value per entry of ``labels``, in the same order.
        labels: Row labels, one per metric.

    Returns:
        A list of rows: a header row followed by one row per label, each
        extended by the new column.
    """
    # Blank lines come back from csv.reader as empty lists; drop them so
    # indexing the first cell of a row is always safe.
    table = [list(row) for row in rows if row]

    # Files written by the earlier version of this cell used the transposed
    # layout (metric names in the header, one row per run). Flip such a file
    # so earlier runs are kept instead of being overwritten by row labels.
    legacy_header = [""] + list(labels)
    if table and table[0] == legacy_header and all(len(row) == len(legacy_header) for row in table):
        table = [list(column) for column in zip(*table)]

    # Ensure a header row plus one row per label exists.
    if not table:
        table.append([""])
    while len(table) < len(labels) + 1:
        table.append([""])

    # If the header's first cell is empty, (re)write the row labels.
    if table[0][0] == "":
        for index, label in enumerate(labels, start=1):
            table[index][0] = label

    # Pad ragged rows so the new values line up under the new timestamp.
    managed_rows = table[:len(labels) + 1]
    width = max(len(row) for row in managed_rows)
    for row in managed_rows:
        row.extend([""] * (width - len(row)))

    table[0].append(timestamp)
    for index, value in enumerate(values, start=1):
        table[index].append(value)
    return table


# Check if file exists
file_exists = os.path.isfile(csv_path)

existing_rows = []
if file_exists:
    # Read existing data
    with open(csv_path, "r", newline="") as f:
        existing_rows = list(csv.reader(f))

# Creating the file and appending to it go through the same code path, so
# both always produce the same layout.
rows = _append_counts_column(existing_rows, now, new_column, row_labels)

# Write back to file
with open(csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(rows)

# 6. Find and print the top 10 unregistered blocks by dataset count

# Query all samples and get their uuid and sample_category
samples_query = {
    "query": {
        "bool": {
            "filter": [
                {"match": {"entity_type.keyword": "Sample"}}
            ]
        }
    },
    "_source": ["uuid", "sample_category"],
    # 10,000 is Elasticsearch's default maximum result window for one request,
    # so simply raising this number is not expected to work; going beyond it
    # needs pagination (e.g. search_after).
    "size": 10000,
    # Ask for an exact total so truncation can be detected below.
    "track_total_hits": True
}

samples_resp = requests.post(SEARCH_API_URL, json=samples_query, headers=headers, timeout=300)
# Fail fast on an HTTP error instead of hitting a KeyError on 'hits'.
samples_resp.raise_for_status()
samples_result = samples_resp.json()['hits']
sample_hits = samples_result['hits']

# A truncated sample list means some blocks are missing from the category
# map and would be silently left out of the ranking below.
samples_total = samples_result.get('total', {}).get('value', len(sample_hits))
if samples_total > len(sample_hits):
    print(f"WARNING: only {len(sample_hits)} of {samples_total} samples were returned; "
          "the block ranking below may be incomplete.")

# Build the mapping: sample uuid -> lower-cased sample_category
sample_category_map = {}
for hit in sample_hits:
    src = hit['_source']
    sample_uuid = src.get('uuid')
    if sample_uuid:
        # `or ''` also covers an explicit null category, which would
        # otherwise crash on .lower().
        sample_category_map[sample_uuid] = (src.get('sample_category') or '').lower()


# Count, per block sample, how many unregistered datasets descend from it.
ancestor_id_counter = Counter()

for ds in datasets:
    for ancestor in ds['_source'].get('ancestors') or []:
        ancestor_uuid = ancestor.get('uuid')
        if ancestor_uuid and sample_category_map.get(ancestor_uuid) == 'block':
            ancestor_id_counter[ancestor_uuid] += 1

print("Top 10 blocks by dataset count that lack rui_location:")
for block_uuid, count in ancestor_id_counter.most_common(10):
    print(f"https://portal.hubmapconsortium.org/browse/sample/{block_uuid}: {count}")




['BL', 'BR', 'HT', 'LE', 'LI', 'LK', 'LV', 'PA', 'RE', 'RK', 'SI', 'SK', 'SP', 'TH', 'TR', 'UT']
Total datasets: 9751
Supported datasets: 6366
Registered datasets: 4438
Top 10 blocks by dataset count that lack rui_location:
https://portal.hubmapconsortium.org/browse/sample/a4c30e6b188093a5ad0bea83ce7249c0: 49
https://portal.hubmapconsortium.org/browse/sample/5f27b70a8f50ef38970905020ac24bc3: 49
https://portal.hubmapconsortium.org/browse/sample/33cb9c1e2e3cac4eb39bacb688a5834c: 24
https://portal.hubmapconsortium.org/browse/sample/b07c38f5497330900cd74effb1468aa5: 19
https://portal.hubmapconsortium.org/browse/sample/a77d4bc5656ce4051efc6317f51cf715: 16
https://portal.hubmapconsortium.org/browse/sample/abef90ab7a7becada62c5d7bdda8a025: 16
https://portal.hubmapconsortium.org/browse/sample/c1ec99d9f5da2d64fd7d6d2991604ce8: 16
https://portal.hubmapconsortium.org/browse/sample/0129caf0e66fe67754475dc71e6706de: 8
https://portal.hubmapconsortium.org/browse/sample/ab298067e1f49f85f28444c1b46f476