# RUI Reporter - SenNet Analysis

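This notebook analyzes RUI (Registration User Interface) location registration coverage for SenNet datasets. It uses the HRA API to obtain the list of supported reference organs and the SenNet Search API to retrieve dataset metadata.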

## List of Supported Reference Organs

In [4]:
import requests

# HRA API endpoint for reference organs
REFERENCE_ORGANS_URL = "https://apps.humanatlas.io/api/v1/reference-organs"

# Get all reference organs.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP failure right here
# instead of letting an error payload be parsed as if it were the organ list.
response = requests.get(REFERENCE_ORGANS_URL, timeout=30)
response.raise_for_status()
organs = response.json()

# Extract and normalize the UBERON IDs from the API
def iri_to_curie(iri):
    """Convert an UBERON IRI into its ``UBERON:<id>`` CURIE form.

    Values that are empty/None, or that do not contain ``obo/UBERON_``,
    are handed back unchanged. For matching values, the text after the
    last underscore is used as the identifier.
    """
    marker = "obo/UBERON_"
    if not iri or marker not in iri:
        return iri
    # Everything after the final "_" is the local identifier.
    _, _, local_id = iri.rpartition("_")
    return "UBERON:" + local_id

# Build the set of distinct organ identifiers, skipping organs that carry no
# "representation_of" value. Each value is looked up once and then normalized.
reference_uberon_ids = {
    iri_to_curie(target_iri)
    for target_iri in (entry.get("representation_of") for entry in organs)
    if target_iri
}

# Report how many organs the API returned and how many distinct IDs remain.
print(f"{len(organs)} Supported Reference Organs")
print(f"Extracted {len(reference_uberon_ids)} unique UBERON IDs")

73 Supported Reference Organs
Extracted 42 unique UBERON IDs


## Ratio of Registered/Total for SenNet

In [5]:
import requests

# --- Setup reference organs ---
# HRA API endpoint for reference organs
REFERENCE_ORGANS_URL = "https://apps.humanatlas.io/api/v1/reference-organs"

# Get all reference organs.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP failure right here
# instead of letting an error payload be parsed as if it were the organ list.
response = requests.get(REFERENCE_ORGANS_URL, timeout=30)
response.raise_for_status()
organs = response.json()

# Extract and normalize the UBERON IDs from the API
def iri_to_curie(iri):
    """Convert an UBERON IRI into its ``UBERON:<id>`` CURIE form.

    Values that are empty/None, or that do not contain ``obo/UBERON_``,
    are handed back unchanged. For matching values, the text after the
    last underscore is used as the identifier.
    """
    marker = "obo/UBERON_"
    if not iri or marker not in iri:
        return iri
    # Everything after the final "_" is the local identifier.
    _, _, local_id = iri.rpartition("_")
    return "UBERON:" + local_id

# Build the set of distinct organ identifiers, skipping organs that carry no
# "representation_of" value. Each value is looked up once and then normalized.
reference_uberon_ids = {
    iri_to_curie(target_iri)
    for target_iri in (entry.get("representation_of") for entry in organs)
    if target_iri
}

# --- User-provided SenNet API token ---
# The token is read from the SENNET_TOKEN environment variable, falling back
# to an interactive prompt, so the credential is never stored in the notebook.
# NOTE(review): a token was previously hardcoded on this line; treat it as
# leaked and revoke it.
import os
from getpass import getpass

SENNET_TOKEN = os.environ.get("SENNET_TOKEN") or getpass("SenNet API token: ")

# SenNet API endpoints
SEARCH_API_URL = "https://search.api.sennetconsortium.org/search"
# Bearer-token auth header plus JSON content type, sent with every search request.
headers = {"Authorization": f"Bearer {SENNET_TOKEN}", "Content-Type": "application/json"}

# Get total count of all datasets
# "size": 0 requests no documents; only the hit total is read from the reply.
# NOTE(review): "track_total_hits" presumably asks the backend for an exact
# (uncapped) total -- confirm against the search API documentation.
total_query = {
    "version": True,
    "size": 0,
    "track_total_hits": True,
    "query": {
        "bool": {
            "filter": [
                {"term": {"entity_type.keyword": "Dataset"}},
                {"term": {"creation_action.keyword": "Create Dataset Activity"}}
            ]
        }
    }
}

# The timeout prevents the cell from hanging on a stalled connection.
total_response = requests.post(SEARCH_API_URL, json=total_query, headers=headers, timeout=30)

if total_response.status_code == 200:
    total_datasets_count = total_response.json()['hits']['total']['value']
else:
    # Keep the best-effort fallback of 0, but say why, so a failed request
    # (e.g. an expired token) is not mistaken for "no datasets".
    print(f"Error fetching total dataset count: {total_response.status_code}")
    total_datasets_count = 0

# Fetch ALL datasets using pagination
all_datasets = []
page_size = 1000  # Max page size
from_offset = 0

# Everything in the query except the paging offset is the same for each page,
# so build it once outside the loop.
# NOTE(review): if this API is backed by Elasticsearch/OpenSearch, from/size
# paging is normally capped by index.max_result_window (10,000 by default);
# confirm before relying on this loop for larger result sets.
base_datasets_query = {
    "version": True,
    "size": page_size,
    "_source": ["uuid", "sennet_id", "origin_samples.organ", "rui_location", "rui_locations", "ancestors.rui_location"],
    "query": {
        "bool": {
            "filter": [
                {"term": {"entity_type.keyword": "Dataset"}},
                {"term": {"creation_action.keyword": "Create Dataset Activity"}}
            ]
        }
    }
}

while True:
    datasets_query = {**base_datasets_query, "from": from_offset}

    # The timeout prevents the loop from hanging on a stalled connection.
    datasets_response = requests.post(SEARCH_API_URL, json=datasets_query, headers=headers, timeout=60)

    if datasets_response.status_code != 200:
        print(f"Error fetching datasets: {datasets_response.status_code}")
        # Make it explicit that whatever was collected so far is incomplete,
        # so the statistics computed from it are not read as final.
        print(f"WARNING: pagination stopped at offset {from_offset}; dataset list is incomplete")
        break

    datasets_data = datasets_response.json()
    datasets_batch = datasets_data.get('hits', {}).get('hits', [])

    if not datasets_batch:
        break

    all_datasets.extend(datasets_batch)
    from_offset += page_size

    print(f"Fetched {len(all_datasets)} datasets so far...")

    # Stop if we got fewer than page_size results (last page)
    if len(datasets_batch) < page_size:
        break

print(f"Total fetched: {len(all_datasets)} datasets")

# Cross-match ALL datasets with reference organs
def _organ_is_supported(src):
    """True when any origin sample's organ is one of the reference organ IDs."""
    organ_ids = (smp.get('organ') for smp in src.get('origin_samples', []))
    return any(oid and oid in reference_uberon_ids for oid in organ_ids)


def _has_rui_registration(src):
    """True when the dataset itself, or any of its ancestors, carries RUI data."""
    direct_hit = bool(src.get('rui_location') or src.get('rui_locations'))
    # Ancestors are scanned regardless of the direct result; the scan stops at
    # the first ancestor that has RUI data.
    ancestor_hit = any(
        anc.get('rui_location') or anc.get('rui_locations')
        for anc in src.get('ancestors', [])
    )
    return direct_hit or ancestor_hit


# Datasets whose organ has a reference organ; only these are checked for RUI data.
supported_sources = [
    src
    for src in (hit.get('_source', {}) for hit in all_datasets)
    if _organ_is_supported(src)
]

supported_count = len(supported_sources)
registered_count = sum(1 for src in supported_sources if _has_rui_registration(src))

print(f"Total datasets: {total_datasets_count}")
print(f"Supported datasets: {supported_count}")
print(f"Registered datasets: {registered_count}")

Fetched 1000 datasets so far...
Fetched 2000 datasets so far...
Fetched 2147 datasets so far...
Total fetched: 2147 datasets
Total datasets: 2147
Supported datasets: 1466
Registered datasets: 854
