# RUI Registration Daily Tracker - HuBMAP & SenNet Combined Analysis

This notebook tracks RUI location registration coverage for both HuBMAP and SenNet datasets over time. It generates daily counts for:
- Total datasets
- Supported datasets (in organs covered by reference anatomy)
- Registered datasets (with RUI locations)

Results are saved to a CSV file with dates as columns and metrics as rows for time-series analysis.

## 1. Import Required Libraries

In [None]:
import requests
import re
import csv
import os
from datetime import datetime
from collections import Counter
import pandas as pd

# Confirm the imports ran and show when this session started.
print("Libraries imported successfully")
startup_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Current date/time: {startup_stamp}")

## 2. Set API Tokens and Endpoints

In [None]:
# API tokens.
# Read from the HUBMAP_TOKEN / SENNET_TOKEN environment variables so that real
# credentials never have to be saved inside the notebook. When a variable is
# not set, the original "xyz" placeholder is used, exactly as before.
HUBMAP_TOKEN = os.environ.get("HUBMAP_TOKEN", "xyz")
SENNET_TOKEN = os.environ.get("SENNET_TOKEN", "xyz")

# API Endpoints
HUBMAP_SEARCH_API_URL = "https://search.api.hubmapconsortium.org/v3/search"
SENNET_SEARCH_API_URL = "https://search.api.sennetconsortium.org/v3/search"
REFERENCE_ORGANS_URL = "https://apps.humanatlas.io/api/v1/reference-organs"

# Headers for API calls (bearer-token authorization, one per consortium)
hubmap_headers = {"Authorization": f"Bearer {HUBMAP_TOKEN}"}
sennet_headers = {"Authorization": f"Bearer {SENNET_TOKEN}"}

# Output CSV file path
CSV_OUTPUT_PATH = "/Users/dequeue/Desktop/RUI.nosync/hra-registrations/scripts/combined_daily_counts.csv"

# Only the endpoints and output path are echoed; the tokens are never printed.
print("API configuration completed")
print(f"HuBMAP API: {HUBMAP_SEARCH_API_URL}")
print(f"SenNet API: {SENNET_SEARCH_API_URL}")
print(f"Output CSV: {CSV_OUTPUT_PATH}")

## 3. Extract code_to_uberon Mapping from metadata.js

In [None]:
# Path to metadata.js file
metadata_path = "/Users/dequeue/Desktop/RUI.nosync/hra-registrations/scripts/RUI Reporter/metadata.js"

# Compiled once. Matches entries shaped like
#   KEY: { code: 'XX', label: '...', organ_id: '...' }
# and captures (KEY, code, organ_id).
ORGAN_ENTRY_PATTERN = re.compile(
    r"(\w+):\s*{\s*code: '(\w+)',\s*label: '[^']+',\s*organ_id: '([^']+)'"
)


def extract_code_to_uberon(content):
    """Build the organ-code -> organ_id mapping from the text of metadata.js.

    Args:
        content: Full text of the metadata.js file.

    Returns:
        dict mapping each matched entry's key (the identifier before the
        colon) to its ``organ_id`` string. The entry's ``code`` field is
        matched but not used as the key. If a key occurs more than once, the
        last occurrence wins. Returns an empty dict when nothing matches.
    """
    return {
        entry_key: organ_id
        for entry_key, _code, organ_id in ORGAN_ENTRY_PATTERN.findall(content)
    }


# Extract code_to_uberon mapping from metadata.js
code_to_uberon = {}

try:
    # Explicit encoding so decoding does not depend on the platform default.
    with open(metadata_path, "r", encoding="utf-8") as f:
        content = f.read()
except FileNotFoundError:
    print(f"Warning: metadata.js file not found at {metadata_path}")
    print("Using empty mapping - this may result in no supported datasets")
    code_to_uberon = {}
except Exception as e:
    # Best effort: any other read/decoding problem also falls back to an
    # empty mapping so the remaining cells can still run.
    print(f"Error parsing metadata.js: {e}")
    code_to_uberon = {}
else:
    code_to_uberon = extract_code_to_uberon(content)

    print(f"Successfully extracted {len(code_to_uberon)} organ code mappings")
    print("Sample mappings:", dict(list(code_to_uberon.items())[:5]))

## 4. Fetch Reference Organs and Normalize UBERON IDs

In [None]:
# Normalize UBERON identifiers: IRI form -> CURIE form
def iri_to_curie(iri):
    """Return the CURIE (e.g., UBERON:0001234) for an UBERON IRI.

    Values that are empty/None, or that do not contain "obo/UBERON_", are
    returned unchanged.
    """
    if not iri or "obo/UBERON_" not in iri:
        return iri
    # Everything after the last underscore is the numeric UBERON identifier.
    _, _, local_id = iri.rpartition("_")
    return "UBERON:" + local_id

# Fetch all reference organs from HRA API
try:
    # A timeout keeps the notebook (or a scheduled run) from hanging forever
    # if the API never answers; requests has no timeout by default.
    response = requests.get(REFERENCE_ORGANS_URL, timeout=30)
    response.raise_for_status()
    organs = response.json()

    # Extract and normalize UBERON IDs, skipping organs that have no
    # "representation_of" value.
    reference_uberon_ids = {
        iri_to_curie(iri)
        for iri in (organ.get("representation_of") for organ in organs)
        if iri
    }

    print(f"Successfully fetched {len(organs)} reference organs")
    print(f"Extracted {len(reference_uberon_ids)} unique UBERON IDs")
    print("Sample UBERON IDs:", list(reference_uberon_ids)[:5])

except requests.RequestException as e:
    # Network/HTTP failure (including the timeout): continue with an empty set.
    print(f"Error fetching reference organs: {e}")
    reference_uberon_ids = set()
except Exception as e:
    # Anything else (e.g. an unexpected response shape): same fallback.
    print(f"Unexpected error: {e}")
    reference_uberon_ids = set()

## 5. Filter Supported Codes for HuBMAP and SenNet

In [None]:
# Keep only the organ codes whose UBERON ID appears in the reference set
# (order follows code_to_uberon).
filtered_supported_codes = [
    organ_code
    for organ_code in code_to_uberon
    if code_to_uberon[organ_code] in reference_uberon_ids
]

print(f"Total organ codes in metadata: {len(code_to_uberon)}")
print(f"Supported organ codes (in reference anatomy): {len(filtered_supported_codes)}")
print(f"Supported codes: {filtered_supported_codes}")

# The same list is passed to both the HuBMAP and the SenNet queries below,
# on the stated premise that both consortia use the same organ code system.

## 6. Query and Count Datasets for HuBMAP

In [None]:
def query_hubmap_datasets():
    """Query the HuBMAP search API for total, supported and registered dataset counts.

    Sends three count-only searches (``size: 0``) to HUBMAP_SEARCH_API_URL
    using hubmap_headers:

    1. all entities whose entity type is "Dataset";
    2. datasets whose origin-sample organ is in filtered_supported_codes and
       whose origin-sample category is "organ";
    3. as (2), additionally requiring ``ancestors.rui_location`` to exist.

    Returns:
        tuple: ``(total_count, supported_count, registered_count)``. If any
        request or response parsing fails, the error is printed and
        ``(0, 0, 0)`` is returned, so zeros can mean "query failed".
    """
    # Filter clauses shared by the three searches.
    dataset_filter = {"match": {"entity_type.keyword": "Dataset"}}
    supported_filters = [
        dataset_filter,
        {"terms": {"origin_samples.organ.keyword": filtered_supported_codes}},
        {"match": {"origin_samples.sample_category.keyword": "organ"}},
    ]
    registered_filters = supported_filters + [
        {"exists": {"field": "ancestors.rui_location"}},
    ]

    def count_matching(filters):
        """Return the number of documents matching every clause in ``filters``."""
        body = {
            "query": {"bool": {"filter": filters}},
            "size": 0,
            # Elasticsearch stops counting at 10,000 hits unless asked for an
            # exact total. NOTE(review): assumes the search API forwards this
            # body to Elasticsearch/OpenSearch unchanged - confirm.
            "track_total_hits": True,
        }
        # Timeout so a stalled connection cannot hang the run indefinitely.
        resp = requests.post(
            HUBMAP_SEARCH_API_URL, json=body, headers=hubmap_headers, timeout=60
        )
        # Surface HTTP errors (e.g. a rejected token) as themselves instead of
        # as a confusing KeyError on the missing 'hits' key.
        resp.raise_for_status()
        return resp.json()['hits']['total']['value']

    try:
        total_count = count_matching([dataset_filter])
        supported_count = count_matching(supported_filters)
        registered_count = count_matching(registered_filters)

        print("HuBMAP Dataset Counts:")
        print(f"  Total datasets: {total_count}")
        print(f"  Supported datasets: {supported_count}")
        print(f"  Registered datasets: {registered_count}")

        return total_count, supported_count, registered_count

    except Exception as e:
        # Best effort: report the problem and fall back to zeros so the
        # remaining cells still run.
        print(f"Error querying HuBMAP API: {e}")
        return 0, 0, 0

# Execute HuBMAP queries
hubmap_total, hubmap_supported, hubmap_registered = query_hubmap_datasets()

## 7. Query and Count Datasets for SenNet

In [None]:
def query_sennet_datasets():
    """Query the SenNet search API for total, supported and registered dataset counts.

    Sends three count-only searches (``size: 0``) to SENNET_SEARCH_API_URL
    using sennet_headers, with the same query structure as the HuBMAP cell:

    1. all entities whose entity type is "Dataset";
    2. datasets whose origin-sample organ is in filtered_supported_codes and
       whose origin-sample category is "organ";
    3. as (2), additionally requiring ``ancestors.rui_location`` to exist.

    Returns:
        tuple: ``(total_count, supported_count, registered_count)``. If any
        request or response parsing fails, the error is printed and
        ``(0, 0, 0)`` is returned, so zeros can mean "query failed".
    """
    # Filter clauses shared by the three searches.
    dataset_filter = {"match": {"entity_type.keyword": "Dataset"}}
    supported_filters = [
        dataset_filter,
        {"terms": {"origin_samples.organ.keyword": filtered_supported_codes}},
        {"match": {"origin_samples.sample_category.keyword": "organ"}},
    ]
    registered_filters = supported_filters + [
        {"exists": {"field": "ancestors.rui_location"}},
    ]

    def count_matching(filters):
        """Return the number of documents matching every clause in ``filters``."""
        body = {
            "query": {"bool": {"filter": filters}},
            "size": 0,
            # Elasticsearch stops counting at 10,000 hits unless asked for an
            # exact total. NOTE(review): assumes the search API forwards this
            # body to Elasticsearch/OpenSearch unchanged - confirm.
            "track_total_hits": True,
        }
        # Timeout so a stalled connection cannot hang the run indefinitely.
        resp = requests.post(
            SENNET_SEARCH_API_URL, json=body, headers=sennet_headers, timeout=60
        )
        # Surface HTTP errors (e.g. a rejected token) as themselves instead of
        # as a confusing KeyError on the missing 'hits' key.
        resp.raise_for_status()
        return resp.json()['hits']['total']['value']

    try:
        total_count = count_matching([dataset_filter])
        supported_count = count_matching(supported_filters)
        registered_count = count_matching(registered_filters)

        print("SenNet Dataset Counts:")
        print(f"  Total datasets: {total_count}")
        print(f"  Supported datasets: {supported_count}")
        print(f"  Registered datasets: {registered_count}")

        return total_count, supported_count, registered_count

    except Exception as e:
        # Best effort: report the problem and fall back to zeros so the
        # remaining cells still run.
        print(f"Error querying SenNet API: {e}")
        print("Note: Make sure you have a valid SenNet API token")
        return 0, 0, 0

# Execute SenNet queries
sennet_total, sennet_supported, sennet_registered = query_sennet_datasets()

## 8. Write Counts to CSV with Date as Column

In [None]:
def write_counts_to_csv(counts=None, csv_path=None, current_date=None):
    """Record one day's metric counts as a date column in the tracking CSV.

    The CSV holds one row per metric (first cell is the metric name) and one
    column per date; the header row starts with 'Metric'. The file is created
    if it does not exist. If a column for ``current_date`` already exists
    (e.g. the cell was re-run on the same day), its values are overwritten
    instead of adding a duplicate column. Metrics not yet present get a new
    row; cells with no value are left empty.

    Args:
        counts: Optional mapping of metric name -> value. Defaults to the six
            HuBMAP/SenNet counts held in the module-level variables
            (hubmap_total, ..., sennet_registered).
        csv_path: Optional path of the CSV file. Defaults to CSV_OUTPUT_PATH.
        current_date: Optional column header for this run. Defaults to
            today's local date formatted as YYYY-MM-DD.

    Returns:
        None. The CSV is rewritten on disk and a summary is printed.
    """
    if current_date is None:
        current_date = datetime.now().strftime("%Y-%m-%d")
    if csv_path is None:
        csv_path = CSV_OUTPUT_PATH

    # Prepare data: each row is a metric, each column is a date
    if counts is None:
        new_data = {
            'HuBMAP Total Datasets': hubmap_total,
            'HuBMAP Supported Datasets': hubmap_supported,
            'HuBMAP Registered Datasets': hubmap_registered,
            'SenNet Total Datasets': sennet_total,
            'SenNet Supported Datasets': sennet_supported,
            'SenNet Registered Datasets': sennet_registered
        }
    else:
        new_data = dict(counts)

    file_exists = os.path.isfile(csv_path)
    rows = []
    if file_exists:
        with open(csv_path, 'r', newline='', encoding='utf-8') as f:
            rows = list(csv.reader(f))

    # A missing or empty file (or a blank header line) starts from a bare header.
    if not rows:
        rows.append(['Metric'])
    header = rows[0]
    if not header:
        header.append('Metric')

    # Reuse today's column when it already exists; otherwise add it at the end.
    matching_columns = [
        i for i, label in enumerate(header) if i > 0 and label == current_date
    ]
    if matching_columns:
        date_col = matching_columns[-1]
    else:
        header.append(current_date)
        date_col = len(header) - 1

    # Pad short rows BEFORE writing values, so each value lands under the
    # right date even if an earlier row was missing cells.
    for row in rows[1:]:
        if len(row) < len(header):
            row.extend([''] * (len(header) - len(row)))

    # Metric name -> row index (the header row and blank names are skipped).
    metric_to_row = {
        row[0]: i for i, row in enumerate(rows) if i > 0 and row[0]
    }

    for metric, value in new_data.items():
        if metric in metric_to_row:
            rows[metric_to_row[metric]][date_col] = value
        else:
            # New metric: empty cells for every earlier date.
            new_row = [metric] + [''] * (len(header) - 1)
            new_row[date_col] = value
            rows.append(new_row)

    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)

    if file_exists:
        print(f"Updated existing CSV file: {csv_path}")
    else:
        print(f"Created new CSV file: {csv_path}")

    # Display summary
    print(f"\nData added for {current_date}:")
    for metric, value in new_data.items():
        print(f"  {metric}: {value}")

# Execute CSV writing: record the HuBMAP and SenNet counts collected by the
# query cells above under today's date in the CSV at CSV_OUTPUT_PATH.
write_counts_to_csv()

## 9. Schedule Daily Execution

### Option 1: Using cron (macOS/Linux)

To schedule this notebook to run daily, complete both of the following steps:

1. **Convert to Python script**: Export this notebook as a `.py` file
2. **Add to crontab**: 
   ```bash
   # Edit crontab
   crontab -e
   
   # Add this line to run daily at 9 AM
   0 9 * * * /usr/bin/python3 /Users/dequeue/Desktop/RUI.nosync/hra-registrations/scripts/RUI\ Reporter/combined_daily_tracker.py
   ```

### Option 2: Using Python scheduler (within this notebook)

In [None]:
# Option 2: Run scheduler within Python (uncomment to use)
# Note: This requires keeping the notebook/script running continuously.
# The code below is kept inside a string so that it does not run; it imports
# the third-party `schedule` package, which must be installed before use.

"""
import schedule
import time

def daily_data_collection():
    # write_counts_to_csv() reads the module-level count variables, so the
    # fresh results must be stored globally; assigning them to function
    # locals would make every scheduled run write the old values again.
    global hubmap_total, hubmap_supported, hubmap_registered
    global sennet_total, sennet_supported, sennet_registered

    print(f"Starting daily data collection at {datetime.now()}")
    
    # Re-run all the data collection functions
    hubmap_total, hubmap_supported, hubmap_registered = query_hubmap_datasets()
    sennet_total, sennet_supported, sennet_registered = query_sennet_datasets() 
    write_counts_to_csv()
    
    print("Daily data collection completed")

# Schedule the job for 9 AM daily
schedule.every().day.at("09:00").do(daily_data_collection)

# Keep the script running (uncomment next lines to activate)
# print("Scheduler started. Press Ctrl+C to stop.")
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # Check every minute
"""

print("Scheduling options provided above.")
print("Choose Option 1 (cron) for production use, or Option 2 for testing.")

## 10. Summary and Data Visualization

View the generated CSV file to track trends over time:

In [None]:
# Show the CSV as it currently stands, plus coverage for the newest column
if not os.path.exists(CSV_OUTPUT_PATH):
    print(f"CSV file not found at {CSV_OUTPUT_PATH}")
    print("Run the data collection cells above to generate the file.")
else:
    print(f"Current contents of {CSV_OUTPUT_PATH}:")
    print("=" * 80)

    # Prefer a DataFrame view (metric names become the index); fall back to
    # raw text if anything in this section fails.
    try:
        df = pd.read_csv(CSV_OUTPUT_PATH, index_col=0)
        print(df)

        metric_count, date_count = df.shape
        has_dates = date_count > 0
        newest_label = df.columns[-1] if has_dates else 'None'

        print(f"\nCSV file structure:")
        print(f"  Rows (metrics): {metric_count}")
        print(f"  Columns (dates): {date_count}")
        print(f"  Latest data: {newest_label}")

        # Registration percentage = registered / supported, newest column only
        if has_dates:
            latest_col = df.columns[-1]
            print(f"\nRegistration Coverage for {latest_col}:")

            consortium_rows = (
                ("HuBMAP", 'HuBMAP Supported Datasets', 'HuBMAP Registered Datasets'),
                ("SenNet", 'SenNet Supported Datasets', 'SenNet Registered Datasets'),
            )
            for consortium, supported_key, registered_key in consortium_rows:
                supported_latest = df.loc[supported_key, latest_col]
                registered_latest = df.loc[registered_key, latest_col]
                # Nothing is printed when there are no supported datasets,
                # which also avoids dividing by zero.
                if supported_latest > 0:
                    coverage = (registered_latest / supported_latest) * 100
                    print(f"  {consortium}: {registered_latest}/{supported_latest} = {coverage:.1f}%")

    except Exception as e:
        print(f"Error reading CSV as DataFrame: {e}")
        # Fallback to simple text display
        with open(CSV_OUTPUT_PATH, 'r') as f:
            print(f.read())