# Test Updated Output Checking Logic for TC-Risk Pipeline

This notebook tests the improved output-checking functions:
1. Level 3 must be finished (both `env_wnd_*.nc` and `thermo_*.nc` exist)
2. Level 4 is only checked within finished Level 3 runs
3. Level 4 completeness requires at least NUM_DRAWS + 1 `.nc` files per basin folder (251 files when NUM_DRAWS is 250)
4. Report file counts per basin/model/variant/scenario/time_period

In [1]:
import pandas as pd
from pathlib import Path
import sys

# Add the source directory to path.
# The entry is moved to the front and de-duplicated, so re-running this cell
# does not keep prepending the same directory to sys.path.
# NOTE(review): this is an absolute, user-specific checkout path — confirm it
# resolves on the host where the notebook is executed.
SRC_DIR = '/ihme/homes/bcreiner/repos/idd-climate-models/src'
sys.path[:] = [SRC_DIR] + [entry for entry in sys.path if entry != SRC_DIR]

import idd_climate_models.constants as rfc

# Import constants
TC_RISK_OUTPUT_PATH = rfc.TC_RISK_OUTPUT_PATH  # root under which the checks below look for outputs
NUM_DRAWS = rfc.NUM_DRAWS                      # Level 4 needs NUM_DRAWS + 1 files per basin
TIME_BINS_DF_PATH = rfc.TIME_BINS_DF_PATH      # CSV read in the next cell
BASINS = ['EP', 'NA', 'NI', 'SI', 'AU', 'SP', 'WP']
DATA_SOURCE = 'cmip6'

print(f"TC_RISK_OUTPUT_PATH: {TC_RISK_OUTPUT_PATH}")
print(f"NUM_DRAWS: {NUM_DRAWS}")
print(f"NUM_DRAWS + 1 (required for Level 4): {NUM_DRAWS + 1}")
print(f"TIME_BINS_DF_PATH: {TIME_BINS_DF_PATH}")

TC_RISK_OUTPUT_PATH: /mnt/team/rapidresponse/pub/tropical-storms/tc_risk/output
NUM_DRAWS: 250
NUM_DRAWS + 1 (required for Level 4): 251
TIME_BINS_DF_PATH: /mnt/team/rapidresponse/pub/tropical-storms/tempestextremes/outputs/cmip6/time_bins.csv


## Load time bins and build task list

In [2]:
# Load time bins, keep the BayesPoisson method only, and add a
# "<start_year>-<end_year>" time_period column.
# Everything is done in one chain that returns a new frame: assigning a column
# on a boolean-filtered slice (as a separate statement) can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
time_bins_df = (
    pd.read_csv(TIME_BINS_DF_PATH)
    .loc[lambda df: df['method'] == 'BayesPoisson']
    .assign(
        time_period=lambda df: df['start_year'].astype(str) + '-' + df['end_year'].astype(str)
    )
)

# Get unique combinations (the original row index is kept, as before)
unique_time_bins = time_bins_df[['model', 'variant', 'scenario', 'time_period', 'start_year', 'end_year']].drop_duplicates()

print(f"Total unique (model, variant, scenario, time_period) combinations: {len(unique_time_bins)}")
print(f"\nFirst few rows:")
print(unique_time_bins.head(10))

Total unique (model, variant, scenario, time_period) combinations: 130

First few rows:
        model   variant    scenario time_period  start_year  end_year
0   CMCC-ESM2  r1i1p1f1  historical   1970-1986        1970      1986
8   CMCC-ESM2  r1i1p1f1  historical   1987-2001        1987      2001
16  CMCC-ESM2  r1i1p1f1  historical   2002-2014        2002      2014
24  CMCC-ESM2  r1i1p1f1      ssp126   2015-2026        2015      2026
32  CMCC-ESM2  r1i1p1f1      ssp126   2027-2036        2027      2036
40  CMCC-ESM2  r1i1p1f1      ssp126   2037-2046        2037      2046
48  CMCC-ESM2  r1i1p1f1      ssp126   2047-2058        2047      2058
56  CMCC-ESM2  r1i1p1f1      ssp126   2059-2076        2059      2076
64  CMCC-ESM2  r1i1p1f1      ssp126   2077-2100        2077      2100
72  CMCC-ESM2  r1i1p1f1      ssp245   2015-2026        2015      2026


## Define improved output checking functions

In [3]:
def check_level3_output_finished(model, variant, scenario, time_period, data_source="cmip6"):
    """
    Report whether the Level 3 (global TC-risk) run for one combination is done.

    The run directory is
    TC_RISK_OUTPUT_PATH / data_source / model / variant / scenario / time_period.

    Returns:
        True only when that directory exists and contains at least one
        ``env_wnd_*.nc`` file and at least one ``thermo_*.nc`` file;
        False in every other case (including a missing directory).
    """
    run_dir = TC_RISK_OUTPUT_PATH / data_source / model / variant / scenario / time_period
    if not run_dir.exists():
        return False

    # One list of matches per required output; an empty list means that output is missing
    required_patterns = ("env_wnd_*.nc", "thermo_*.nc")
    matches_per_pattern = [list(run_dir.glob(pattern)) for pattern in required_patterns]
    return all(matches_per_pattern)

def get_level4_basin_file_count(model, variant, scenario, time_period, basin, data_source="cmip6"):
    """
    Count the ``*.nc`` files directly inside one Level 4 basin output folder.

    The folder is
    TC_RISK_OUTPUT_PATH / data_source / model / variant / scenario / time_period / basin.

    Returns:
        The number of matching files (0 for an existing but empty folder),
        or -1 when the folder does not exist.
    """
    basin_dir = TC_RISK_OUTPUT_PATH / data_source / model / variant / scenario / time_period / basin
    if basin_dir.exists():
        return sum(1 for _ in basin_dir.glob("*.nc"))
    # Sentinel that lets callers tell "no folder" apart from "empty folder"
    return -1

def check_level4_output_complete(model, variant, scenario, time_period, basin, data_source="cmip6"):
    """
    Report whether the Level 4 (basin TC-risk) output for one basin is complete.

    Returns:
        True when get_level4_basin_file_count reports at least NUM_DRAWS + 1
        files for the basin folder; False otherwise (a missing folder is
        reported as -1 and is therefore incomplete).
    """
    required_files = NUM_DRAWS + 1
    found_files = get_level4_basin_file_count(
        model, variant, scenario, time_period, basin, data_source
    )
    return found_files >= required_files

# Visible confirmation that this cell ran and the checking functions are defined
print("Output checking functions defined successfully")

Output checking functions defined successfully


## Build all tasks from time bins

In [4]:
# Build all tasks from time bins: one dict per unique combination, holding only
# the four fields that identify an output directory
task_fields = ['model', 'variant', 'scenario', 'time_period']
all_tasks = unique_time_bins[task_fields].to_dict('records')

print(f"Total tasks: {len(all_tasks)}")

Total tasks: 130


## TEST 1: Check Level 3 Completion (IMPROVED)

In [5]:
# Partition every task by its Level 3 status - a task counts as finished only
# when BOTH env_wnd and thermo files are present
level3_finished = []
level3_not_finished = []

for task in all_tasks:
    model, variant, scenario, time_period = (
        task['model'], task['variant'], task['scenario'], task['time_period']
    )
    is_finished = check_level3_output_finished(model, variant, scenario, time_period, DATA_SOURCE)
    destination = level3_finished if is_finished else level3_not_finished
    destination.append(task)

print(f"\n=== LEVEL 3 OUTPUT STATUS ===")
print(f"Finished (ready for L4):   {len(level3_finished)}")
print(f"Not finished (skip L4):    {len(level3_not_finished)}")
print(f"Total:                     {len(all_tasks)}")

# Preview up to ten tasks from each group; an empty group prints nothing
previews = (
    (f"\nFirst 10 finished Level 3 tasks (L4 can run on these):", level3_finished),
    (f"\nFirst 10 NOT finished Level 3 tasks (L4 will NOT run on these):", level3_not_finished),
)
for heading, group in previews:
    if not group:
        continue
    print(heading)
    for task in group[:10]:
        print(f"  {task['model']}/{task['variant']}/{task['scenario']}/{task['time_period']}")


=== LEVEL 3 OUTPUT STATUS ===
Finished (ready for L4):   108
Not finished (skip L4):    22
Total:                     130

First 10 finished Level 3 tasks (L4 can run on these):
  CMCC-ESM2/r1i1p1f1/historical/1970-1986
  CMCC-ESM2/r1i1p1f1/historical/1987-2001
  CMCC-ESM2/r1i1p1f1/historical/2002-2014
  CMCC-ESM2/r1i1p1f1/ssp126/2015-2026
  CMCC-ESM2/r1i1p1f1/ssp126/2027-2036
  CMCC-ESM2/r1i1p1f1/ssp126/2037-2046
  CMCC-ESM2/r1i1p1f1/ssp126/2047-2058
  CMCC-ESM2/r1i1p1f1/ssp126/2059-2076
  CMCC-ESM2/r1i1p1f1/ssp245/2015-2026
  CMCC-ESM2/r1i1p1f1/ssp245/2027-2036

First 10 NOT finished Level 3 tasks (L4 will NOT run on these):
  CMCC-ESM2/r1i1p1f1/ssp126/2077-2100
  MRI-ESM2-0/r1i1p1f1/ssp126/2015-2042
  MRI-ESM2-0/r1i1p1f1/ssp126/2043-2071
  MRI-ESM2-0/r1i1p1f1/ssp126/2072-2100
  MRI-ESM2-0/r2i1p1f1/historical/1970-1999
  MRI-ESM2-0/r2i1p1f1/ssp126/2029-2055
  MRI-ESM2-0/r2i1p1f1/ssp126/2071-2100
  MRI-ESM2-0/r2i1p1f1/ssp245/2015-2044
  MRI-ESM2-0/r2i1p1f1/ssp245/2073-2100
  MRI-ESM2

## TEST 2: Check Level 4 Basin Completeness (IMPROVED)
Only for tasks whose Level 3 run finished: check whether every basin folder holds at least NUM_DRAWS + 1 `.nc` files.

In [6]:
# Check Level 4 outputs ONLY for finished Level 3 tasks
level4_tasks_with_incomplete_basins = []
level4_fully_complete = []

for task in level3_finished:
    # File count per basin; a missing basin folder is reported as -1
    basin_file_counts = {
        basin: get_level4_basin_file_count(
            task['model'], task['variant'], task['scenario'], task['time_period'],
            basin, DATA_SOURCE,
        )
        for basin in BASINS
    }

    # Incomplete if fewer than NUM_DRAWS + 1 files (-1 falls below the threshold too)
    incomplete_basins = [
        basin for basin in BASINS if basin_file_counts[basin] < NUM_DRAWS + 1
    ]

    if not incomplete_basins:
        level4_fully_complete.append(task)
        continue

    level4_tasks_with_incomplete_basins.append({
        **task,
        'incomplete_basins': incomplete_basins,
        'basin_file_counts': basin_file_counts,
    })

print(f"\n=== LEVEL 4 OUTPUT STATUS (Only for finished Level 3) ===")
print(f"Combos with incomplete basins: {len(level4_tasks_with_incomplete_basins)}")
print(f"Combos fully complete:         {len(level4_fully_complete)}")
print(f"Total checked (L3 finished):   {len(level3_finished)}")
print(f"\nRequired files per basin: {NUM_DRAWS + 1}")


=== LEVEL 4 OUTPUT STATUS (Only for finished Level 3) ===
Combos with incomplete basins: 108
Combos fully complete:         0
Total checked (L3 finished):   108

Required files per basin: 251


In [8]:
# Preview only the first few entries: the full list has one nested dict per
# combination and floods the cell output when displayed whole
level4_tasks_with_incomplete_basins[:3]

[{'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'time_period': '1970-1986',
  'incomplete_basins': ['EP', 'NA', 'NI', 'SI', 'AU', 'SP', 'WP'],
  'basin_file_counts': {'EP': 37,
   'NA': 27,
   'NI': 53,
   'SI': 38,
   'AU': 41,
   'SP': 54,
   'WP': 37}},
 {'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'time_period': '1987-2001',
  'incomplete_basins': ['EP', 'NA', 'NI', 'SI', 'AU', 'SP', 'WP'],
  'basin_file_counts': {'EP': 42,
   'NA': 77,
   'NI': 54,
   'SI': 49,
   'AU': 47,
   'SP': 49,
   'WP': 38}},
 {'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'time_period': '2002-2014',
  'incomplete_basins': ['EP', 'NA', 'NI', 'SI', 'AU', 'SP', 'WP'],
  'basin_file_counts': {'EP': 46,
   'NA': 55,
   'NI': 53,
   'SI': 48,
   'AU': 46,
   'SP': 70,
   'WP': 46}},
 {'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'ssp126',
  'time_period': '2015-2026',
  'incomplete_basins': ['EP', 'NA

## TEST 3: Detailed Level 4 Basin File Count Analysis

In [7]:
# Per-basin file counts for the first 15 combinations that still have incomplete basins
if not level4_tasks_with_incomplete_basins:
    print("\nNo incomplete Level 4 tasks found!")
else:
    print(f"\nDetailed Level 4 status (showing incomplete basins):")
    print(f"Displaying first 15 combos with incomplete basins:\n")

    for task in level4_tasks_with_incomplete_basins[:15]:
        print(f"  {task['model']}/{task['variant']}/{task['scenario']}/{task['time_period']}")
        for basin in BASINS:
            count = task['basin_file_counts'][basin]

            # -1 is the "folder does not exist" sentinel, so there is no count to show
            if count == -1:
                print(f"    {basin}: --- (folder doesn't exist) ✗ MISSING")
                continue

            shortfall = NUM_DRAWS + 1 - count
            if shortfall <= 0:
                status = "✓ COMPLETE"
            else:
                status = f"✗ INCOMPLETE ({shortfall} more needed)"
            print(f"    {basin}: {count:3d}/{NUM_DRAWS + 1} files {status}")
        print()


Detailed Level 4 status (showing incomplete basins):
Displaying first 15 combos with incomplete basins:

  CMCC-ESM2/r1i1p1f1/historical/1970-1986
    EP:  37/251 files ✗ INCOMPLETE (214 more needed)
    NA:  27/251 files ✗ INCOMPLETE (224 more needed)
    NI:  53/251 files ✗ INCOMPLETE (198 more needed)
    SI:  38/251 files ✗ INCOMPLETE (213 more needed)
    AU:  41/251 files ✗ INCOMPLETE (210 more needed)
    SP:  54/251 files ✗ INCOMPLETE (197 more needed)
    WP:  37/251 files ✗ INCOMPLETE (214 more needed)

  CMCC-ESM2/r1i1p1f1/historical/1987-2001
    EP:  42/251 files ✗ INCOMPLETE (209 more needed)
    NA:  77/251 files ✗ INCOMPLETE (174 more needed)
    NI:  54/251 files ✗ INCOMPLETE (197 more needed)
    SI:  49/251 files ✗ INCOMPLETE (202 more needed)
    AU:  47/251 files ✗ INCOMPLETE (204 more needed)
    SP:  49/251 files ✗ INCOMPLETE (202 more needed)
    WP:  38/251 files ✗ INCOMPLETE (213 more needed)

  CMCC-ESM2/r1i1p1f1/historical/2002-2014
    EP:  46/251 files ✗ 

## TEST 4: Summary Statistics

In [None]:
# Number of individual (combination, basin) tasks that still need to be run
total_incomplete_basin_tasks = sum(
    len(task['incomplete_basins']) for task in level4_tasks_with_incomplete_basins
)

banner = "="*80
print("\n" + banner)
print("=== COMPREHENSIVE SUMMARY ===")
print(banner)

print(f"\nPipeline Status:")
print(f"\n  Level 3 (Global TC-Risk):")
print(f"    - Finished (ready for L4):    {len(level3_finished)}")
print(f"    - Not finished (skip L4):     {len(level3_not_finished)}")
print(f"    - Total:                      {len(all_tasks)}")

print(f"\n  Level 4 (Basin TC-Risk):")
print(f"    - Combos with incomplete basins: {len(level4_tasks_with_incomplete_basins)}")
print(f"    - Total individual basin tasks:  {total_incomplete_basin_tasks}")
print(f"    - Combos fully complete:         {len(level4_fully_complete)}")

# Distribution of file counts, taken from the combinations with incomplete basins
print(f"\n  Level 4 Basin File Count Distribution:")
recorded_counts = [
    count
    for task in level4_tasks_with_incomplete_basins
    for count in task['basin_file_counts'].values()
]
# Drop the -1 "folder missing" sentinel before computing min / max / mean
all_file_counts = [count for count in recorded_counts if count >= 0]

if all_file_counts:
    print(f"    - Min files in any basin:     {min(all_file_counts)}")
    print(f"    - Max files in any basin:     {max(all_file_counts)}")
    print(f"    - Average files per basin:    {sum(all_file_counts) / len(all_file_counts):.1f}")
    print(f"    - Required for completion:    {NUM_DRAWS + 1}")

# Empty basins (no folder) are exactly the entries recorded as -1
empty_basins = recorded_counts.count(-1)
print(f"    - Basins with no folder:      {empty_basins}")

# Rough estimate: 120 minutes per outstanding basin task, summed over all of them
estimated_minutes = total_incomplete_basin_tasks * 120
print(f"\nEstimated Runtime (if running):")
print(f"  - Level 4: ~{estimated_minutes} minutes (assuming 120 min per basin task)")
print(f"            = ~{estimated_minutes / 60:.1f} hours")

## TEST 5: Verify Path Structure and File Details

In [None]:
# Spot-check one finished Level 3 directory and one incomplete Level 4 basin directory
print("\n=== SAMPLE PATHS ===")

if level3_finished:
    sample_task = level3_finished[0]
    sample_path = TC_RISK_OUTPUT_PATH / DATA_SOURCE / sample_task['model'] / sample_task['variant'] / sample_task['scenario'] / sample_task['time_period']
    print(f"\nSample Level 3 FINISHED path:")
    print(f"  {sample_path}")
    print(f"  Exists: {sample_path.exists()}")
    if sample_path.exists():
        # Count all NetCDF files, then the two outputs the Level 3 check requires
        for label, pattern in (
            ("Total files", "*.nc"),
            ("env_wnd_*.nc files", "env_wnd_*.nc"),
            ("thermo_*.nc files", "thermo_*.nc"),
        ):
            print(f"  {label}: {len(list(sample_path.glob(pattern)))}")

if level4_tasks_with_incomplete_basins:
    sample_task = level4_tasks_with_incomplete_basins[0]
    sample_basin = sample_task['incomplete_basins'][0]
    sample_path = TC_RISK_OUTPUT_PATH / DATA_SOURCE / sample_task['model'] / sample_task['variant'] / sample_task['scenario'] / sample_task['time_period'] / sample_basin
    print(f"\nSample Level 4 INCOMPLETE basin path:")
    print(f"  {sample_path}")
    print(f"  Exists: {sample_path.exists()}")
    if sample_path.exists():
        # Sorted once so the three previewed names are the first in path order
        files = sorted(sample_path.glob("*.nc"))
        print(f"  File count: {len(files)}/{NUM_DRAWS + 1}")
        print(f"  Missing: {NUM_DRAWS + 1 - len(files)} files")
        if files:
            print(f"  Sample files:")
            for file in files[:3]:
                print(f"    - {file.name}")
            remaining = len(files) - 3
            if remaining > 0:
                print(f"    ... and {remaining} more")