In [1]:
import sys
import getpass
import uuid
import os
from jobmon.client.tool import Tool # type: ignore
from pathlib import Path
from typing import Dict, Any

import idd_climate_models.constants as rfc
from idd_climate_models.dictionary_utils import parse_results
from idd_climate_models.io_compare_utils import compare_model_validation

# --- Locations, all read from the shared constants module ----------------------
repo_name, package_name = rfc.repo_name, rfc.package_name
DATA_DIR, PROCESSED_DATA_PATH = rfc.RAW_DATA_PATH, rfc.PROCESSED_DATA_PATH
# <REPO_ROOT>/<repo_name>/src/<package_name>/clean_and_split
SCRIPT_ROOT = rfc.REPO_ROOT / repo_name / "src" / package_name / "clean_and_split"

# --- Which input/output data sets this notebook compares ------------------------
INPUT_DATA_TYPE, INPUT_IO_TYPE = "data", "raw"
OUTPUT_DATA_TYPE, OUTPUT_IO_TYPE = "data", "processed"
DATA_SOURCE = "cmip6"

# Not read by any cell visible here — presumably a dry-run switch; TODO confirm.
TEST_MODE = False

def get_file_size_gb(file_path: Path) -> float:
    """Return the size of the file at ``file_path`` in GiB (bytes / 1024**3).

    Raises:
        OSError: if the file does not exist or cannot be stat'ed.
    """
    bytes_per_gib = 1 << 30  # same value as 1024 ** 3
    return os.stat(file_path).st_size / bytes_per_gib

def get_resource_tier(file_size_gb: float, REQUIRED_MEM_FACTOR: float = 4.0,
                      MIN_MEM_GB: float = 8.0, MAX_MEM_GB: float = 64.0) -> Dict[str, Any]:
    """Choose memory, core and runtime requests for a file of the given size.

    Args:
        file_size_gb: Size of the input file in GB.
        REQUIRED_MEM_FACTOR: Multiplier applied to the file size to estimate memory.
        MIN_MEM_GB: Lower bound on the memory request, in GB.
        MAX_MEM_GB: Upper bound on the memory request, in GB.

    Returns:
        A dict with keys ``"memory"`` (string such as ``"8G"``), ``"cores"`` (int)
        and ``"runtime"`` (string such as ``"10m"``).
    """
    # Estimate: truncated (size * factor) plus a fixed 2 GB of headroom.
    required_mem_gb = int(file_size_gb * REQUIRED_MEM_FACTOR) + 2
    clamped_mem_gb = min(MAX_MEM_GB, max(MIN_MEM_GB, required_mem_gb))

    # The bounds default to floats, so a clamped value used to be rendered as
    # "8.0G"/"64.0G" while an unclamped one was rendered as "10G". Render whole
    # numbers without a decimal part so the memory string has one consistent form.
    if float(clamped_mem_gb).is_integer():
        clamped_mem_gb = int(clamped_mem_gb)
    memory = f"{clamped_mem_gb}G"

    # Runtime/core tiers are keyed on the raw file size, not on the memory estimate.
    if file_size_gb < 1.0:
        runtime, cores = "10m", 4
    elif file_size_gb < 5.0:
        runtime, cores = "20m", 4
    else:
        runtime, cores = "30m", 8

    return {
        "memory": memory,
        "cores": cores,
        "runtime": runtime
    }

# ================================================================================

# Validate the input and output data sets with the unified helper and collect
# everything it reports (verbose log plus the missing-input report).
validation_info = compare_model_validation(
    data_source=DATA_SOURCE,
    input_data_type=INPUT_DATA_TYPE,
    input_io_type=INPUT_IO_TYPE,
    output_data_type=OUTPUT_DATA_TYPE,
    output_io_type=OUTPUT_IO_TYPE,
    produce_missing_input_report=True,
    verbose=True,
)

# Pull the pieces used by later cells out of the returned mapping.
(
    model_variants_to_process,
    input_complete_model_variants,
    output_complete_model_variants,
) = (
    validation_info[key]
    for key in (
        "model_variants_to_process",
        "input_complete_model_variants",
        "output_complete_model_variants",
    )
)
input_results_for_tasks = validation_info["model_variants_to_process_dict"]["validation_results"]

# Expand the to-process validation dict into the list of tasks to run.
tasks_to_run = parse_results(
    detail='all',
    validation_dict=validation_info["model_variants_to_process_dict"],
)

# Model/variants complete on the output side but not on the input side.
processed_only_model_variants = output_complete_model_variants - input_complete_model_variants


STEP 1.1: Input Data - Validating data raw models (Strict Check: False).
[DATA] Starting full validation for: cmip6 (Strict Check: False)...
[1/4] Validating CMCC-ESM2
  ✓ CMCC-ESM2/r1i1p1f1: Complete
[2/4] Validating EC-Earth3
  ✓ EC-Earth3/r1i1p1f1: Complete
[3/4] Validating MPI-ESM1-2-HR
  ✓ MPI-ESM1-2-HR/r1i1p1f1: Complete
[4/4] Validating MRI-ESM2-0
  ✓ MRI-ESM2-0/r1i1p1f1: Complete
  ✓ MRI-ESM2-0/r2i1p1f1: Complete
  ✓ MRI-ESM2-0/r3i1p1f1: Complete
  ✓ MRI-ESM2-0/r4i1p1f1: Complete
  ✓ MRI-ESM2-0/r5i1p1f1: Complete

Validation Summary: 8 complete, 0 incomplete model/variant combinations

Validation complete for: data, raw, cmip6
Summary: 4/4 models complete. Parsed log (up to 'grid') written to /mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/validation_log.json

STEP 1.2: Output Data - Validating data processed models (Check for already completed work).

[DATA] Starting full validation for: cmip6 (Strict Check: False)...
[1/1] Validating MRI-ESM2-0
  ✗ MRI-ESM2-0/r4i1

In [3]:
import pandas as pd
# Load the CSV stored at rfc.TIME_BINS_WIDE_DF_PATH into a DataFrame.
# NOTE(review): `rfc` is imported in the first cell, so this cell depends on
# that cell having been run in the same kernel session.
df = pd.read_csv(rfc.TIME_BINS_WIDE_DF_PATH)

In [5]:
# Show only the columns whose name ends in '_int' (named like '{basin}_int').
int_columns = list(filter(lambda name: name.endswith('_int'), df.columns))
print(df[int_columns])

     AU_int  EP_int  GL_int  NA_int  NI_int  SI_int  SP_int  WP_int
0        13      24      92      17       4      12       3      27
1        12      20      82       0       5       9       5      25
2        13      19      82       3       5      10       2      26
3        13      21      90      20       5      11       4      26
4        14      21      90       7       5      11       6      26
..      ...     ...     ...     ...     ...     ...     ...     ...
125      11      24      77       4       2       8       3      23
126      10      24      83       9       5      10       2      25
127      10      24      80       6       4       7       2      26
128      11      19      72       6       2       6       4      23
129      10      20      70       2       3       7       5      20

[130 rows x 8 columns]


In [14]:
# Warn about model/variants that are complete in the processed output but not in
# the raw input.
if processed_only_model_variants:
    print("\n⚠️ WARNING: Found PROCESSED models that are NOT complete in RAW data:")
    print(f"   {processed_only_model_variants}")
    print("   These models may have been processed from incomplete source data.")

# Summary of what was found and how much work remains.
print("\n" + "=" * 80)
print(f"SUMMARY: {len(input_complete_model_variants)} complete RAW models found.")
print(f"         {len(output_complete_model_variants)} complete PROCESSED models found.")
print(f"         {len(model_variants_to_process)} unique models require processing.")
print(f"         Resulting in {len(tasks_to_run)} files to process.")
print("=" * 80)

if not tasks_to_run:
    print("\n✅ Execution halted: All required files have either been processed or are incomplete in the raw data.")
    # `exit` is the interactive-shell helper (installed by `site`/IPython), not a
    # program API; in a Jupyter kernel it can shut the kernel down and discard the
    # notebook state. Raising SystemExit stops execution with status 0 without
    # relying on that helper.
    raise SystemExit(0)


SUMMARY: 7 complete RAW models found.
         0 complete PROCESSED models found.
         7 unique models require processing.
         Resulting in 1696 files to process.


In [9]:
# Preview the first few task records only: the full list (1696 entries in the
# recorded run) is too long to display usefully and bloats the saved notebook.
tasks_to_run[:3]


[{'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'hus',
  'grid': 'gn',
  'frequency': 'Amon',
  'file_path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/CMCC-ESM2/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_CMCC-ESM2_historical_r1i1p1f1_gn_185001-187412.nc',
  'fill_required': False},
 {'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'hus',
  'grid': 'gn',
  'frequency': 'Amon',
  'file_path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/CMCC-ESM2/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_CMCC-ESM2_historical_r1i1p1f1_gn_187501-189912.nc',
  'fill_required': False},
 {'model': 'CMCC-ESM2',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'hus',
  'grid': 'gn',
  'frequency': 'Amon',
  'file_path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/CMCC-ESM2/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_CMCC-ESM2_historical_r1i1p1f1_gn_190001-192412.nc',
 

In [None]:

    
    





# Reduce every model's validation results to its incomplete parts and keep only
# the models for which something remains.
# NOTE(review): `filter_to_incomplete_only` is not defined in the cells visible
# here — confirm it is defined earlier in the notebook.
results = validation_info['input_validation_dict']['validation_results']
incomplete_only = {
    model_name: filtered
    for model_name, filtered in (
        (name, filter_to_incomplete_only(data)) for name, data in results.items()
    )
    if filtered is not None
}

incomplete_only

{'EC-Earth3': {'variant': {'r1i1p1f1': {'scenario': {'ssp126': {'variable': {'ua': {'complete': False,
        'issues': ["Grid Priority: Found multiple grids. Selecting highest priority: 'gr'"],
        'grid': {'gr': {'complete': False,
          'issues': [],
          'frequency': {'day': {'complete': False,
            'files': [{'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/EC-Earth3/r1i1p1f1/ssp126/ua/gr/day/ua_day_EC-Earth3_ssp126_r1i1p1f1_gr_20220101-20221231.nc',
              'fill_required': False},
             {'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/EC-Earth3/r1i1p1f1/ssp126/ua/gr/day/ua_day_EC-Earth3_ssp126_r1i1p1f1_gr_20250101-20251231.nc',
              'fill_required': False},
             {'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/EC-Earth3/r1i1p1f1/ssp126/ua/gr/day/ua_day_EC-Earth3_ssp126_r1i1p1f1_gr_20270101-20271231.nc',
              'fill_required': False},
             {'path': '/mnt/t

In [None]:
from datetime import datetime, timedelta
import re
from pathlib import Path
import idd_climate_models.constants as rfc
from idd_climate_models.validation_functions import (
    int_to_date, 
    is_monthly, 
    extract_date_ranges
)

# Year bounds, taken from the shared constants module.
START_YEAR, END_YEAR = rfc.START_YEAR, rfc.END_YEAR




import pandas as pd

import pandas as pd



# Analyse the incomplete entries and tabulate the resulting records.
# NOTE(review): `analyze_incomplete_variables` is not defined in the cells visible
# here — confirm it is defined earlier in the notebook.
records = analyze_incomplete_variables(incomplete_only)
missing_years_df = pd.DataFrame(records)

# Write the table (without the index) under DATA_DIR/DATA_SOURCE, then set the
# file mode to 0o775.
output_path = DATA_DIR / DATA_SOURCE / "missing_variables.csv"
missing_years_df.to_csv(output_path, index=False)
os.chmod(output_path, 0o775)

# Report what was written, then preview the first 50 rows as the cell output.
print(
    f"Total missing year entries: {len(missing_years_df)}",
    f"Saved to: {output_path}",
    "\nFirst 50 rows:",
    sep="\n",
)
missing_years_df.head(50)

Total missing year entries: 252
Saved to: /mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/missing_variables.csv

First 50 rows:


Unnamed: 0,Model,Variant,Scenario,Variable,Grid,Frequency,Missing_Year
0,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2015
1,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2016
2,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2017
3,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2018
4,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2019
5,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2020
6,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2021
7,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2023
8,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2024
9,EC-Earth3,r1i1p1f1,ssp126,ua,gr,day,2026


In [15]:
# Let's debug - check if 'zg' actually has any complete: False
def find_all_complete_values(obj, path=""):
    """Collect every 'complete' entry found in a nested dict structure.

    Only dict values are descended into; anything else (including lists)
    contributes nothing. Returns a list of ``(dotted_path, value)`` tuples in
    depth-first order, where ``dotted_path`` locates the dict that holds the
    'complete' key (``path`` itself for the top-level dict).
    """
    if not isinstance(obj, dict):
        return []

    found = [(path, obj['complete'])] if 'complete' in obj else []
    for child_key, child in obj.items():
        # Avoid a leading dot when there is no prefix yet.
        child_path = f"{path}.{child_key}" if path else child_key
        found += find_all_complete_values(child, child_path)
    return found

# List every 'complete' flag recorded under the 'zg' entry, if there is one.
# NOTE(review): `incomplete` is not defined in the cells visible here — confirm
# which earlier cell creates it.
if 'zg' in incomplete:
    print("Complete values in 'zg':")
    zg_flags = find_all_complete_values(incomplete['zg'])
    for flag_path, flag_value in zg_flags:
        print(f"  {flag_path}: {flag_value}")