In [2]:
import sys
import getpass
from pathlib import Path

# New/Corrected Imports for Validation Functions
import idd_climate_models.constants as rfc
from idd_climate_models.validation_functions import validate_all_models_in_source, create_validation_dict
from idd_climate_models.dictionary_utils import parse_results, nest_parsed_results

# --- Path and Configuration Setup ---
# Repository and package identifiers are read from the shared constants module.
repo_name, package_name = rfc.repo_name, rfc.package_name

# Input / output data roots (likely MODEL_ROOT / data / raw and
# MODEL_ROOT / data / processed respectively -- confirm against the constants module).
DATA_DIR = rfc.RAW_DATA_PATH
PROCESSED_DATA_PATH = rfc.PROCESSED_DATA_PATH

# Location of the clean-and-split scripts inside the repository checkout.
SCRIPT_ROOT = rfc.REPO_ROOT / repo_name / "src" / package_name / "clean_and_split"

# --- Run configuration ---
DATA_TYPE = "data"
DATA_SOURCE = "cmip6"  # e.g., 'cmip6', or loop over this if needed
IO_TYPE_RAW = "raw"
IO_TYPE_PROCESSED = "processed"  # Defines the output target location

TEST_MODE = True

def split_models_by_completeness(validation_results):
    """
    Partition model names by their top-level 'complete' flag.

    Args:
        validation_results (dict): Mapping of model name -> validation node.
            A model counts as complete when its node's 'complete' value is
            truthy; a missing flag is treated as incomplete.

    Returns:
        tuple[set, set]: (complete model names, incomplete model names).
    """
    complete = {
        model for model, data in validation_results.items()
        if data.get('complete', False)
    }
    return complete, set(validation_results) - complete


# ================================================================================
# STEP 1: Validate RAW and PROCESSED data structures
# ================================================================================

print("\n" + "=" * 80)
print(f"STEP 1.1: Validating RAW models for task creation (Source: {DATA_SOURCE}).")
print("================================================================================")

# 1. Run validation on RAW data (to find structural completeness and files needing fill).
# Built and validated in a single assignment so re-running the cell never feeds an
# already-validated dictionary back into the validator.
raw_validation_dict = validate_all_models_in_source(
    validation_dict=create_validation_dict(DATA_TYPE, IO_TYPE_RAW, DATA_SOURCE),
    verbose=True
)
# Models that are / are not structurally complete in the RAW source
raw_complete_models, raw_incomplete_models = split_models_by_completeness(
    raw_validation_dict['validation_results']
)


print("\n" + "=" * 80)
print("STEP 1.2: Validating PROCESSED models (Check for already completed work).")
print("================================================================================")

# 2. Run validation on PROCESSED data (to find models already done)
processed_validation_dict = validate_all_models_in_source(
    validation_dict=create_validation_dict(DATA_TYPE, IO_TYPE_PROCESSED, DATA_SOURCE),
    verbose=True
)
# Models that are / are not structurally complete in the PROCESSED target
processed_complete_models, processed_incomplete_models = split_models_by_completeness(
    processed_validation_dict['validation_results']
)


STEP 1.1: Validating RAW models for task creation (Source: cmip6).
[DATA] Starting full validation for: cmip6...
[1/26] Validating ACCESS-CM2... [2/26] Validating ACCESS-ESM1-5... [3/26] Validating AWI-CM-1-1-MR... [4/26] Validating BCC-CSM2-MR... [5/26] Validating CESM2... [6/26] Validating CESM2-WACCM... [7/26] Validating CMCC-CM2-SR5... [8/26] Validating CMCC-ESM2... [9/26] Validating CanESM5... [10/26] Validating EC-Earth3... [11/26] Validating EC-Earth3-Veg... [12/26] Validating EC-Earth3-Veg-LR... [13/26] Validating FGOALS-g3... [14/26] Validating IITM-ESM... [15/26] Validating INM-CM4-8... [16/26] Validating INM-CM5-0... [17/26] Validating IPSL-CM6A-LR... [18/26] Validating KACE-1-0-G... [19/26] Validating MIROC6... [20/26] Validating MPI-ESM1-2-HR... [21/26] Validating MPI-ESM1-2-LR... [22/26] Validating MRI-ESM2-0... [23/26] Validating NESM3... [24/26] Validating NorESM2-LM... [25/26] Validating NorESM2-MM... [26/26] Validating TaiESM1... 
Validation complete for: cmip6
Summa

In [3]:
raw_incomplete_models

{'CESM2', 'CESM2-WACCM', 'KACE-1-0-G'}

In [10]:

def find_first_failure(validation_data, path="Model"):
    """
    Depth-first search for the first incomplete node that reports issues.

    A node is reported when its 'complete' value is exactly False and its
    'issues' list is non-empty. Otherwise every dict-valued entry other than
    'issues', 'complete' and 'files' is treated as a folder level whose items
    are child nodes, and those are searched in insertion order.

    Args:
        validation_data (dict): A validation node (model, variant, scenario, ...).
            Non-dict values are tolerated and treated as "nothing to report".
        path (str): Breadcrumb describing where `validation_data` sits in the
            tree; extended with " -> level=name" for each level descended.

    Returns:
        str | None: A two-line summary ("Validation Failed at: ..." followed by
        the joined issues) for the first failing node found, or None when no
        incomplete node with issues exists in this subtree.
    """
    # Stray non-mapping values (counts, strings, None) cannot be validation
    # nodes; skip them instead of raising while recursing.
    if not isinstance(validation_data, dict):
        return None

    if validation_data.get('complete') is False:
        issues = validation_data.get('issues', [])
        if issues:
            # str() keeps the summary usable even if an issue is not a plain string.
            joined_issues = '; '.join(str(issue) for issue in issues)
            return (
                f"Validation Failed at: **{path}**\n"
                f"Issues ({len(issues)}): {joined_issues}"
            )

    # Bookkeeping keys are never folder levels.
    bookkeeping_keys = ('issues', 'complete', 'files')
    for key, child_data in validation_data.items():
        if key in bookkeeping_keys or not isinstance(child_data, dict):
            continue
        for child_name, nested_result in child_data.items():
            failure_summary = find_first_failure(
                nested_result, f"{path} -> {key}={child_name}"
            )
            if failure_summary:
                return failure_summary
    return None


def summarize_all_failures(validation_dict):
    """
    Print and collect one failure summary per incomplete model.

    Every model under validation_dict['validation_results'] whose top-level
    'complete' flag is missing or falsy is passed to find_first_failure, and
    the resulting message (or a generic fallback when none is found) is
    printed and recorded.

    Args:
        validation_dict (dict): The dictionary containing 'validation_results'.

    Returns:
        dict: A dictionary of {model_name: failure_summary_string}; models
        that are complete do not appear.
    """
    all_results = validation_dict['validation_results']
    banner = "=" * 80

    print("\n" + banner)
    print("ANALYZING FAILURES (Highest-Level Issue Per Incomplete Model)")
    print(banner)

    failure_summaries = {}
    for model_name, model_data in all_results.items():
        # Complete models have nothing to report.
        if model_data.get('complete', False):
            continue

        # Search the model's whole nested result for the first reported issue.
        located = find_first_failure(model_data, path=f"Model={model_name}")

        if located:
            console_line = f"✗ INCOMPLETE: {located}"
            failure_summaries[model_name] = console_line
        else:
            # Flagged incomplete, yet no node below carries any issues.
            failure_summaries[model_name] = "✗ INCOMPLETE: Failed at model root, but no specific issue found deeper."
            console_line = f"Model={model_name}: {failure_summaries[model_name]}"
        print(console_line)

    print(banner)
    return failure_summaries

summarize_all_failures(raw_validation_dict)


ANALYZING FAILURES (Highest-Level Issue Per Incomplete Model)
✗ INCOMPLETE: Validation Failed at: **Model=CESM2 -> variant=r1i1p1f1**
Issues (1): Missing required folders: ['ssp126', 'ssp245', 'ssp585']
✗ INCOMPLETE: Validation Failed at: **Model=CESM2-WACCM -> variant=r1i1p1f1 -> scenario=historical -> variable=tos**
Issues (1): Expected 1 folder(s), found 2: ['gn', 'gr']
✗ INCOMPLETE: Validation Failed at: **Model=KACE-1-0-G -> variant=r1i1p1f1 -> scenario=historical -> variable=ua -> grid=gr -> frequency=day**
Issues (1): Coverage ends too early: 20141230, expected until: 20141231


{'CESM2': "✗ INCOMPLETE: Validation Failed at: **Model=CESM2 -> variant=r1i1p1f1**\nIssues (1): Missing required folders: ['ssp126', 'ssp245', 'ssp585']",
 'CESM2-WACCM': "✗ INCOMPLETE: Validation Failed at: **Model=CESM2-WACCM -> variant=r1i1p1f1 -> scenario=historical -> variable=tos**\nIssues (1): Expected 1 folder(s), found 2: ['gn', 'gr']",
 'KACE-1-0-G': '✗ INCOMPLETE: Validation Failed at: **Model=KACE-1-0-G -> variant=r1i1p1f1 -> scenario=historical -> variable=ua -> grid=gr -> frequency=day**\nIssues (1): Coverage ends too early: 20141230, expected until: 20141231'}

In [12]:
raw_validation_dict['validation_results']

{'ACCESS-CM2': {'complete': True,
  'issues': [],
  'variant': {'r1i1p1f1': {'complete': True,
    'issues': [],
    'scenario': {'historical': {'complete': True,
      'issues': [],
      'variable': {'hus': {'complete': True,
        'issues': [],
        'grid': {'gn': {'complete': True,
          'issues': [],
          'frequency': {'Amon': {'complete': True,
            'files': [{'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/ACCESS-CM2/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_ACCESS-CM2_historical_r1i1p1f1_gn_195001-201412.nc',
              'fill_required': False}],
            'issues': []}}}}},
       'psl': {'complete': True,
        'issues': [],
        'grid': {'gn': {'complete': True,
          'issues': [],
          'frequency': {'Amon': {'complete': True,
            'files': [{'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/ACCESS-CM2/r1i1p1f1/historical/psl/gn/Amon/psl_Amon_ACCESS-CM2_historical_r1i1p1f1_gn_185001-201412.nc'

In [2]:
# ================================================================================
# STEP 2: Compare Model Sets and Filter Tasks
# ================================================================================

print("\n" + "=" * 80)
print("STEP 2: Comparing validation results and generating tasks.")
print("================================================================================")

# 3. Find models that need processing (complete in RAW but NOT complete in PROCESSED)
models_to_process = raw_complete_models - processed_complete_models

# 4. Filter the RAW results down to ONLY the models we need to run jobs for.
# The per-model results live under raw_validation_dict['validation_results'];
# the previously referenced name `raw_validation_results` is never assigned in
# this notebook and raised NameError on a fresh kernel.
raw_results_for_tasks = {
    model: data
    for model, data in raw_validation_dict['validation_results'].items()
    if model in models_to_process
}



STEP 2: Comparing validation results and generating tasks.


In [7]:
# Ordered folder levels for each data type, outermost first.
# Both layouts share the same leading levels.
_SHARED_LEVELS = ['model', 'variant', 'scenario']

FOLDER_STRUCTURE = {
    'data': _SHARED_LEVELS + ['variable', 'grid', 'frequency'],
    'tc_risk': _SHARED_LEVELS + ['time-period'],
}


In [None]:
# Wrap the filtered RAW results in the same envelope the parsing helpers expect
# ('data_type' selects the folder structure, 'validation_results' holds the tree).
validation_dict = dict(
    data_type=DATA_TYPE,
    io_data_type=IO_TYPE_RAW,
    data_source=DATA_SOURCE,
    validation_results=raw_results_for_tasks,
)
def parse_results(validation_dict, detail='variant'):
    """
    Flatten nested validation results into a list of per-path context dicts.

    Args:
        validation_dict (dict): Must provide 'data_type' (a key of
            FOLDER_STRUCTURE) and 'validation_results' (model name -> nested
            result node). Each node stores its children under the name of the
            next folder level, e.g. node['variant']['r1i1p1f1'].
        detail (str): Deepest folder level to descend to (e.g. 'model',
            'variant', 'grid'), or 'all' to descend to the last level and
            emit one entry per file.

    Returns:
        list[dict]: One dict per visited path, mapping each level name from
        'model' down to `detail` onto the folder name at that level. With
        detail='all', each entry additionally carries 'file_path' and
        'fill_required' for a single file; a leaf without a 'files' key falls
        back to the bare context dict.

    Raises:
        KeyError: If validation_dict['data_type'] is not in FOLDER_STRUCTURE.
        ValueError: If `detail` is neither 'all' nor a level of that data type.
    """
    # NOTE(review): this definition shadows the `parse_results` imported from
    # idd_climate_models.dictionary_utils in the first cell -- confirm which
    # one should be kept.
    folder_levels = FOLDER_STRUCTURE[validation_dict['data_type']]

    # Levels to descend through below the model; 'model' itself is handled
    # by the outer loop.
    if detail == 'all':
        levels_to_use = folder_levels[1:]
    elif detail in folder_levels:
        levels_to_use = folder_levels[1:folder_levels.index(detail) + 1]
    else:
        raise ValueError(
            f"Unknown detail level {detail!r}; expected 'all' or one of {folder_levels}"
        )

    def walk(node, remaining_levels, context):
        """Yield one context dict per path below `node` for the remaining levels."""
        level, deeper_levels = remaining_levels[0], remaining_levels[1:]
        for name, child in node.get(level, {}).items():
            child_context = {**context, level: name}
            if deeper_levels:
                yield from walk(child, deeper_levels, child_context)
            elif detail == 'all' and 'files' in child:
                # Deepest level: one entry per file with its fill flag.
                for file_meta in child['files']:
                    yield {
                        **child_context,
                        'file_path': file_meta['path'],
                        'fill_required': file_meta.get('fill_required', False),
                    }
            else:
                yield child_context

    flat_path_list = []
    for model, data in validation_dict['validation_results'].items():
        context = {'model': model}
        if levels_to_use:
            flat_path_list.extend(walk(data, levels_to_use, context))
        else:
            # detail == 'model': nothing to descend into.
            flat_path_list.append(context)

    return flat_path_list

def nest_parsed_results(flat_path_list, data_type):
    """
    Rebuild a nested dictionary from the flat context dicts of parse_results.

    The levels present in the first item (in FOLDER_STRUCTURE order) determine
    the nesting. The result is keyed by model, then by each intermediate
    level, and the second-to-last level maps directly onto the value of the
    last level, e.g. {model: {variant: {scenario: {variable: grid}}}}.
    If several items share the same path down to the second-to-last level,
    the last one wins. Keys that are not folder levels (such as 'file_path')
    are ignored.

    Args:
        flat_path_list (list[dict]): Items as returned by parse_results; all
            items are expected to carry the same level keys as the first.
        data_type (str): Key into FOLDER_STRUCTURE; an unknown type leaves
            only the 'model' level.

    Returns:
        dict: The nested structure, or {} when the input is empty or fewer
        than two levels are present. When exactly two levels are present
        ('model' plus one more), each child name maps to an empty dict.
    """
    # NOTE(review): this definition shadows the `nest_parsed_results` imported
    # from idd_climate_models.dictionary_utils in the first cell -- confirm
    # which one should be kept.
    if not flat_path_list:
        return {}

    # FOLDER_STRUCTURE entries already start with 'model'; de-duplicate (order
    # preserved) so 'model' is not counted twice by the length guard below.
    full_levels = list(dict.fromkeys(['model', *FOLDER_STRUCTURE.get(data_type, [])]))
    present_keys = flat_path_list[0].keys()
    nesting_levels = [level for level in full_levels if level in present_keys]

    if len(nesting_levels) < 2:
        return {}  # Cannot create key-value pair if there aren't two final levels

    terminal_key = nesting_levels[-1]      # e.g., 'grid'
    penultimate_key = nesting_levels[-2]   # e.g., 'variable'

    nested_results = {}
    for path_dict in flat_path_list:
        cursor = nested_results.setdefault(path_dict['model'], {})
        for level_key in nesting_levels:
            if level_key == 'model':
                continue  # The model level is the outer key handled above
            name = path_dict[level_key]
            if level_key == penultimate_key:
                cursor[name] = path_dict[terminal_key]
                break  # This path is fully placed
            # Descend, creating the intermediate dictionary on first sight
            cursor = cursor.setdefault(name, {})

    return nested_results

# Example usage:
out = parse_results(validation_dict, detail='grid')
out

[{'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'hus',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'psl',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'ta',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'tos',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'ua',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'historical',
  'variable': 'va',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'ssp126',
  'variable': 'hus',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'scenario': 'ssp126',
  'variable': 'psl',
  'grid': 'gn'},
 {'model': 'AWI-CM-1-1-MR',
  'variant': 'r1i1p1f1',
  'sce

In [47]:
nest_parsed_results(out, DATA_TYPE)

{'AWI-CM-1-1-MR': {'r1i1p1f1': {'historical': {'hus': 'gn',
    'psl': 'gn',
    'ta': 'gn',
    'tos': 'gn',
    'ua': 'gn',
    'va': 'gn'},
   'ssp126': {'hus': 'gn',
    'psl': 'gn',
    'ta': 'gn',
    'tos': 'gn',
    'ua': 'gn',
    'va': 'gn'},
   'ssp245': {'hus': 'gn',
    'psl': 'gn',
    'ta': 'gn',
    'tos': 'gn',
    'ua': 'gn',
    'va': 'gn'},
   'ssp585': {'hus': 'gn',
    'psl': 'gn',
    'ta': 'gn',
    'tos': 'gn',
    'ua': 'gn',
    'va': 'gn'}}},
 'IPSL-CM6A-LR': {'r1i1p1f1': {'historical': {'hus': 'gr',
    'psl': 'gr',
    'ta': 'gr',
    'tos': 'gn',
    'ua': 'gr',
    'va': 'gr'},
   'ssp126': {'hus': 'gr',
    'psl': 'gr',
    'ta': 'gr',
    'tos': 'gn',
    'ua': 'gr',
    'va': 'gr'},
   'ssp245': {'hus': 'gr',
    'psl': 'gr',
    'ta': 'gr',
    'tos': 'gn',
    'ua': 'gr',
    'va': 'gr'},
   'ssp585': {'hus': 'gr',
    'psl': 'gr',
    'ta': 'gr',
    'tos': 'gn',
    'ua': 'gr',
    'va': 'gr'}}},
 'MRI-ESM2-0': {'r1i1p1f1': {'historical': {'hus': 

In [4]:
raw_results_for_tasks


{'AWI-CM-1-1-MR': {'complete': True,
  'issues': [],
  'variant': {'r1i1p1f1': {'complete': True,
    'issues': [],
    'scenario': {'historical': {'complete': True,
      'issues': [],
      'variable': {'hus': {'complete': True,
        'issues': [],
        'grid': {'gn': {'complete': True,
          'issues': [],
          'frequency': {'Amon': {'complete': True,
            'files': [{'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_AWI-CM-1-1-MR_historical_r1i1p1f1_gn_197001-197012.nc',
              'fill_required': False},
             {'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_AWI-CM-1-1-MR_historical_r1i1p1f1_gn_197101-197112.nc',
              'fill_required': False},
             {'path': '/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/hus/gn/Amon/hus_Amon_AWI-CM-1-1-MR_histor

In [None]:


# 5. Generate the final list of file-level tasks using the filtered results.
# NOTE: this step previously called `flatten_validation_results`, a name that is
# neither imported nor defined anywhere in this notebook (NameError on a fresh
# kernel). `parse_results(..., detail='all')` reads the folder structure for
# DATA_TYPE dynamically and emits one entry per file.
tasks_to_run = parse_results(
    {
        'data_type': DATA_TYPE,
        'io_data_type': IO_TYPE_RAW,
        'data_source': DATA_SOURCE,
        'validation_results': raw_results_for_tasks,
    },
    detail='all',
)

# 6. Check for the warning case: Models complete in processed but NOT in raw
processed_only_models = processed_complete_models - raw_complete_models

if processed_only_models:
    print("\n⚠️ WARNING: Found PROCESSED models that are NOT complete in RAW data:")
    print(f"   {processed_only_models}")
    print("   These models may have been processed from incomplete source data.")


# 7. Final Summary
print("\n" + "=" * 80)
print(f"SUMMARY: {len(raw_complete_models)} complete RAW models found.")
print(f"         {len(processed_complete_models)} complete PROCESSED models found.")
print(f"         {len(models_to_process)} unique models require processing.")
print(f"         Resulting in {len(tasks_to_run)} files to process.")
print("=" * 80)


if not tasks_to_run:
    print("\n✅ Execution halted: All required files have either been processed or are incomplete in the raw data.")
    # sys.exit raises SystemExit; the bare `exit` helper is meant for interactive
    # shells and should not be relied on in program code.
    sys.exit(0)
    


STEP 1.1: Validating RAW models for task creation (Source: cmip6).
[DATA] Starting full validation for: cmip6...
[1/27] Validating ACCESS-CM2... [2/27] Validating ACCESS-ESM1-5... [3/27] Validating AWI-CM-1-1-MR... [4/27] Validating BCC-CSM2-MR... [5/27] Validating CESM2... [6/27] Validating CESM2-WACCM... [7/27] Validating CMCC-CM2-SR5... [8/27] Validating CMCC-ESM2... [9/27] Validating CanESM5... [10/27] Validating EC-Earth3... [11/27] Validating EC-Earth3-Veg... [12/27] Validating EC-Earth3-Veg-LR... [13/27] Validating FGOALS-g3... [14/27] Validating IITM-ESM... [15/27] Validating INM-CM4-8... [16/27] Validating INM-CM5-0... [17/27] Validating IPSL-CM6A-LR... [18/27] Validating KACE-1-0-G... [19/27] Validating MIROC6... [20/27] Validating MPI-ESM1-2-HR... [21/27] Validating MPI-ESM1-2-LR... [22/27] Validating MRI-ESM2-0... [23/27] Validating NESM3... [24/27] Validating NorESM2-LM... [25/27] Validating NorESM2-MM... [26/27] Validating TaiESM1... [27/27] Validating validation_cache..

: 