In [1]:
import importlib
import sys
import getpass
import uuid
import os
from jobmon.client.tool import Tool 
from pathlib import Path

# NOTE: These imports rely on external module definitions (constants, io_compare_utils, etc.)
import idd_climate_models.constants as rfc
from idd_climate_models.io_compare_utils import compare_model_validation
from idd_climate_models.dictionary_utils import parse_results
from idd_climate_models.resource_functions import get_rep_file_size_gb, get_resource_tier

# --- CONSTANT DEFINITIONS (from rfc) ---
repo_name = rfc.repo_name
package_name = rfc.package_name
DATA_DIR = rfc.RAW_DATA_PATH
PROCESSED_DATA_PATH = rfc.PROCESSED_DATA_PATH
TC_RISK_INPUT_PATH = rfc.TC_RISK_INPUT_PATH
SCRIPT_ROOT = rfc.REPO_ROOT / repo_name / "src" / package_name / "01_run_tc_risk"

# Configuration
DATA_SOURCE = "cmip6"
BIN_SIZE_YEARS = 20
DRY_RUN = False # Assuming DRY_RUN is generally False for submission
RERUN = False


INPUT_DATA_TYPE = "data"
INPUT_IO_TYPE = "processed"
OUTPUT_DATA_TYPE = "tc_risk"
OUTPUT_IO_TYPE = "input"



# ============================================================================
# DATA SETUP & VALIDATION
# ============================================================================

# Use the unified function for validation and comparison
validation_info = compare_model_validation(
    input_data_type=INPUT_DATA_TYPE,
    input_io_type=INPUT_IO_TYPE,
    output_data_type=OUTPUT_DATA_TYPE,
    output_io_type=OUTPUT_IO_TYPE,
    data_source=DATA_SOURCE,
    verbose=False
)

models_to_process = validation_info["models_to_process"]
model_variants_to_run = parse_results(validation_info["models_to_process_dict"], 'variant')

# Get the full hierarchy list to build the variable detail map
full_path_list = parse_results(validation_info["models_to_process_dict"], 'all')
variable_detail_map = {}

# Build the map: {(model, variant, scenario, variable): {'grid': 'gn', 'frequency': 'day'}}
for item in full_path_list:
    key = (
        item['model'],
        item['variant'],
        item['scenario'],
        item['variable']
    )
    # Store the unique grid and frequency needed to build the source_dir
    variable_detail_map[key] = {
        'grid': item['grid'],
        'frequency': item['frequency']
    }

def get_time_bins(scenario_name, bin_size_years):
    date_ranges = rfc.VALIDATION_RULES['tc_risk']['time-period']['date_ranges']
    if scenario_name not in date_ranges:
        print(f"Warning: No date range found for scenario '{scenario_name}'")
        return []
    start_year, end_year = date_ranges[scenario_name]
    return [(y, min(y + bin_size_years - 1, end_year)) for y in range(start_year, end_year + 1, bin_size_years)]

TIME_BINS = {
    scenario: get_time_bins(scenario, BIN_SIZE_YEARS)
    for scenario in rfc.SCENARIOS
}



Validation complete for: data, processed, cmip6
Summary: 22/23 models complete. Parsed log (up to 'grid') written to /mnt/team/rapidresponse/pub/tropical-storms/data/processed/cmip6/validation_log.json

Validation complete for: tc_risk, input, cmip6
Summary: 22/23 models complete. Parsed log (up to 'scenario') written to /mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/validation_log.json


In [2]:
# Assuming validation_info exists from the pipeline run
from idd_climate_models.dictionary_utils import summarize_all_failures

print("\n" + "=" * 80)
print("ANALYZING FAILURES FOR INCORRECTLY FILTERED MODELS")
print("=" * 80)

# Summarize the failures from the input validation dictionary
failure_summary = summarize_all_failures(validation_info['input_validation_dict'])

# Print the specific issues
for model, summary in failure_summary.items():
    print(f"Model: {model} -> {summary}")

print("=" * 80)


ANALYZING FAILURES FOR INCORRECTLY FILTERED MODELS

ANALYZING FAILURES (Highest-Level Issue Per Incomplete Model)
✗ INCOMPLETE: Validation Failed at: **Model=AWI-CM-1-1-MR -> variant=r1i1p1f1 -> scenario=historical -> variable=tos -> grid=gn -> frequency=Omon**
Issues (1): Forbidden unstructured grid dimensions found: ['ncells']. Model is incompatible with the target TC risk grid.
Model: AWI-CM-1-1-MR -> ✗ INCOMPLETE: Validation Failed at: **Model=AWI-CM-1-1-MR -> variant=r1i1p1f1 -> scenario=historical -> variable=tos -> grid=gn -> frequency=Omon**
Issues (1): Forbidden unstructured grid dimensions found: ['ncells']. Model is incompatible with the target TC risk grid.


In [2]:
validation_info['output_validation_dict']['validation_results']['AWI-CM-1-1-MR']['variant']['r1i1p1f1']['scenario']['ssp245']['time-period']['2095-2100']

{'complete': False,
 'files': [{'path': '/mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/AWI-CM-1-1-MR/r1i1p1f1/ssp245/2095-2100/hus_Amon_AWI-CM-1-1-MR_ssp245_r1i1p1f1_gn_209501-210012.nc'},
  {'path': '/mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/AWI-CM-1-1-MR/r1i1p1f1/ssp245/2095-2100/psl_Amon_AWI-CM-1-1-MR_ssp245_r1i1p1f1_gn_209501-210012.nc'},
  {'path': '/mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/AWI-CM-1-1-MR/r1i1p1f1/ssp245/2095-2100/ta_Amon_AWI-CM-1-1-MR_ssp245_r1i1p1f1_gn_209501-210012.nc'},
  {'path': '/mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/AWI-CM-1-1-MR/r1i1p1f1/ssp245/2095-2100/ua_day_AWI-CM-1-1-MR_ssp245_r1i1p1f1_gn_20950101-21001231.nc'},
  {'path': '/mnt/team/rapidresponse/pub/tropical-storms/tc_risk/input/cmip6/AWI-CM-1-1-MR/r1i1p1f1/ssp245/2095-2100/va_day_AWI-CM-1-1-MR_ssp245_r1i1p1f1_gn_20950101-21001231.nc'}],
 'issues': ["Missing required variable files: ['tos']"]}

In [1]:
# CELL 1: Setup and Configuration (Modified to select 5 files)
import xarray as xr
import os
from pathlib import Path
from memory_profiler import memory_usage
import time

# --- Path to the Directory Containing the Yearly Files ---
FILE_PATH = Path("/mnt/team/rapidresponse/pub/tropical-storms/data/processed/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/tos/gn/Omon")
OUTPUT_FILE = Path("./temp_tos_combined_test.nc")

# We select the first 5 yearly files for the test (simulating a 5-year bin)
file_list = sorted([f for f in os.listdir(FILE_PATH) if f.endswith('.nc')])
files_to_process = file_list[:5] # <-- Changed to 5 files (1 per year)

print(f"Total files found in source: {len(file_list)}")
print(f"Files selected for 5-year bin (tos Omon): {len(files_to_process)}")

# Test the size of a single file (should be ~169MB)
if files_to_process:
    single_file_size_gb = os.path.getsize(FILE_PATH / files_to_process[0]) / (1024**3)
    print(f"Single file size: {single_file_size_gb:.3f} GiB")
else:
    raise FileNotFoundError("No files found to process.")

Total files found in source: 54
Files selected for 5-year bin (tos Omon): 5
Single file size: 0.158 GiB


In [5]:
# CELL 1: Load a Single File and Inspect Metadata

import xarray as xr
from pathlib import Path
import os

# --- Path to the ACCESS-CM2 Directory ---
FILE_PATH = Path("/mnt/team/rapidresponse/pub/tropical-storms/data/processed/cmip6/ACCESS-CM2/r1i1p1f1/historical/tos/gn/Omon")
file_list = sorted([f for f in os.listdir(FILE_PATH) if f.endswith('.nc')])
first_file = FILE_PATH / file_list[0]

print(f"Inspecting file: {first_file.name}")

# 1. Open a single dataset lazily
ds = xr.open_dataset(first_file)

print(f"\n--- Dataset Dimensions & Variables for ACCESS-CM2 ---")
print(ds)

print(f"\n--- Coordinates and Their Shapes ---")
# Check the shape of the lat/lon arrays. If they are simple 1D arrays, memory overhead is low.
for name, data_array in ds.coords.items():
    print(f"Coordinate: {name}, Shape: {data_array.shape}, Dims: {data_array.dims}")

print(f"\n--- Check Memory Estimate of Single File ---")
print(f"Total estimated memory (raw data): {ds.nbytes / (1024**3):.3f} GiB")

Inspecting file: tos_Omon_ACCESS-CM2_historical_r1i1p1f1_gn_195001-195012.nc

--- Dataset Dimensions & Variables for ACCESS-CM2 ---
<xarray.Dataset> Size: 14MB
Dimensions:             (time: 12, bnds: 2, j: 300, i: 360, vertices: 4)
Coordinates:
  * time                (time) datetime64[ns] 96B 1950-01-16T12:00:00 ... 195...
  * j                   (j) int32 1kB 0 1 2 3 4 5 6 ... 294 295 296 297 298 299
  * i                   (i) int32 1kB 0 1 2 3 4 5 6 ... 354 355 356 357 358 359
    latitude            (j, i) float64 864kB ...
    longitude           (j, i) float64 864kB ...
Dimensions without coordinates: bnds, vertices
Data variables:
    time_bnds           (time, bnds) datetime64[ns] 192B ...
    vertices_latitude   (j, i, vertices) float64 3MB ...
    vertices_longitude  (j, i, vertices) float64 3MB ...
    tos                 (time, j, i) float32 5MB ...
Attributes: (12/47)
    Conventions:            CF-1.7 CMIP-6.2
    activity_id:            CMIP
    branch_method:         

In [6]:
# CELL 1: Load a Single File and Inspect Metadata

import xarray as xr
from pathlib import Path

FILE_PATH = Path("/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/tos/gn/Omon")
file_list = sorted([f for f in os.listdir(FILE_PATH) if f.endswith('.nc')])
first_file = FILE_PATH / file_list[0]

# 1. Open a single dataset lazily
ds = xr.open_dataset(first_file)

print(f"--- Dataset Dimensions & Variables ---")
print(ds)

print(f"\n--- Coordinates and Their Shapes ---")
# Identify any coordinates or variables that have a large number of indices or dimensions.
# Look for large arrays that are not the main 'tos' data variable.
for name, data_array in ds.coords.items():
    print(f"Coordinate: {name}, Shape: {data_array.shape}, Dims: {data_array.dims}")

print(f"\n--- Check Memory Estimate of Single File ---")
# This estimates the memory of the actual data, NOT the overhead.
# Look for a large discrepancy between the .nbytes estimate and the file size (0.165 GiB).
print(f"Total estimated memory (raw data): {ds.nbytes / (1024**3):.3f} GiB")

--- Dataset Dimensions & Variables ---
<xarray.Dataset> Size: 624MB
Dimensions:    (time: 120, bnds: 2, ncells: 830305, vertices: 16)
Coordinates:
  * time       (time) datetime64[ns] 960B 1961-01-16T12:00:00 ... 1970-12-16T...
    lat        (ncells) float64 7MB ...
    lon        (ncells) float64 7MB ...
Dimensions without coordinates: bnds, ncells, vertices
Data variables:
    time_bnds  (time, bnds) datetime64[ns] 2kB ...
    tos        (time, ncells) float32 399MB ...
    lat_bnds   (ncells, vertices) float64 106MB ...
    lon_bnds   (ncells, vertices) float64 106MB ...
Attributes: (12/39)
    frequency:              mon
    activity_id:            CMIP
    Conventions:            CF-1.7 CMIP-6.2
    creation_date:          2018-12-18T12:00:00Z
    data_specs_version:     01.00.27
    experiment:             historical
    ...                     ...
    parent_activity_id:     CMIP
    parent_experiment_id:   piControl
    parent_mip_era:         CMIP6
    parent_source_id:      

In [8]:
import xarray as xr
from pathlib import Path
import os

FILE_PATH = Path("/mnt/team/rapidresponse/pub/tropical-storms/data/processed/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/tos/gn/Omon")
file_list = sorted([f for f in os.listdir(FILE_PATH) if f.endswith('.nc')])
first_file = FILE_PATH / file_list[0]

# 1. Open the dataset lazily
ds_awi = xr.open_dataset(first_file)

# 2. Extract the coordinate arrays (these are large 1D arrays of size 830,305)
awi_latitudes = ds_awi['lat'].values
awi_longitudes = ds_awi['lon'].values

print(f"Total number of grid cells (ncells): {len(awi_latitudes)}")
print(f"Shape of latitude array: {awi_latitudes.shape}")
print(f"Shape of longitude array: {awi_longitudes.shape}")
print(f"Number of unique latitudes: {len(set(awi_latitudes))}")
print(f"Number of unique longitudes: {len(set(awi_longitudes))}")

Total number of grid cells (ncells): 830305
Shape of latitude array: (830305,)
Shape of longitude array: (830305,)
Number of unique latitudes: 830305
Number of unique longitudes: 830305


In [None]:
# Can I plot the lat/lon arrays to visualize their distribution?
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.scatter(awi_longitudes, awi_latitudes, s=1)
plt.title("Scatter Plot of AWI-CM-1-1-MR Grid Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import os
from pathlib import Path
import numpy as np # Used for finding global min/max

# --- Configuration ---
# Use the raw data path to get a full file for analysis
FILE_PATH = Path("/mnt/team/rapidresponse/pub/tropical-storms/data/raw/cmip6/AWI-CM-1-1-MR/r1i1p1f1/historical/tos/gn/Omon")

# --- Processing ---

try:
    file_list = sorted([f for f in os.listdir(FILE_PATH) if f.endswith('.nc')])
    if not file_list:
        print(f"Error: No NetCDF files found in {FILE_PATH}")
        exit()
        
    first_file = FILE_PATH / file_list[0]
    
    # Open the dataset, loading the first 12 months (or fewer if the file is shorter)
    with xr.open_dataset(first_file) as ds_awi:
        # Select the first 12 months for plotting (the first year)
        # We must compute() here to pull the coordinates and tos data into memory for plotting
        ds_12_months = ds_awi.isel(time=slice(0, 12)).compute()

        # Extract the static coordinates (lat/lon are independent of time)
        awi_latitudes = ds_12_months['lat'].values
        awi_longitudes = ds_12_months['lon'].values
        
        # Extract the temperature data (tos) for all 12 months
        awi_tos_data_all = ds_12_months['tos']
        
        # Determine the global min/max for a single, consistent color scale
        global_vmin = awi_tos_data_all.min().item()
        global_vmax = awi_tos_data_all.max().item()

        # --- Visualization (3x4 Multi-Plot Grid) ---
        
        # Create a figure with 3 rows and 4 columns, sharing the coordinate axes
        fig, axes = plt.subplots(
            nrows=3, 
            ncols=4, 
            figsize=(20, 12), # Increased size for readability
            sharex=True, 
            sharey=True
        )
        # Flatten the 3x4 array of axes for easy iteration
        axes = axes.flatten()
        
        fig.suptitle(f"AWI-CM-1-1-MR Unstructured Grid: TOS Across 12 Months", fontsize=20, y=0.95)

        # Loop through all 12 time steps (months)
        for i, ax in enumerate(axes):
            
            # Select the data for the current month
            tos_data_month = awi_tos_data_all.isel(time=i).values
            month_label = ds_12_months.time.dt.strftime('%Y-%m').isel(time=i).item()
            
            # Plot the unstructured data for this month
            scatter = ax.scatter(
                awi_longitudes, 
                awi_latitudes, 
                s=1, # Small size for performance
                c=tos_data_month, 
                cmap='coolwarm', 
                vmin=global_vmin, # Use global bounds
                vmax=global_vmax
            )
            
            # Set the title for the subplot
            ax.set_title(month_label, fontsize=12)
            ax.set_aspect('equal', adjustable='box') # Keep aspect ratio for spatial data
            ax.tick_params(labelsize=8) # Smaller ticks for subplots
            ax.grid(True, alpha=0.3)
            
            # Label the outer rows/columns
            if i % 4 == 0:
                ax.set_ylabel("Latitude")
            if i >= 8:
                ax.set_xlabel("Longitude")


        # --- Add a Single Colorbar for the Entire Figure ---
        # The colorbar is created using the last scatter object and placed in an external axis
        # (This is a common trick for shared colorbars in grids)
        cbar_ax = fig.add_axes([0.92, 0.1, 0.02, 0.75]) # [left, bottom, width, height]
        fig.colorbar(scatter, cax=cbar_ax, label="Sea Surface Temperature (TOS)")

        plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust layout to make room for the colorbar
        plt.show()

except Exception as e:
    print(f"An error occurred during processing: {e}")

In [3]:
# CELL 2: Open Datasets (Memory Inspection Point 1)

# This step loads all metadata and builds the Dask graph. Memory usage spikes here.
print("Starting memory intensive operation: Opening and building Dask graph...")

# Use a memory profiler wrapper to track peak memory consumption
def load_and_concat(file_list):
    datasets = [xr.open_dataset(FILE_PATH / f) for f in file_list]
    # The moment xr.concat is called, the full Dask graph is built
    combined_ds = xr.concat(datasets, dim='time', data_vars='all').sortby('time')
    return combined_ds

# Run the task and track memory (measured in MiB by memory_usage)
# If the previous attempts failed at 20G, the peak_mem here should exceed 20480 MiB.
peak_mem, combined_ds = memory_usage((load_and_concat, (files_to_process,)), 
                                       interval=0.1, max_usage=True, retval=True)

print(f"\nMemory Tracking Results (MiB):")
print(f"Peak Memory Usage (MiB): {peak_mem:.2f}")
print(f"Peak Memory Usage (GiB): {(peak_mem / 1024):.2f}")

Starting memory intensive operation: Opening and building Dask graph...

Memory Tracking Results (MiB):
Peak Memory Usage (MiB): 37528.29
Peak Memory Usage (GiB): 36.65


In [None]:
# CELL 3: Final Write (Memory Inspection Point 2)

# Write the data with compression. This forces computation of the Dask graph.
encoding = {
    var: {'zlib': True, 'complevel': 7} 
    for var in combined_ds.data_vars
}

print("Starting disk write with compression...")
start_time = time.time()
combined_ds.to_netcdf(OUTPUT_FILE, encoding=encoding, engine='netcdf4')
end_time = time.time()

print(f"Write successful: {OUTPUT_FILE}")
print(f"Time taken: {(end_time - start_time):.2f} seconds")

In [2]:
import os
import sys
from pathlib import Path

# --- Imports needed for the test ---
# NOTE: These imports rely on your custom package structure
try:
    import idd_climate_models.constants as rfc
    # These functions must exist in your current validation_functions.py
    from idd_climate_models.validation_functions import validate_model_in_source
except ImportError as e:
    print("FATAL: Cannot import project modules. Ensure python path is correct.")
    print(f"Error: {e}")
    sys.exit(1)

# --- Mock Data to Simulate AWI Model ---
MODEL_NAME = "AWI-CM-1-1-MR"
SOURCE_PATH = str(rfc.PROCESSED_DATA_PATH / rfc.DATA_SOURCE) # e.g., /.../data/processed/cmip6

# --- Specific AWI file path (used only for reference) ---
# The validation system will internally look at this path structure:
# SOURCE_PATH/MODEL_NAME/.../frequency_folder
DUMMY_DATA_TYPE = "data"
DUMMY_DATA_SOURCE = "cmip6"

AttributeError: module 'idd_climate_models.constants' has no attribute 'DATA_SOURCE'

In [None]:


# =========================================================================
# STEP 1: Execute the single-model validation with the strict flag ON
# =========================================================================

print(f"Starting isolated validation check for model: {MODEL_NAME}")
print(f"Source path: {SOURCE_PATH}")
print("-" * 50)

# We force the strict_grid_check=True flag here.
result = validate_model_in_source(
    model_name=MODEL_NAME,
    source_path=SOURCE_PATH,
    data_type=DUMMY_DATA_TYPE,
    data_source=DUMMY_DATA_SOURCE,
    strict_grid_check=True # <--- CRITICAL FLAG IS ON
)

# =========================================================================
# STEP 2: Analyze the result
# =========================================================================

print("\n--- Validation Result Summary ---")
print(f"Model Complete Status: {result.get('complete', 'N/A')}")
print(f"Top-level Issues ({len(result.get('issues', []))}): {result.get('issues', [])}")

# Look deep into the nested results to find the specific failure reason
fail_count = 0
for variant, v_data in result.get('variant', {}).items():
    for scenario, s_data in v_data.get('scenario', {}).items():
        # Check all children (variables/frequencies) for issues
        for child_name, child_data in s_data.items():
            if child_name == 'variable': # Variable layer
                for var, var_data in child_data.items():
                    for grid, grid_data in var_data.get('grid', {}).items():
                        for freq, freq_data in grid_data.get('frequency', {}).items():
                            if not freq_data['complete']:
                                fail_count += 1
                                print(f"  FAILURE at {variant}/{scenario}/{var}/{freq}: {freq_data.get('issues', ['No Issues Found?'])}")

if fail_count > 0:
    print(f"\n❌ FAILED: Found {fail_count} failing sub-runs for {MODEL_NAME}.")
else:
    print("\n✅ SUCCESS: Model structure passed the strict grid check.")