In [1]:
import netCDF4 as nc

In [2]:
# Input files whose variable metadata is printed by the inspection loop.
# Paths are relative to the notebook's working directory.
file_paths = [
    "2024_air_pressure_at_sea_level.nc",
    "2024_geopotential_height.nc",
    "2024_wbpt.nc",
]

In [3]:
# Print per-variable metadata (type, dimensions, shape, attributes) for each file.
for path in file_paths:
    print(f"\n Inspecting file: {path}")
    try:
        # The context manager closes the dataset even when reading metadata
        # raises part-way through, so the file handle is never leaked.
        with nc.Dataset(path, mode='r') as ds:
            print("  ✅ Opened successfully.\n")

            for var_name, var in ds.variables.items():
                print(f"Variable: {var_name}")
                print(f"  - Type: {type(var)}")
                print(f"  - Dimensions: {var.dimensions}")
                print(f"  - Shape: {var.shape}")
                print("  - Attributes:")
                for attr_name in var.ncattrs():
                    print(f"    • {attr_name}: {getattr(var, attr_name)}")
                print("")

    except Exception as e:
        # Best-effort inspection: report the problem and move on to the next
        # file instead of aborting the whole loop.
        print(f"Failed to open {path}: {e}")


 Inspecting file: 2024_air_pressure_at_sea_level.nc
Failed to open 2024_air_pressure_at_sea_level.nc: [Errno -51] NetCDF: Unknown file format: '2024_air_pressure_at_sea_level.nc'

 Inspecting file: 2024_geopotential_height.nc
  ✅ Opened successfully.

Variable: geopotential_height
  - Type: <class 'netCDF4._netCDF4.Variable'>
  - Dimensions: ('time', 'latitude', 'longitude')
  - Shape: (447, 121, 161)
  - Attributes:
    • standard_name: geopotential_height
    • units: m
    • grid_mapping: latitude_longitude
    • coordinates: forecast_period originating_centre pressure

Variable: latitude_longitude
  - Type: <class 'netCDF4._netCDF4.Variable'>
  - Dimensions: ()
  - Shape: ()
  - Attributes:
    • grid_mapping_name: latitude_longitude
    • longitude_of_prime_meridian: 0.0
    • earth_radius: 6371229.0

Variable: time
  - Type: <class 'netCDF4._netCDF4.Variable'>
  - Dimensions: ('time',)
  - Shape: (447,)
  - Attributes:
    • axis: T
    • units: hours since 1970-01-01 00:00:00
 

In [4]:
# Mapping of input file path -> xarray engine used to open it.
# NOTE(review): the first file is routed to "cfgrib" although it carries a
# .nc extension; netCDF4 reported "Unknown file format" for it, so it is
# presumably GRIB data -- confirm.
dict_path = {
    "2024_air_pressure_at_sea_level.nc": "cfgrib",
    "2024_geopotential_height.nc": "netcdf4",
    "2024_wbpt.nc": "netcdf4",
}

In [5]:
# Echo the path -> engine mapping so the configuration is visible in the cell output.
dict_path

{'2024_air_pressure_at_sea_level.nc': 'cfgrib',
 '2024_geopotential_height.nc': 'netcdf4',
 '2024_wbpt.nc': 'netcdf4'}

In [6]:
import xarray as xr
import numpy as np
import pandas as pd
import os

## Extracting 2-day slices

## Extract slices on a daily basis

In [10]:
# Open every input with its engine and record the first/last timestamp of each,
# then cut the shared time range into overlapping fixed-length windows and save
# one stacked .npy file per window.
datasets = {}
start_dates = []
end_dates = []

# `dict_path` maps each file path to the xarray engine that reads it. The
# original loop referenced `file_info`, which is not defined anywhere in the
# notebook and raises NameError on a fresh kernel.
for path, engine in dict_path.items():
    print(f"\n Opening: {path} with engine: {engine}")
    ds = xr.open_dataset(path, engine=engine)
    datasets[path] = ds
    time_vals = ds['time'].values
    start_dates.append(time_vals[0])
    end_dates.append(time_vals[-1])
    print(f"Time range: {time_vals[0]} to {time_vals[-1]} (n={len(time_vals)})")

# Determine common time window: latest start and earliest end across all files
common_start = max(start_dates)
common_end = min(end_dates)
print(f"\n Common time window: {common_start} to {common_end}")

# Define chunk size and extract matching time indices
step = 8  # 2 days = 8 x 6-hour intervals
stride = 4  # 1 day overlap = 4 steps
# The first opened dataset supplies the reference time axis; every dataset is
# later selected with these exact timestamps.
ref_ds = list(datasets.values())[0]
valid_times = ref_ds['time'].sel(time=slice(common_start, common_end))

print(f"Total available time steps: {len(valid_times)}")

# Create output directory
os.makedirs("npy_output", exist_ok=True)

# Process and save each overlapping chunk. The range stops at the last start
# index that still leaves a full `step`-long window, so partial chunks are
# never written (and nothing is written when fewer than `step` steps exist).
for i_start in range(0, len(valid_times) - step + 1, stride):
    i_end = i_start + step

    chunk_times = valid_times[i_start:i_end]
    chunk_start_str = pd.to_datetime(str(chunk_times[0].values)).strftime('%Y%m%dT%H%M')

    arrays = []  # Will hold one array per retained variable, in file order

    for ds in datasets.values():
        sliced = ds.sel(time=chunk_times)

        for var in ds.data_vars:
            data_array = sliced[var]
            if data_array.ndim < 3:
                continue  # Skip scalars and other variables with fewer than 3 dims
            # Only load and cast the variables that are actually kept.
            arrays.append(data_array.values.astype(np.float32))

    # Stack along a new leading axis (one entry per variable) and save
    d = np.stack(arrays, axis=0)
    stacked_filename = f"npy_output/{chunk_start_str}_stacked.npy"
    np.save(stacked_filename, d)
    print(f"Saved stacked array: {stacked_filename} | shape = {d.shape}")


 Opening: 2024_air_pressure_at_sea_level.nc with engine: cfgrib
Time range: 2024-01-01T00:00:00.000000000 to 2024-04-21T12:00:00.000000000 (n=447)

 Opening: 2024_geopotential_height.nc with engine: netcdf4
Time range: 2024-01-01T00:00:00.000000000 to 2024-04-21T12:00:00.000000000 (n=447)

 Opening: 2024_wbpt.nc with engine: netcdf4
Time range: 2024-01-01T00:00:00.000000000 to 2024-04-22T18:00:00.000000000 (n=452)

 Common time window: 2024-01-01T00:00:00.000000000 to 2024-04-21T12:00:00.000000000
Total available time steps: 447
Saved stacked array: npy_output/20240101T0000_stacked.npy | shape = (3, 8, 121, 161)
Saved stacked array: npy_output/20240102T0000_stacked.npy | shape = (3, 8, 121, 161)
Saved stacked array: npy_output/20240103T0000_stacked.npy | shape = (3, 8, 121, 161)
Saved stacked array: npy_output/20240104T0000_stacked.npy | shape = (3, 8, 121, 161)
Saved stacked array: npy_output/20240105T0000_stacked.npy | shape = (3, 8, 121, 161)
Saved stacked array: npy_output/2024010

In [11]:
import re

# Build a table (description, path, date) of the stacked .npy files found in
# the output directory and write it to stacked_files_table.csv.
output_dir = "npy_output"

# Matches stacked files such as "20240101T0000_stacked.npy"; group 1 is YYYYMMDD
pattern = re.compile(r"(\d{8})T\d{4}_stacked\.npy$")

rows = []
# os.listdir returns entries in arbitrary order; sorting makes the table and
# the CSV reproducible (the timestamp-prefixed names sort chronologically).
for fname in sorted(os.listdir(output_dir)):
    if not fname.endswith("_stacked.npy"):
        continue
    match = pattern.search(fname)
    if match is None:
        continue  # Has the suffix but lacks the expected timestamp prefix

    date_str = match.group(1)  # YYYYMMDD
    rows.append({
        "description": "",  # blank for now
        "path": os.path.join(output_dir, fname),
        "date": pd.to_datetime(date_str, format='%Y%m%d').date(),  # date object
    })

# Explicit columns keep the header stable even when no files matched
df = pd.DataFrame(rows, columns=["description", "path", "date"])

# Persist the table alongside the notebook
df.to_csv("stacked_files_table.csv", index=False)

print(df)

    description                                  path        date
0                npy_output/20240101T0000_stacked.npy  2024-01-01
1                npy_output/20240102T0000_stacked.npy  2024-01-02
2                npy_output/20240103T0000_stacked.npy  2024-01-03
3                npy_output/20240104T0000_stacked.npy  2024-01-04
4                npy_output/20240105T0000_stacked.npy  2024-01-05
..          ...                                   ...         ...
105              npy_output/20240415T0000_stacked.npy  2024-04-15
106              npy_output/20240416T0000_stacked.npy  2024-04-16
107              npy_output/20240417T0000_stacked.npy  2024-04-17
108              npy_output/20240418T0000_stacked.npy  2024-04-18
109              npy_output/20240419T0000_stacked.npy  2024-04-19

[110 rows x 3 columns]
