In [1]:
import os
import tarfile
import gzip
import shutil
import pandas as pd
import numpy as np
import xarray as xr

In [2]:
def orca(var, yyyy, doy0, temp_dir):
    """
    Load an ORCA file from the temp directory.
    """
    file = os.path.join(temp_dir, f"{var}.{yyyy}{doy0}.hdf")
    if not os.path.exists(file):
        raise FileNotFoundError(f"File not found: {file}")

    data = xr.open_dataset(file, engine='netcdf4')
    data = data.rename({'fakeDim1': 'lon', 'fakeDim0': 'lat'})

    hycom_lon = np.linspace(-180, 180, 4320)
    hycom_lat = np.linspace(89.96, -89.96, 2160)

    data = data.assign_coords(lon=("lon", hycom_lon), lat=("lat", hycom_lat))
    data = data.reindex(lat=list(reversed(data.lat)))
    data = data.squeeze()
    data = data.where(data > -9999, np.nan)

    return data

In [3]:
def decompress_gz_files(folder):
    """
    Decompress all .gz files in the specified folder.
    """
    for file in os.listdir(folder):
        if file.endswith('.gz'):
            gz_path = os.path.join(folder, file)
            hdf_path = os.path.splitext(gz_path)[0]  # Remove .gz extension
            with gzip.open(gz_path, 'rb') as gz_file:
                with open(hdf_path, 'wb') as hdf_file:
                    shutil.copyfileobj(gz_file, hdf_file)
            os.remove(gz_path)  # Remove the .gz file after decompressing
            print(f"Decompressed: {gz_path} -> {hdf_path}")

def process_year_data(year, temp_dir, output_dir_base="cafe_inputs"):
    """
    Process data for a specific year from tar files and save to NetCDF.
    """
    ts = pd.date_range(f"{year}-01-01", f"{year}-12-31", freq="MS")
    os.makedirs(temp_dir, exist_ok=True)

    # Define variables and corresponding tar file patterns
    variable_tar_files = {
        "PAR": f"par.m.{year}.tar",
        "chl": f"chl.m.{year}.tar",
        "mld": f"mld.hycom_030.{year}.tar",
        "aph_443": f"aph.m.{year}.tar",
        "adg_443": f"adg.m.{year}.tar",
        "bbp_443": f"bbp.m.giop.{year}.tar",
        "bbp_s": f"bbp_s.m.{year}.tar",
        "sst": f"sst.m.{year}.tar",
    }

    # Extract all tar files into the temp directory
    for var, tar_file in variable_tar_files.items():
        tar_path = tar_file
        if not os.path.exists(tar_path):
            print(f"Tar file for {var} in {year} not found: {tar_path}")
            return
        
        with tarfile.open(tar_path, "r") as tar:
            tar.extractall(path=temp_dir)
        print(f"Extracted: {tar_path}")

    # Decompress .gz files in the temp directory
    decompress_gz_files(temp_dir)

    for date0 in ts:
        doy0 = date0.strftime('%j')
        try:
            datasets = {
                "PAR": orca("par", str(year), doy0, temp_dir).par.values,
                "chl": orca("chl", str(year), doy0, temp_dir).chl.values,
                "mld": orca("mld", str(year), doy0, temp_dir).mld.values,
                "aph_443": orca("aph", str(year), doy0, temp_dir).aph.values,
                "adg_443": orca("adg", str(year), doy0, temp_dir).adg.values,
                "bbp_443": orca("bbp", str(year), doy0, temp_dir).bbp.values,
                "bbp_s": orca("bbp_s", str(year), doy0, temp_dir).bbp_s.values,
                "sst": orca("sst", str(year), doy0, temp_dir).sst.values,
            }

            lat = orca("par", str(year), doy0, temp_dir).lat.values
            lon = orca("par", str(year), doy0, temp_dir).lon.values
            lat_grid, lon_grid = np.meshgrid(lat, lon, indexing="ij")

            ds = xr.Dataset(
                data_vars={
                    "PAR": (["lat", "lon"], datasets["PAR"]),
                    "chl": (["lat", "lon"], datasets["chl"]),
                    "mld": (["lat", "lon"], datasets["mld"]),
                    "latitude_grid": (["lat", "lon"], lat_grid),
                    "aph_443": (["lat", "lon"], datasets["aph_443"]),
                    "adg_443": (["lat", "lon"], datasets["adg_443"]),
                    "bbp_443": (["lat", "lon"], datasets["bbp_443"]),
                    "bbp_s": (["lat", "lon"], datasets["bbp_s"]),
                    "sst": (["lat", "lon"], datasets["sst"]),
                    "lats": (["lat"], lat),
                    "lons": (["lon"], lon),
                },
                coords={
                    "lon": ("lon", lon),
                    "lat": ("lat", lat),
                },
                attrs={"description": "CAFE model input."},
            )

            output_dir = os.path.join(output_dir_base, str(year))
            os.makedirs(output_dir, exist_ok=True)
            filename = f"{output_dir}/cafe_in_mld03_{date0.strftime('%Y%m')}.nc"
            ds.to_netcdf(filename)
            print(f"Saved: {filename}")

        except Exception as e:
            print(f"Error processing {date0}: {e}")

    # Clean up temporary directory
    for file in os.listdir(temp_dir):
        os.remove(os.path.join(temp_dir, file))

In [4]:
# Process years as needed
temp_dir= 'temp/'

process_year_data(2007, temp_dir)
process_year_data(2008, temp_dir)
process_year_data(2009, temp_dir)
process_year_data(2010, temp_dir)

Extracted: par.m.2007.tar
Extracted: chl.m.2007.tar
Extracted: mld.hycom_030.2007.tar
Extracted: aph.m.2007.tar
Extracted: adg.m.2007.tar
Extracted: bbp.m.giop.2007.tar
Extracted: bbp_s.m.2007.tar
Extracted: sst.m.2007.tar
Decompressed: temp/par.2007121.hdf.gz -> temp/par.2007121.hdf
Decompressed: temp/aph.2007152.hdf.gz -> temp/aph.2007152.hdf
Decompressed: temp/bbp.2007091.hdf.gz -> temp/bbp.2007091.hdf
Decompressed: temp/mld.2007213.hdf.gz -> temp/mld.2007213.hdf
Decompressed: temp/bbp_s.2007152.hdf.gz -> temp/bbp_s.2007152.hdf
Decompressed: temp/chl.2007152.hdf.gz -> temp/chl.2007152.hdf
Decompressed: temp/sst.2007213.hdf.gz -> temp/sst.2007213.hdf
Decompressed: temp/adg.2007060.hdf.gz -> temp/adg.2007060.hdf
Decompressed: temp/bbp.2007001.hdf.gz -> temp/bbp.2007001.hdf
Decompressed: temp/bbp_s.2007121.hdf.gz -> temp/bbp_s.2007121.hdf
Decompressed: temp/sst.2007244.hdf.gz -> temp/sst.2007244.hdf
Decompressed: temp/sst.2007335.hdf.gz -> temp/sst.2007335.hdf
Decompressed: temp/sst.20

Decompressed: temp/sst.2008001.hdf.gz -> temp/sst.2008001.hdf
Decompressed: temp/par.2008122.hdf.gz -> temp/par.2008122.hdf
Decompressed: temp/mld.2008183.hdf.gz -> temp/mld.2008183.hdf
Decompressed: temp/bbp.2008092.hdf.gz -> temp/bbp.2008092.hdf
Decompressed: temp/aph.2008032.hdf.gz -> temp/aph.2008032.hdf
Decompressed: temp/sst.2008214.hdf.gz -> temp/sst.2008214.hdf
Decompressed: temp/sst.2008306.hdf.gz -> temp/sst.2008306.hdf
Decompressed: temp/par.2008092.hdf.gz -> temp/par.2008092.hdf
Decompressed: temp/adg.2008032.hdf.gz -> temp/adg.2008032.hdf
Decompressed: temp/bbp.2008153.hdf.gz -> temp/bbp.2008153.hdf
Decompressed: temp/bbp.2008122.hdf.gz -> temp/bbp.2008122.hdf
Decompressed: temp/par.2008061.hdf.gz -> temp/par.2008061.hdf
Decompressed: temp/mld.2008214.hdf.gz -> temp/mld.2008214.hdf
Decompressed: temp/mld.2008306.hdf.gz -> temp/mld.2008306.hdf
Decompressed: temp/chl.2008092.hdf.gz -> temp/chl.2008092.hdf
Decompressed: temp/bbp_s.2008092.hdf.gz -> temp/bbp_s.2008092.hdf
Deco

Decompressed: temp/sst.2009091.hdf.gz -> temp/sst.2009091.hdf
Decompressed: temp/chl.2009032.hdf.gz -> temp/chl.2009032.hdf
Decompressed: temp/par.2009305.hdf.gz -> temp/par.2009305.hdf
Decompressed: temp/sst.2009001.hdf.gz -> temp/sst.2009001.hdf
Decompressed: temp/bbp.2009213.hdf.gz -> temp/bbp.2009213.hdf
Decompressed: temp/mld.2009091.hdf.gz -> temp/mld.2009091.hdf
Decompressed: temp/aph.2009032.hdf.gz -> temp/aph.2009032.hdf
Decompressed: temp/adg.2009244.hdf.gz -> temp/adg.2009244.hdf
Decompressed: temp/adg.2009182.hdf.gz -> temp/adg.2009182.hdf
Decompressed: temp/adg.2009335.hdf.gz -> temp/adg.2009335.hdf
Decompressed: temp/aph.2009001.hdf.gz -> temp/aph.2009001.hdf
Decompressed: temp/chl.2009091.hdf.gz -> temp/chl.2009091.hdf
Decompressed: temp/sst.2009032.hdf.gz -> temp/sst.2009032.hdf
Decompressed: temp/bbp_s.2009091.hdf.gz -> temp/bbp_s.2009091.hdf
Decompressed: temp/par.2009060.hdf.gz -> temp/par.2009060.hdf
Decompressed: temp/adg.2009121.hdf.gz -> temp/adg.2009121.hdf
Deco

Decompressed: temp/par.2010001.hdf.gz -> temp/par.2010001.hdf
Decompressed: temp/mld.2010274.hdf.gz -> temp/mld.2010274.hdf
Decompressed: temp/bbp.2010121.hdf.gz -> temp/bbp.2010121.hdf
Decompressed: temp/adg.2010152.hdf.gz -> temp/adg.2010152.hdf
Decompressed: temp/par.2010091.hdf.gz -> temp/par.2010091.hdf
Decompressed: temp/sst.2010305.hdf.gz -> temp/sst.2010305.hdf
Decompressed: temp/bbp_s.2010060.hdf.gz -> temp/bbp_s.2010060.hdf
Decompressed: temp/chl.2010060.hdf.gz -> temp/chl.2010060.hdf
Decompressed: temp/sst.2010274.hdf.gz -> temp/sst.2010274.hdf
Decompressed: temp/bbp.2010091.hdf.gz -> temp/bbp.2010091.hdf
Decompressed: temp/mld.2010213.hdf.gz -> temp/mld.2010213.hdf
Decompressed: temp/aph.2010152.hdf.gz -> temp/aph.2010152.hdf
Decompressed: temp/par.2010121.hdf.gz -> temp/par.2010121.hdf
Decompressed: temp/sst.2010213.hdf.gz -> temp/sst.2010213.hdf
Decompressed: temp/adg.2010060.hdf.gz -> temp/adg.2010060.hdf
Decompressed: temp/bbp.2010001.hdf.gz -> temp/bbp.2010001.hdf
Deco