In [1]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
import xarray as xr
import rioxarray
import dask
import multiprocessing
from dask.diagnostics import ProgressBar

# Size dask's thread pool to the CPU count reported for this machine and make
# the threaded scheduler the default for every later compute() call.
max_workers = multiprocessing.cpu_count()
dask.config.set({"scheduler": "threads", "num_workers": max_workers})

<dask.config.set at 0x73d5d11f6d60>

In [None]:
# The site coordinate table is stored as two batch files; read both and stack
# them into one frame with a fresh 0..n-1 index.
batch_paths = (
    "../data/merged_coords_batch1.csv",
    "../data/merged_coords_batch2.csv",
)
merged_coords_1, merged_coords_2 = map(pd.read_csv, batch_paths)
merged_coords = pd.concat([merged_coords_1, merged_coords_2], ignore_index=True)

In [None]:
# Select the first site in the coordinate table and collect its GeoTIFF paths
# together with the date encoded in each file name.
name = merged_coords["name"][0]
lat = merged_coords["Lat"][0]
lon = merged_coords["Lon"][0]
base_dir = "/home/hamid/mnt/nas/Hamid/GLASS/EC_SITES/"
years = np.arange(2002, 2022)  # 2002 .. 2021 inclusive
expected_files_per_year = 46  # per-year file count the original TODO asked to verify
fnames = []
dates = []


def _date_from_fname(path):
    """Return the date encoded in a GeoTIFF file name.

    The third dot-separated field of the base name, minus its first
    character, is parsed as four-digit year followed by day-of-year
    (``%Y%j``).

    Raises:
        IndexError: the base name has fewer than three dot-separated fields.
        ValueError: the field does not parse as ``%Y%j``.
    """
    date_part = os.path.basename(path).split(".")[2][1:]
    return datetime.strptime(date_part, "%Y%j").date()


for year in years:
    dir_path = os.path.join(base_dir, name, str(year))
    # glob returns matches in arbitrary order; sort by the encoded date so
    # fnames/dates (and the time axis built from them) are chronological.
    tif_files = sorted(
        glob.glob(os.path.join(dir_path, "*.tif")), key=_date_from_fname
    )
    # Report incomplete (or over-full) years instead of silently accepting them.
    if len(tif_files) != expected_files_per_year:
        print(
            f"Warning: {dir_path} contains {len(tif_files)} .tif files, "
            f"expected {expected_files_per_year}"
        )
    for file in tif_files:
        fnames.append(file)
        dates.append(_date_from_fname(file))

In [None]:
# Directory that receives the per-site NetCDF files written further down.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
output_dir = "/home/hamid/mnt/nas/Hamid/GLASS/xr_files/"

In [None]:
# Function to process each file
def process_file(fname, date):
    """Open one raster as a dask-chunked DataArray with a length-1 time axis.

    Args:
        fname: Path of the raster file handed to ``rioxarray.open_rasterio``.
        date: Date to attach as the single ``time`` coordinate value; anything
            ``pd.Timestamp`` accepts (the callers pass ``datetime.date``).

    Returns:
        The raster with the ``band`` axis removed and a leading ``time``
        dimension of length one.

    Raises:
        ValueError: if the ``band`` dimension has more than one entry.
    """
    da = rioxarray.open_rasterio(fname, chunks={"x": 1000, "y": 1000})
    # Remove only the singleton band axis together with its coordinate. A bare
    # squeeze() would also collapse x or y for a one-pixel-wide raster, and
    # DataArray.drop() is deprecated.
    da = da.squeeze("band", drop=True)
    # Store the date as a pandas Timestamp so the time coordinate is
    # datetime64 rather than an object array of datetime.date values.
    da = da.expand_dims(time=[pd.Timestamp(date)])
    return da


# Pair every file with its date and order the pairs chronologically: fnames
# comes from glob, whose ordering is arbitrary, and xr.concat keeps whatever
# order it is given.
ordered_pairs = sorted(zip(dates, fnames))

# Create a list of delayed objects, one per file, so the files are opened in
# parallel by the dask scheduler.
delayed_arrays = [
    dask.delayed(process_file)(fname, date) for date, fname in ordered_pairs
]

# dask.compute returns a tuple holding one DataArray per file.
data_arrays = dask.compute(*delayed_arrays)
combined_da = xr.concat(data_arrays, dim="time")

# Record which site this stack belongs to.
combined_da.attrs["site_name"] = name
combined_da.attrs["latitude"] = lat
combined_da.attrs["longitude"] = lon

ds = combined_da.to_dataset(name="glass_lai")
chunks = {"time": 1, "y": 1000, "x": 1000}  # Adjust chunk sizes as needed
ds = ds.chunk(chunks)

In [2]:
# NOTE(review): scratch arithmetic left over from development; presumably
# 920 files (46 per year x 20 years) converted to minutes at ~1 s per file
# -- confirm the intent or delete this cell.
920 / 60

15.333333333333334

In [None]:
# Make sure the time coordinate holds pandas datetimes (the dates collected
# from the file names are datetime.date objects).
ds["time"] = pd.to_datetime(ds["time"])

In [None]:
# Destination: one NetCDF file per site inside output_dir.
output_file = os.path.join(output_dir, f"{name}_GLASS_LAI_2002_2021.nc")

# zlib-compress the data variable and pin the on-disk time units.
netcdf_encoding = {
    "glass_lai": {"zlib": True, "complevel": 5},
    "time": {"units": "days since 1970-01-01"},
}

# compute=False hands back a delayed write instead of writing immediately, so
# the actual write can run under a progress bar below.
write_job = ds.to_netcdf(
    output_file, engine="netcdf4", encoding=netcdf_encoding, compute=False
)

# Execute the write operation with a progress bar
print("Writing to NetCDF file:")
with ProgressBar():
    write_job.compute()

In [None]:
# NOTE(review): this writes the same `ds` to the same `output_file` as the
# progress-bar cell above, but immediately and without the explicit time
# units; running both cells writes the file twice.
eager_encoding = {"glass_lai": {"zlib": True, "complevel": 5}}
ds.to_netcdf(output_file, engine="netcdf4", encoding=eager_encoding, compute=True)

In [None]:
# Serial variant of the stacking step: each raster is opened without dask
# chunks, given a time axis, and the results are concatenated along time.
data_arrays = []

# fnames comes from glob, whose ordering is arbitrary; iterate in date order so
# the concatenated time axis is chronological.
for date, fname in sorted(zip(dates, fnames)):
    da = rioxarray.open_rasterio(fname)
    # Remove only the singleton band axis and its coordinate. A bare squeeze()
    # would also collapse a size-1 x or y axis, and DataArray.drop() is
    # deprecated.
    da = da.squeeze("band", drop=True)
    # Timestamp (not datetime.date) keeps the time coordinate datetime64.
    da = da.expand_dims(time=[pd.Timestamp(date)])
    data_arrays.append(da)
combined_da = xr.concat(data_arrays, dim="time")

# Attach evenly spaced lon/lat coordinates spanning +/-0.5 around the site
# position along x and y.
# NOTE(review): this assumes the raster covers a 1-degree box centred on the
# site and that y runs from low to high latitude -- confirm against the
# rasters' own x/y coordinates.
combined_da = combined_da.assign_coords(
    lon=("x", np.linspace(lon - 0.5, lon + 0.5, combined_da.sizes["x"])),
    lat=("y", np.linspace(lat - 0.5, lat + 0.5, combined_da.sizes["y"])),
)

# Record which site this stack belongs to.
combined_da.attrs["site_name"] = name
combined_da.attrs["latitude"] = lat
combined_da.attrs["longitude"] = lon
ds = combined_da.to_dataset(name="glass_lai")