# Compile ET Monthly Data Sets

Below are the Python codes needed to download and process six ET datasets with monthly time steps. On my Dell Precision 3570 with a 12th Gen Intel i7-1255U 1.70 GHz processor and a 250Mb/s internet connection, each of these datasets takes less than 15 minutes to download and process, except for the WBET data set. This data set can take ~24 hours to fully process due to the high resolution and large file sizes. While the other datasets use at most 2GB of disk space each, the WBET data set can use ~100GB at peak usage and ~50GB for the final processed file. This can be reduced if the full date range is not utilized.

> NOTE: The links and file names on the web services are accurate as of 10/04/2024. It is possible that the location of the data has been updated, the web service has changed the path to the data set, or the files have been removed since this date. If you try to run these codes and they fail, it is recommended to follow the shared links and confirm the URL and download methods have not changed.

## SSEBop MODIS Data Set

The SSEBop data can be accessed from the [USGS Early Warning and Environmental Monitoring Program](https://earlywarning.usgs.gov/ssebop) (via the [ftp site](https://edcintl.cr.usgs.gov/downloads/sciweb1/shared/uswem/web/conus/eta/modis_eta/monthly/downloads/)). The monthly data is stored in individual GeoTIFF files that are then zipped for compression. We retrieve the zip files with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/#). We chose to retrieve all of the monthly data for the available years of 2000-2022. The preprocessing steps included extracting the monthly GeoTIFFs from the zip files and combining them all into a single NetCDF file. This process had to be done separately for dates before and after July 2016, as dates after this have fewer lat/lon index values than those before. To have a unified data set, these date groups were processed by aligning and only keeping the lat/lon range common to both date groups. Additionally, after July 2016, bodies of water have a fill value of 0 rather than NaN. We replace the 0 fill with NaN using a mask from before July 2016, as 0s can be present on land surfaces in the winter.

In [None]:
# SSEBop Dataset
import fsspec
import rioxarray
import xarray as xr
import pandas as pd
import zipfile
import io
import os
import numpy as np

fs = fsspec.filesystem("https", timeout=3600)
url = "https://edcintl.cr.usgs.gov/downloads/sciweb1/shared/uswem/web/conus/eta/modis_eta/monthly/downloads/"

# Monthly date range to use
dates = pd.date_range("2000-01-01", "2022-12-31", freq="MS")

# Working directory that holds the downloaded zips, the extracted GeoTIFFs,
# and the intermediate monthly NetCDF files
os.makedirs("../Data/ssebop", exist_ok=True)

# For every month: download the zip, extract its GeoTIFF, rewrite it as a
# single-time-step NetCDF file, and delete the temporary files right away so
# only the monthly NetCDF files accumulate on disk.
for date in dates:
    stamp = date.strftime("%Y%m")
    zip_path = "../Data/ssebop/m" + stamp + ".zip"
    geotiff_file = "m" + stamp + ".modisSSEBopETactual.tif"

    # Download the zipfiles
    # (Do this individually vs all files in a list as server may disconnect
    # downloading for list)
    fs.get(url + "m" + stamp + ".zip", "../Data/ssebop/")

    # Extract only the GeoTIFF. The context manager closes the archive handle
    # before the zip is deleted; removing a still-open file can fail on
    # some platforms (e.g. Windows).
    with zipfile.ZipFile(zip_path) as zfile:
        zfile.extractall("../Data/ssebop/", [geotiff_file])

    # Delete downloaded zipfile as we no longer need it
    os.remove(zip_path)

    # Read in the GeoTIFF lazily and release the file handle once the monthly
    # NetCDF has been written, so the GeoTIFF can be removed afterwards.
    with rioxarray.open_rasterio(
        f"../Data/ssebop/{geotiff_file}",
        chunks={},
        band_as_variable=True,
    ) as raster:
        # Rename coords to corresponding names. Assign the date to the Dataset
        ds_monthly = (
            raster.rename({"x": "lon", "y": "lat", "band_1": "aet"})
            .assign_coords(time=date)
            .expand_dims(dim="time")
        )

        # Save monthly compiled dataset
        ds_monthly.to_netcdf(
            path="../Data/ssebop/"
            + date.strftime("%Y-%m")
            + ".modisSSEBopETactual.nc",
            engine="netcdf4",
        )

    # Delete extracted geotiff as we no longer need it
    os.remove(f"../Data/ssebop/{geotiff_file}")

# Combine the monthly NetCDF files into one Dataset.
# Files after 2016-07 have fewer lat/lon index values than the earlier ones,
# so the two date groups are opened separately, the earlier group is cut down
# to the later group's grid, and both are then concatenated along time.
# After 2016-07, water bodies are filled with 0 instead of NaN; those cells are
# masked with the NaN pattern of the first (pre-2016-08) time step, because a
# genuine 0 can also occur on land in winter.
# NOTE(review): the download loop covers 2000-01 onward, but this range starts
# at 2001-01, so the year 2000 files are never merged (and are deleted later).
# Confirm whether skipping 2000 is intentional before changing it; the first
# time step of this range is also what defines the water mask below.
pre_201608 = pd.date_range("2001-01-01", "2016-7-31", freq="MS")
early_files = [
    f"../Data/ssebop/{stamp}.modisSSEBopETactual.nc"
    for stamp in pre_201608.strftime("%Y-%m")
]
ds1 = xr.open_mfdataset(early_files, engine="netcdf4")

post_201608 = pd.date_range("2016-08-01", "2022-12-31", freq="MS")
late_files = [
    f"../Data/ssebop/{stamp}.modisSSEBopETactual.nc"
    for stamp in post_201608.strftime("%Y-%m")
]
ds2 = xr.open_mfdataset(late_files, engine="netcdf4")

# Pick, from the earlier group, the cells nearest to the later group's lat/lon
# values (they differ only by floating point rounding), then force both groups
# onto identical lat/lon indexes while leaving time alone.
ds1 = ds1.sel(
    lon=ds2["lon"],
    lat=ds2["lat"],
    method="nearest",
    tolerance=1e-10,
)
ds1, ds2 = xr.align(ds1, ds2, join="override", exclude="time")

# Stack along time, drop the spatial_ref variable, and store as float32
ds = (
    xr.concat([ds1, ds2], dim="time")
    .drop_vars("spatial_ref")
    .astype("float32")
)

# Keep values only where the first time step is not NaN; everything else
# (including the 0-filled water cells of later months) becomes NaN
first_step_valid = ds["aet"].isel(time=0).notnull()
ds = ds.where(first_step_valid)

# Describe the evaporation variable
ds["aet"].attrs.update(
    {
        "description": "Actual evaporation from SSEBop MODIS, monthly total",
        "dimensions": "lon lat time",
        "standard_name": "Actual evaporation",
        "long_name": "Actual evaporation",
        "units": "mm.month-1",
    }
)

# Describe the spatial coordinates
ds["lat"].attrs.update(
    {
        "units": "degrees_north",
        "description": "Latitude of the center of the grid cell",
        "long_name": "latitude",
        "standard_name": "latitude",
        "axis": "Y",
    }
)
ds["lon"].attrs.update(
    {
        "units": "degrees_east",
        "description": "Longitude of the center of the grid cell",
        "long_name": "longitude",
        "standard_name": "longitude",
        "axis": "X",
    }
)

# Describe the time coordinate (the attribute key here is "unit", not "units")
ds["time"].attrs.update(
    {
        "long_name": "time",
        "standard_name": "time",
        "description": (
            "Monthly time step indicated by the first day of the month."
        ),
        "unit": "month",
        "axis": "T",
    }
)

# Chunk the output one time step at a time. netCDF chunk sizes have to be
# given in the order of the variable's own dimensions, so they are built from
# ds["aet"].dims instead of relying on the ordering of the coordinates.
chunksizes = [
    1 if dim == "time" else ds.sizes[dim] for dim in ds["aet"].dims
]

# Save full Dataset
ds.to_netcdf(
    path="../Data/ssebop/ssebop_aet.nc",
    format="NETCDF4",
    engine="netcdf4",
    # Chunk file and compress as is its pretty big uncompressed (~20 GB)
    encoding={"aet": {"zlib": True, "complevel": 4, 'chunksizes': chunksizes}},
)

# Remove intermediate monthly files
for date in dates.strftime("%Y-%m"):
    os.remove("../Data/ssebop/" + date + ".modisSSEBopETactual.nc")

# Reopen the compiled file lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/ssebop/ssebop_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds

## GLEAM Data Set

The GLEAM v3.7b data can be accessed from the [GLEAM website](https://www.gleam.eu/) by signing up for login access to their SFTP site. The monthly data is stored in a single NetCDF file which we retrieve with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/#). We chose to retrieve all of the monthly data for the available years of 2003-2022. The preprocessing steps included (1) limiting the latitude and longitude to CONUS range as the file holds global data (i.e., limit latitudes from $24^\circ$ to $53^\circ$ and longitudes from $-126^\circ$ to $-66^\circ$); and (2) shifting the month date to be the first of the month vs the end.

> NOTE: You will need a [GLEAM Login](https://www.gleam.eu/#downloads) to access this data. Once you have a login, you can use it to access the SFTP site. This is done by setting your username, password, and port as the OS environment variables (e.g., in your `~/.bashrc` file) of `GLEAM_USERNAME`, `GLEAM_PASSWORD`, and `GLEAM_PORT`, respectively.
>
> Additionally, since the time of gathering this data set, the GLEAM model has been updated to v4.1. As stated on the GLEAM webpage, "datasets are typically updated and extended once a year and are generally released around April. When a new version of a dataset is released, the older version becomes obsolete and is removed from the server. However, previous versions are still available upon request." Therefore, the v3.7b data is not accessible anymore on the GLEAM server and will have to be requested. Also, if one wants to use the newer 4.1b data set (currently unavailable as of 10/04/2024) the `filepath` below can be updated to the new version as it comes out.

In [None]:
# GLEAM v3.7b Dataset
import fsspec
import xarray as xr
import os
import pandas as pd

sftp_host = "sftp://hydras.ugent.be"

# SFTP credentials and port are read from OS environment variables
gleam_creds_sftp = {
    "username": os.environ["GLEAM_USERNAME"],
    "password": os.environ["GLEAM_PASSWORD"],
    "port": int(os.environ["GLEAM_PORT"]),
}

os.makedirs("../Data/gleam", exist_ok=True)

# Connect to the SFTP server; the host argument must not carry the
# "sftp://" prefix, so it is sliced off
bare_host = sftp_host[len("sftp://"):]
fs = fsspec.filesystem("sftp", host=bare_host, **gleam_creds_sftp)

# Download the single NetCDF file holding all monthly GLEAM data
filepath = "/data/v3.7b/monthly/E_2003-2022_GLEAM_v3.7b_MO.nc"
path = sftp_host + filepath
fs.get(path, "../Data/gleam/")

# Open the downloaded file lazily
local_copy = f"../Data/gleam/{os.path.basename(filepath)}"
ds = xr.open_dataset(local_copy, engine="netcdf4", chunks={})

# Subset to the CONUS box (the latitude slice runs from 53 down to 24)
ds = ds.sel(lat=slice(53, 24), lon=slice(-126, -66))

# Relabel the time steps with the first day of the month, for consistency
# with the other datasets: shift the time index back to a month start, then
# reindex with backfill so each new label takes the next original time step.
month_starts = ds.get_index("time").shift(periods=-1, freq="MS")
ds = ds.reindex({"time": month_starts}, method="backfill")

# Give the evaporation variable the common name used by the other datasets
# and describe it
ds = ds.rename({"E": "aet"})
ds["aet"].attrs.update(
    {
        "description": "Actual total evaporation from GLEAM 3.7b",
        "long_name": "Actual evaporation",
        "dimensions": "lon lat time",
    }
)

# Describe the spatial coordinates
ds["lat"].attrs.update(
    {
        "units": "degrees_north",
        "description": "Latitude of the center of the grid cell",
        "long_name": "latitude",
        "standard_name": "latitude",
        "axis": "Y",
    }
)
ds["lon"].attrs.update(
    {
        "units": "degrees_east",
        "description": "Longitude of the center of the grid cell",
        "long_name": "longitude",
        "standard_name": "longitude",
        "axis": "X",
    }
)

# Describe the time coordinate (the attribute key here is "unit", not "units")
ds["time"].attrs.update(
    {
        "long_name": "time",
        "standard_name": "time",
        "unit": "month",
        "description": (
            "Monthly time step indicated by the first day of the month."
        ),
        "axis": "T",
    }
)

# Write the processed data as a single chunk. The chunk sizes are taken from
# the variable's own shape so they are guaranteed to be in the order of its
# dimensions, which is what the netCDF encoding requires.
ds.to_netcdf(
    path="../Data/gleam/gleam_aet.nc",
    format="NETCDF4",
    engine="netcdf4",
    # Ensure no chunking as file is very small (~20 MB)
    encoding={"aet": {'chunksizes': list(ds["aet"].shape)}}
)

# Remove downloaded file to reduce storage, as the data is now in the new netcdf
os.remove("../Data/gleam/E_2003-2022_GLEAM_v3.7b_MO.nc")

# Reopen the processed file lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/gleam/gleam_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds

## ERA-5 Data Set

The ERA-5 data can be found at the [Climate Data Store](https://cds.climate.copernicus.eu/) (CDS). The monthly data is stored in a cloud hosted format and needs to be retrieved using the [CDS API](https://cds.climate.copernicus.eu/how-to-api). We chose to retrieve the mean monthly ET data for the years spanning 1950-2022. The preprocessing steps included (1) converting the data to positive values as negatives are used to indicate the inverse of precipitation; (2) setting any values that are still negative after the sign change to zero; and (3) converting units from meters per day to millimeters per month.

> NOTE: You will need a CDS account to access this data. Once you have an account, make sure to [configure `cdsapi`](https://github.com/ecmwf/cdsapi#configure) for the download to work.

In [None]:
# ERA-5 Dataset
import xarray as xr
import cdsapi
import os
import zipfile

c = cdsapi.Client()

os.makedirs("../Data/era5", exist_ok=True)

# This command is generated from Climate Data Store
# (https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land-monthly-means?tab=download)
# The year and month lists are built programmatically; they expand to the
# strings "1950".."2022" and "01".."12".
c.retrieve(
    "reanalysis-era5-land-monthly-means",
    {
        "product_type": "monthly_averaged_reanalysis",
        "variable": "total_evaporation",
        "year": [str(year) for year in range(1950, 2023)],
        "month": [f"{month:02d}" for month in range(1, 13)],
        "time": "00:00",
        "data_format": "netcdf",
        # CONUS bounding box: latitudes 53 to 24, longitudes -126 to -66
        "area": [
            53,
            -126,
            24,
            -66,
        ],
    },
    "../Data/era5/era5.zip",
)

# Extract the two NetCDF files from the downloaded archive. The context
# manager closes the archive handle so the zip can be deleted later.
with zipfile.ZipFile("../Data/era5/era5.zip") as zfile:
    zfile.extractall("../Data/era5/", ['data_0.nc', 'data_1.nc'])

# Open both extracted files lazily as one Dataset
ds = xr.open_mfdataset(
    "../Data/era5/data_*.nc",
    engine="netcdf4",
    chunks={},
)

# Switch to the common coordinate/variable names and drop the variables
# that are not needed
ds = ds.rename(
    {
        "valid_time": "time",
        "longitude": "lon",
        "latitude": "lat",
        "e": "aet",
    }
).drop_vars(['number', 'expver'])

# Data values are negative to indicate inverse of precipitation (see docs) and
# in meters. Flip the sign and convert to mm in one step.
ds = ds * -1e3

# Anything that is negative after the sign flip is set to 0. The mask is
# written as "not (value < 0)" so that NaN cells are kept as NaN rather than
# being replaced by 0.
not_negative = ~(ds < 0)
ds = ds.where(not_negative, 0)

# From the documentation, the monthly means have units that include "per day".
# Multiply each month by its number of days to get "per month". The day counts
# are shaped (n_time, 1, 1) so they broadcast against the data array.
# NOTE(review): this assumes the data dims are ordered (time, lat, lon) —
# confirm against the downloaded files.
month_lengths = ds.get_index("time").days_in_month.values.reshape(-1, 1, 1)
ds = ds * month_lengths

# Describe the evaporation variable
ds["aet"].attrs.update(
    {
        "units": "mm.month-1",
        "description": (
            "Accumulated amount of water that has evaporated from the Earth's surface, "
            "including a simplified representation of transpiration (from vegetation), "
            "into vapour in the air above."
        ),
        "long_name": "Total Evaporation",
        "dimensions": "lon lat time",
    }
)

# Describe the spatial coordinates
ds["lat"].attrs.update(
    {
        "description": "Latitude of the center of the grid cell",
        "standard_name": "latitude",
        "axis": "Y",
    }
)
ds["lon"].attrs.update(
    {
        "description": "Longitude of the center of the grid cell",
        "standard_name": "longitude",
        "axis": "X",
    }
)

# Describe the time coordinate (the attribute key here is "unit", not "units")
ds["time"].attrs.update(
    {
        "standard_name": "time",
        "unit": "month",
        "axis": "T",
        "description": (
            "Monthly time step indicated by the first day of the month."
        ),
    }
)

# Convert to float32 (we do not need 64 bit precision) and save.
ds = ds.astype("float32")

# Write the data as a single chunk. The chunk sizes are taken from the
# variable's own shape so they are guaranteed to be in the order of its
# dimensions, which is what the netCDF encoding requires.
ds.to_netcdf(
    path="../Data/era5/era5_aet.nc",
    engine="netcdf4",
    format="NETCDF4",
    # Ensure no chunking as file is small (~0.5 GB)
    encoding={"aet": {'chunksizes': list(ds["aet"].shape)}}
)

# Remove the download and the extracted files, as the data is now in the
# processed netcdf
os.remove("../Data/era5/era5.zip")
os.remove("../Data/era5/data_0.nc")
os.remove("../Data/era5/data_1.nc")

# Reopen the processed file lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/era5/era5_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds

## NLDAS Data Set

The NLDAS data can be found at the [Goddard Earth Sciences Data and Information Services Center](https://daac.gsfc.nasa.gov/) (GESDISC). The monthly data is stored in individual NetCDF files which we retrieve with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/#). We chose to retrieve all of the monthly data for the available years of 1979-2022. The preprocessing steps included (1) extracting the ET data from the collection of variables; (2) dropping January of 1979 as it starts on the 2nd of the month vs the 1st like all other months; and (3) converting units from kilograms per meter squared per month to millimeters per month (really just a name change assuming a water density of 1 g.cm-3).

> NOTE: You will need an [EarthData Login](https://wiki.earthdata.nasa.gov/display/EL/How+To+Register+For+an+EarthData+Login+Profile) to access this data. Once you have a login, make sure to [link the login to the NASA GESDISC Data Archive](https://disc.gsfc.nasa.gov/earthdata-login) for the download to work. You will then need to set your username and password as the OS environment variables (e.g., in your `~/.bashrc` file) of `NASA_EARTHDATA_USERNAME` and `NASA_EARTHDATA_PASSWORD`.


In [None]:
# NLDAS Dataset
import fsspec
import xarray as xr
import aiohttp
import os

os.makedirs("../Data/nldas", exist_ok=True)

# Downloading requires an Earthdata account.
# Username and password are read from OS environment variables
# (NASA_EARTHDATA_USERNAME and NASA_EARTHDATA_PASSWORD)
fs = fsspec.filesystem(
    "https",
    timeout=3600,
    client_kwargs={
        "auth": aiohttp.BasicAuth(
            os.environ["NASA_EARTHDATA_USERNAME"],
            password=os.environ["NASA_EARTHDATA_PASSWORD"],
        )
    },
)
base_url = (
    "https://data.gesdisc.earthdata.nasa.gov/data/NLDAS/NLDAS_NOAH0125_M.2.0/"
)

months = range(1, 13)
years = range(1979, 2023)

# Build the remote URLs and the matching local file paths once, in
# chronological order, so the download and the open step cannot drift apart
paths = []
local_files = []
for year in years:
    for month in months:
        filename = f"NLDAS_NOAH0125_M.A{year}" + str(month).zfill(2) + ".020.nc"
        # Remote files are grouped in one directory per year
        paths.append(base_url + f"{year}/" + filename)
        local_files.append("../Data/nldas/" + filename)

fs.get(paths, "../Data/nldas/")

# Open the first file only to list the variables to drop (everything except
# "Evap"); the context manager closes it again right after
with xr.open_dataset(local_files[0], engine="netcdf4", chunks={}) as first:
    drop_vars = [var for var in first.data_vars if var != "Evap"]

# Open all files and combine. Use one chunk as file is only 200MB total
ds = xr.open_mfdataset(
    local_files,
    drop_variables=drop_vars,
    engine="netcdf4",
    chunks={"lat": -1, "lon": -1, "time": -1},
)

# Drop January of 1979 (the first time step) as it starts on the 2nd.
# See NLDAS docs for details.
first_stamp = ds.time[0]
ds = ds.where(ds.time != first_stamp, drop=True)

# Use the common variable name shared with the other datasets
ds = ds.rename({"Evap": "aet"})

# Units are in kg.m-2.month-1, which is equivalent to mm.month-1 assuming a water
# density of 1g.cm-3 (mm = kg.m-2 / g.cm-3 * 1e3g.kg-1 * 1e-6m3.cm-3 * 1e3mm.m-1)
ds["aet"].attrs.update(
    {
        "units": "mm.month-1",
        "description": "Actual Total Evapotranspiration",
        "dimensions": "lon lat time",
    }
)

# Describe the spatial coordinates
ds["lat"].attrs.update(
    {
        "description": "Latitude of the center of the grid cell",
        "axis": "Y",
    }
)
ds["lon"].attrs.update(
    {
        "description": "Longitude of the center of the grid cell",
        "axis": "X",
    }
)

# Describe the time coordinate (the attribute key here is "unit", not "units")
ds["time"].attrs.update(
    {
        "standard_name": "time",
        "unit": "month",
        "axis": "T",
        "description": (
            "Monthly time step indicated by the first day of the month."
        ),
    }
)

# Remove the per-file time attributes carried over from the source files
for stale_key in ("begin_date", "begin_time", "end_date", "end_time", "bounds"):
    del ds["time"].attrs[stale_key]

# Save dataset to netcdf as a single chunk. The chunk sizes are taken from the
# variable's own shape so they are guaranteed to be in the order of its
# dimensions, which is what the netCDF encoding requires.
ds.to_netcdf(
    path="../Data/nldas/nldas_aet.nc",
    format="NETCDF4",
    engine="netcdf4",
    # Ensure no chunking as file is small (~0.2 GB)
    encoding={"aet": {'chunksizes': list(ds["aet"].shape)}}
)

# Remove downloaded files to reduce storage, as the data is now in the combined netcdf
for year in years:
    for month in months:
        os.remove(
            "../Data/nldas/"
            + f"NLDAS_NOAH0125_M.A{year}"
            + str(month).zfill(2)
            + ".020.nc"
        )

# Reopen the combined file lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/nldas/nldas_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds

## TerraClimate Data Set

The TerraClimate data can be found at [northwestknowledge.net](https://climate.northwestknowledge.net/) via the [Climatology Lab](https://www.climatologylab.org/terraclimate.html). The monthly data is stored in individual NetCDF files for each year spanning 1958-2022, which we retrieve with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/#). The preprocessing steps included limiting the latitude and longitude to CONUS range as the files hold global data (i.e., limit latitudes from $24^\circ$ to $53^\circ$ and longitudes from $-126^\circ$ to $-66^\circ$). No other preprocessing is needed as the units are already in millimeters per month.

In [None]:
# TerraClimate Dataset
import fsspec
import xarray as xr
import os

os.makedirs("../Data/terraclimate", exist_ok=True)

# HTTPS access to the TerraClimate file server
fs = fsspec.filesystem("https", timeout=3600)
url = "https://climate.northwestknowledge.net/TERRACLIMATE-DATA/"

years = range(1958, 2023)

# One remote NetCDF file per year; all of them are requested in a single
# fs.get call with the full list of URLs
paths = [url + f"TerraClimate_aet_{year}.nc" for year in years]
fs.get(paths, "../Data/terraclimate/")

# Open the yearly files lazily and combine them, chunked 12 time steps at a
# time with the full lat/lon extent in each chunk
yearly_files = [
    "../Data/terraclimate/" + f"TerraClimate_aet_{year}.nc" for year in years
]
ds = xr.open_mfdataset(
    yearly_files,
    engine="netcdf4",
    chunks={'lat': -1, 'lon': -1, 'time': 12},
)

# Subset to the CONUS box (the latitude slice runs from 53 down to 24)
ds = ds.sel(lat=slice(53, 24))
ds = ds.sel(lon=slice(-126, -66))

# Remove crs index
ds = ds.drop_vars("crs")

# Replace unicode characters in summary (degree symbol): the two characters
# at positions 64-65 of the summary are replaced, wherever they occur, by
# " deg".
# NOTE(review): this relies on the exact layout of the upstream summary text;
# if the summary is shorter than 65 characters the slice is empty and
# str.replace would insert " deg" between every character — verify against
# the downloaded files.
summary = ds.attrs["summary"]
ds.attrs["summary"] = summary.replace(summary[64:66], " deg")

# Update aet units to include time span
ds["aet"].attrs.update(
    {
        "units": "mm.month-1",
        "long_name": "Total Actual Evapotranspiration",
    }
)

# Describe the time coordinate (the attribute key here is "unit", not "units")
ds["time"].attrs.update(
    {
        "unit": "month",
        "description": (
            "Monthly time step indicated by the first day of the month."
        ),
    }
)

# Convert dtype to float32 as we do not need 64 bit precision, and it reduces
# the size by half
ds = ds.astype("float32")

# Chunk the output one time step at a time. netCDF chunk sizes have to be
# given in the order of the variable's own dimensions, so they are built from
# ds["aet"].dims instead of relying on the ordering of the coordinates.
chunksizes = [
    1 if dim == "time" else ds.sizes[dim] for dim in ds["aet"].dims
]

# Save xarray dataset to netcdf
_ = ds.to_netcdf(
    path="../Data/terraclimate/terraclimate_aet.nc",
    engine="netcdf4",
    format="NETCDF4",
    # Chunk file and compress as is its pretty big uncompressed (~3 GB)
    encoding={"aet": {"zlib": True, "complevel": 4, 'chunksizes': chunksizes}},
)

# Remove downloaded files to reduce storage, as the data is now in the combined netcdf
for year in years:
    os.remove("../Data/terraclimate/" + f"TerraClimate_aet_{year}.nc")

# Reopen the combined file lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/terraclimate/terraclimate_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds

## WBET (Reitz et al. 2023) Data Set

The WBET data can be accessed from [ScienceBase](https://www.sciencebase.gov/catalog/item/64135576d34eb496d1ce3d2e). The monthly data is stored in individual GeoTIFF files that are then zipped within decadal directories spanning years 1895-2018. We retrieve the zip files using [sciencebasepy](https://github.com/DOI-USGS/sciencebasepy). We chose to retrieve all of the monthly data for the available years. The preprocessing steps included (1) extracting the monthly GeoTIFFs from the zip files and combining them all into a single NetCDF file; and (2) converting the units from mm per day to mm per month.

> NOTE: You will need a [ScienceBase Account](https://www.sciencebase.gov/directory/newUser/create) to download this data via the Python code, since ScienceBase requires an account to access cloud hosted data. You will then need to set your username and password as the OS environment variables (e.g., in your `~/.bashrc` file) of `SCIENCEBASE_USERNAME` and `SCIENCEBASE_PASSWORD`. If you don't have an account and do not want to create one, you will have to manually download [the data](https://www.sciencebase.gov/catalog/item/64135576d34eb496d1ce3d2e).

In [None]:
import sciencebasepy
import os
import re
import rioxarray
import xarray as xr
import pandas as pd
import zipfile_deflate64 as zipfile
import io

os.makedirs("../Data/wbet", exist_ok=True)

# Establish a session.
sb = sciencebasepy.SbSession()

# Login required to access cloud files via sciencebasepy
sb.login(os.environ["SCIENCEBASE_USERNAME"], os.environ["SCIENCEBASE_PASSWORD"])

years = range(1896, 2019)

# Get list of files for monthly ET data (names like "ET..._monthly.zip")
file_list = sb.get_item_file_info(sb.get_item("64135576d34eb496d1ce3d2e"))
filenames = [
    i["name"]
    for i in file_list
    if re.search(r"ET.*_monthly\.zip", i["name"]) is not None
]

# Each file name is expected to carry exactly two numbers, the first and last
# year it covers; keep only the files that lie entirely inside `years`
file_date_ranges = [re.findall(r'\d+', filename) for filename in filenames]
filenames = [name for name, (low, high) in zip(filenames, file_date_ranges)
             if (int(low) >= min(years)) and (int(high) <= max(years))]

# Download the files (these files are big (6GB a piece), so this will take a while...)
# The destination has to be the directory the zips are read back from during
# processing ("../Data/wbet").
_ = sb.download_cloud_files(
    filenames,
    sb.generate_S3_download_links("64135576d34eb496d1ce3d2e", filenames),
    "../Data/wbet",
)

# Convert every downloaded zip archive into one NetCDF file, deleting the
# archive and the extracted GeoTIFFs as soon as they are no longer needed to
# keep the peak disk usage down
for zippedfiles in filenames:
    zip_path = "../Data/wbet/" + zippedfiles

    # Extract only the GeoTIFF members. The context manager closes the archive
    # handle before the zip is deleted; removing a still-open file can fail on
    # some platforms (e.g. Windows).
    with zipfile.ZipFile(zip_path) as zfile:
        gtif_files = [
            file for file in zfile.namelist() if file.endswith(".tif")
        ]
        zfile.extractall("../Data/wbet/", gtif_files)

    # Delete downloaded file to save disk space, since files are now extracted
    os.remove(zip_path)

    # Monthly Datasets of this archive, collected for concatenation
    ds_monthly_list = []
    for gtif in gtif_files:
        # Read in each extracted GeoTIFF
        ds_month = rioxarray.open_rasterio(
            "../Data/wbet/" + gtif, chunks={}, band_as_variable=True
        )

        # Remove spatial_ref coord and rename coords to corresponding names.
        # Assign the date to the Dataset 5th-8th characters of file indicate
        # year, 10th-11th indicate month (index start at 0, characters at 1)
        year, month = gtif[4:8], gtif[9:11]
        date = year + "-" + month
        ds_month = (
            ds_month.rename({"x": "lon", "y": "lat", "band_1": "aet"})
            .assign_coords(time=pd.to_datetime(date))
            .expand_dims(dim="time")
        )
        ds_month = ds_month.drop_vars("spatial_ref")

        # Stack the monthly Datasets to list for concatenating
        ds_monthly_list.append(ds_month)
        ds_month.close()

    # Concatenate and save to netcdf (named after the archive, minus ".zip")
    ds = xr.concat(ds_monthly_list, dim="time")
    ds.to_netcdf(
        path="../Data/wbet/" + zippedfiles[:-4] + ".nc",
        format="NETCDF4",
        engine="netcdf4",
        encoding={"aet": {"zlib": True, "complevel": 4}},
    )

    # Delete extracted files to save disk space, since files are now
    # compiled to netcdf
    for gtif in gtif_files:
        os.remove("../Data/wbet/" + gtif)

    # Drop the references before moving on to the next archive
    del ds
    del ds_monthly_list

# Open processed decade files as single dataset
ds = xr.open_mfdataset(
    ["../Data/wbet/" + file[:-4] + ".nc" for file in filenames],
    engine="netcdf4",
    chunks={"lat": -1, "lon": -1, "time": 2},
)

# From the metadata xml file, the monthly data are in units of mm.day-1.
# We want mm.month-1. So, we need to multiply each month by the number of days in it.
# The day counts are shaped (n_time, 1, 1) so they broadcast against the data.
# NOTE(review): this assumes the data dims are ordered (time, lat, lon) —
# confirm against the intermediate files.
ds = ds * ds.get_index("time").days_in_month.values.reshape(
    len(ds.get_index("time")), 1, 1
)

# Add new metadata attributes to variable and coordinates.
# The variable's unit attribute is stored under "units", the same key the
# other datasets in this notebook use for "aet".
ds["aet"].attrs["units"] = "mm.month-1"
ds["aet"].attrs["description"] = (
    "Actual Total Evapotranspiration via WBET from Reitz+2023"
)
ds["aet"].attrs["long_name"] = "Actual Evapotranspiration"
ds["aet"].attrs["standard_name"] = "Actual Evapotranspiration"
ds["aet"].attrs["dimensions"] = "lon lat time"

ds["lat"].attrs["units"] = "degrees_north"
ds["lat"].attrs["description"] = "Latitude of the center of the grid cell"
ds["lat"].attrs["long_name"] = "latitude"
ds["lat"].attrs["standard_name"] = "latitude"
ds["lat"].attrs["axis"] = "Y"

ds["lon"].attrs["units"] = "degrees_east"
ds["lon"].attrs["description"] = "Longitude of the center of the grid cell"
ds["lon"].attrs["long_name"] = "longitude"
ds["lon"].attrs["standard_name"] = "longitude"
ds["lon"].attrs["axis"] = "X"

# The time coordinate uses the key "unit" (not "units"), as in the other cells
ds["time"].attrs["long_name"] = "time"
ds["time"].attrs["standard_name"] = "time"
ds["time"].attrs["unit"] = "month"
ds["time"].attrs["description"] = (
    "Monthly time step indicated by the first day of the month."
)
ds["time"].attrs["axis"] = "T"

# Convert dtype to float32 as we do not need 64 bit precision, and it reduces
# the size by half
ds = ds.astype("float32")

# Chunk the output one time step at a time. netCDF chunk sizes have to be
# given in the order of the variable's own dimensions, so they are built from
# ds["aet"].dims instead of relying on the ordering of the coordinates.
chunksizes = [
    1 if dim == "time" else ds.sizes[dim] for dim in ds["aet"].dims
]

# Save dataset and remove processed files
ds.to_netcdf(
    path="../Data/wbet/wbet_aet.nc",
    format="NETCDF4",
    engine="netcdf4",
    # Chunk file and compress as it is big uncompressed (~130 GB)
    encoding={"aet": {"zlib": True, "complevel": 9, 'chunksizes': chunksizes}},
)

# The per-archive NetCDF files are no longer needed once the full file exists
for file in filenames:
    os.remove("../Data/wbet/" + file[:-4] + ".nc")

# Open the saved netcdf lazily and display it as the cell output
ds = xr.open_dataset(
    "../Data/wbet/wbet_aet.nc",
    engine="netcdf4",
    chunks={},
)
ds