This notebook is a demo for downloading BIL files of PRISM daily data and converting these to a .zarr dataset. This demo particularly uses the `pyPRISMClimate` package to interface with the PRISM V2 web server. [pyPRISMClimate](https://sdtaylor.github.io/pyPRISMClimate/) handles the downloading and unzipping of zipped files and has multiple methods depending on the needed data.

Next, the daily BIL files are lazily read into `xarray` objects by passing `chunks="auto"` to `rioxarray.open_rasterio()`. After processing, the combined xarray dataset is written to zarr locally. 

Finally, any downloaded files are cleaned up and removed.

In [None]:
import os
from pyPRISMClimate import get_prism_dailys
from pyPRISMClimate import get_prism_daily_single
import rioxarray
import xarray as xr
import pandas as pd
import hvplot.xarray
from glob import glob
from dask.distributed import Client, LocalCluster
import dask
from time import sleep
from typing import Union
from psutil import cpu_count
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from functools import partial

In [None]:
# create local dask client
cluster = LocalCluster()
client = Client(cluster)

# number of physical cpu cores
# NOTE: the name `cpu_count` is rebound by the later
# `from multiprocessing import cpu_count` import, and that version does not
# accept `logical=...`, so psutil is called through its module here.
import psutil

# psutil may return None when the count cannot be determined
num_cores = psutil.cpu_count(logical=False) or 1

# number of workers: leave one core free, but always keep at least one worker
num_workers = max(1, num_cores - 1)

# minimum/maximum are arguments of adaptive scaling (`adapt`), not of `scale`
cluster.adapt(minimum=1, maximum=num_workers)
cluster

Testing parallel daily singles

In [None]:
# create list of dates for all days in January 1990
# dates are formatted as 'YYYY-MM-DD' strings, the same form used by the
# multiprocessing test cell further down in this notebook
d_list = pd.date_range(start='1990-01-01', end='1990-01-31').strftime('%Y-%m-%d').tolist()
var = "ppt"
dest_path = "./download/"

# create list of dask.delayed tasks, one download per day
# dest_path is passed by keyword (as in the get_prism_dailys call in this
# notebook); passed positionally it would land on a different parameter
download_dask_list = [
    dask.delayed(get_prism_daily_single)(var, date, dest_path=dest_path)
    for date in d_list
]

In [None]:
# build a single delayed download task for one day of "ppt" and run it right away
# (no destination folder is passed, so the package default is used)
test_dask = dask.delayed(get_prism_daily_single)("ppt", "1999-01-02")
test_dask.compute()

In [None]:
# execute every delayed download task in download_dask_list
dask.compute(*download_dask_list)

Test multiprocessing

In [None]:
# wrap function so date is first argument
def get_prism_daily_single_wrapper(date, variable, destin_path):
    """Call ``get_prism_daily_single`` with the date as the first argument.

    Having the date first lets the other two arguments be fixed with
    ``functools.partial`` so the function can be mapped over a list of dates.

    Args:
        date (str): day to download, formatted as 'YYYY-MM-DD'
        variable (str): PRISM variable to download (e.g. "ppt")
        destin_path (str): folder the files are downloaded to

    Returns:
        Whatever ``get_prism_daily_single`` returns.
    """
    # the destination is passed by keyword, matching how dest_path is given to
    # get_prism_dailys elsewhere in this notebook
    return get_prism_daily_single(variable, date, dest_path=destin_path)


def get_prism_daily_single_multiprocessing(var, date_list, dest_path):
    """Download single PRISM days concurrently with a thread pool.

    Downloads are best-effort: a failure for one date is reported with
    ``print`` and the remaining dates are still attempted.

    Args:
        var (str): PRISM variable to download (e.g. "ppt")
        date_list (iterable): dates to download, one download per entry
        dest_path (str): folder the files are downloaded to
    """
    # materialize once so the dates can be both mapped and reported on
    dates = list(date_list)
    if not dates:
        return

    # leave one core free, but a pool needs at least one thread
    num_threads = max(1, cpu_count() - 1)

    func = partial(get_prism_daily_single_wrapper, variable=var, destin_path=dest_path)

    with ThreadPool(num_threads) as pool:
        results = pool.imap(func, dates)

        # imap is lazy: the results must be consumed inside the `with` block,
        # otherwise leaving the block terminates the pool while downloads are
        # still pending and worker errors are never seen
        for date in dates:
            try:
                next(results)
            except Exception as e:
                print(f"did not work for {date}: {e}")

In [None]:
# every day of January 1990 as 'YYYY-MM-DD' strings
january_1990 = pd.date_range(start='1990-01-01', end='1990-01-31')
d_list = january_1990.strftime('%Y-%m-%d').tolist()

# variable to download and the folder to download into
var = "ppt"
dest_path = "./download/"

# download all days with the thread-pool helper
get_prism_daily_single_multiprocessing(var, d_list, dest_path)

Functions to perform processing

In [None]:
def cleanup_downloads(year: Union[int, str, None] = None, down_dir: str = "./download/"):
    """Delete downloaded files from the download folder.

    Args:
        year (int | str | None): when given, only files whose name contains
            this year are deleted; when None, every file in the folder is
            deleted
        down_dir (str): folder holding the downloads
    """

    # a year given as a string is treated the same as an int
    pattern = "*" if year is None else f"*{year}*"

    for file in glob(os.path.join(down_dir, pattern)):
        # os.remove only handles files; skip directories and vanished entries
        if not os.path.isfile(file):
            continue

        # try to remove the file up to 10 times, pausing between attempts
        # (presumably to wait for another handle on the file to be released)
        for _ in range(10):
            try:
                os.remove(file)
                break
            except FileNotFoundError:
                # already gone, nothing left to do
                break
            except OSError:
                sleep(1)

def process_PRISM_dailies_byYear(var: str, year: Union[int, str], down_dir: str, zarr_dir: str, alt_zarr_name: str = None):
    """
    Process PRISM daily data for a given year

    Downloads every daily file of the year, stacks the days along a ``time``
    dimension and writes the result to ``<zarr_dir>/<name>.zarr``. If that
    store already exists, the year is appended along ``time``.

    Args:
        var (str): variable to download
        year (int | str): year to process
        down_dir (str): download directory
        zarr_dir (str): zarr directory
        alt_zarr_name (str): alternate name for zarr file if not variable name

    Raises:
        FileNotFoundError: if no .bil file for the year is found after download
        ValueError: if a .bil file name does not contain a YYYYMMDD date
    """
    import re

    year_str = str(year)

    # Remove leftovers of THIS year only. Wiping the whole folder would delete
    # files belonging to other years that may be processed at the same time.
    # NOTE: cleanup_downloads is called without a directory, so this only
    # helps when down_dir is './download/'.
    if glob(os.path.join(down_dir, f"*{year_str}*")):
        cleanup_downloads(int(year_str))

    # set time range variables
    prism_start = f"{year_str}-01-01"
    prism_end = f"{year_str}-12-31"

    # Get var for 1/1-12/31/YYYY
    get_prism_dailys(var, prism_start, prism_end, dest_path=down_dir)

    # glob returns files in arbitrary order, so the day of each file is taken
    # from the YYYYMMDD stamp in its name instead of from its list position
    date_pattern = re.compile(rf"_({year_str}\d{{4}})")
    dated_files = []
    for bil_file in glob(os.path.join(down_dir, f"*_{year_str}*.bil")):
        match = date_pattern.search(os.path.basename(bil_file))
        if match is None:
            raise ValueError(f"no YYYYMMDD date found in file name: {bil_file}")
        day = pd.to_datetime(match.group(1), format="%Y%m%d")
        dated_files.append((day, bil_file))

    if not dated_files:
        raise FileNotFoundError(f"no .bil files for {year_str} found in {down_dir}")

    # chronological order
    dated_files.sort()

    # one single-day dataset per file
    pr_ds_list = []
    first_attrs = None

    for day, bil_file in dated_files:
        # open with Dask; the context manager closes the file handle on exit
        with rioxarray.open_rasterio(bil_file, chunks="auto") as opened:
            pr_da = opened

        # add a length-1 time dimension holding the day of this file
        time_index = pd.DatetimeIndex([day])
        time_da = xr.DataArray(time_index, [('time', time_index)])
        pr_da = pr_da.expand_dims(time=time_da)

        # name the array after the variable so it becomes the data variable name
        pr_da.name = var

        # squeeze band dimension
        pr_da = pr_da.squeeze("band", drop=True)

        # attributes of the first day are reused for the combined dataset
        if first_attrs is None:
            first_attrs = pr_da.attrs

        pr_ds_list.append(pr_da.to_dataset())

    # combine all days into one dataset
    pr_ds = xr.concat(pr_ds_list, dim='time', combine_attrs='drop')

    # keep only the last three attributes of the first day
    pr_ds.attrs = dict(list(first_attrs.items())[-3:])

    # the whole year is one chunk along time; x and y use fixed chunk sizes
    # NOTE(review): 281 and 207 presumably divide the PRISM grid evenly - confirm
    chunk_dict = {'time': pr_ds.sizes['time'], 'x': 281, 'y': 207}

    # rechunk
    pr_ds_rechunk = pr_ds.chunk(chunk_dict)

    # zarr store is named after alt_zarr_name when given, else after the variable
    zarr_path = os.path.join(zarr_dir, f"{alt_zarr_name or var}.zarr")

    # append along time if the store exists, otherwise create it
    if os.path.exists(zarr_path):
        pr_ds_rechunk.to_zarr(zarr_path, mode='a', append_dim='time')
    else:
        pr_ds_rechunk.to_zarr(zarr_path)


def process_PRISM_dailies(var: str, year: Union[int, list], down_dir: str, zarr_dir: str, alt_zarr_name: str = None, parallel: bool = False):
    """Process PRISM daily data for one year or several years

    Each year is downloaded and written to zarr with
    ``process_PRISM_dailies_byYear``; its downloads are removed afterwards.

    Args:
        var (str): variable to download
        year (Union[int, list]): year (int or str) or list of years to process
        down_dir (str): download directory
        zarr_dir (str): zarr directory
        alt_zarr_name (str): alternate name for zarr file if not variable name
        parallel (bool): if true then use dask to process the years as
            delayed tasks; only valid when several years are given

    Raises:
        ValueError: if ``parallel`` is true but ``year`` is a single year
    """
    single_year = isinstance(year, (int, str))

    # parallel processing only makes sense for several years
    if single_year and parallel:
        raise ValueError("parallel must be False if year is int or str")

    if single_year:
        # download data and create zarr
        process_PRISM_dailies_byYear(var, str(year), down_dir, zarr_dir, alt_zarr_name)

        # cleanup downloads of this year only; the year is passed as int so
        # the cleanup is restricted to files of that year
        cleanup_downloads(int(year))
        return

    # anything else is treated as a collection of years
    years = list(year)

    if parallel:
        # NOTE(review): every task writes/appends to the same zarr store and
        # nothing here coordinates those writes, so years may be appended in
        # completion order - confirm this is safe for the intended use.
        tasks = [
            dask.delayed(process_PRISM_dailies_byYear)(var, yr, down_dir, zarr_dir, alt_zarr_name)
            for yr in years
        ]

        # run all years
        dask.compute(*tasks)

        # cleanup downloads once every year has finished
        for yr in years:
            cleanup_downloads(int(yr))

    else:
        # process each year in turn, removing its downloads before the next one
        for yr in years:
            process_PRISM_dailies_byYear(var, yr, down_dir, zarr_dir, alt_zarr_name)
            cleanup_downloads(int(yr))

In [None]:
# single year
# process 1985 precipitation: download into ./download/ and write to ./zarr/
process_PRISM_dailies('ppt', 1985, './download/', './zarr/')

In [None]:
# two years w/ dask
# two years, each processed as its own dask.delayed task (parallel=True)
process_PRISM_dailies('ppt', [1986, 1987], './download/', './zarr/', parallel=True)

In [None]:
# four years w/ dask
# four years, each processed as its own dask.delayed task (parallel=True)
process_PRISM_dailies('ppt', [1988, 1989, 1990, 1991], './download/', './zarr/', parallel=True)

Test opening zarr

In [None]:
# open the zarr store written by the processing cells and display it
ppt_zarr = xr.open_zarr('./zarr/ppt.zarr')
ppt_zarr

In [None]:
# rasterized hvplot of the dataset with x/y as the plot axes
ppt_zarr.hvplot(x='x', y='y', rasterize=True, cmap='viridis')

In [None]:
# shut down cluster and client
# close the client first, then the local cluster it was connected to
client.close()
cluster.close()