In [1]:
import xarray as xr
import rioxarray as rioxr

import geopandas as geopd
import pandas as pd

import numpy as np

import os
import glob
import time

from multiprocessing import Pool
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300

In [None]:
# Catchment-level snow water equivalent (SWE) processing.
# Workflow: read SWE grids from NetCDF files, clip each grid to every catchment
# geometry, average the clipped cells per catchment, and store the resulting
# time series as a CSV file.

# Vector file holding the catchment (watershed) polygons
vector_watershed_path = "/path/to/CAMELS-FI_catchments.gpkg"

# Load layer 'v1' and convert it to WGS84 (EPSG:4326) once, up front,
# so that no reprojection is needed inside the per-file loop.
watersheds = geopd.read_file(vector_watershed_path, layer='v1').to_crs(epsg=4326)

In [5]:
def swe_catchment_mean(args):
    """Compute the mean SWE of every catchment for one NetCDF file.

    The inputs are packed into a single tuple so the function can be passed
    directly to ``Pool.map``.

    Parameters
    ----------
    args : tuple
        ``(file, watersheds)``. ``file`` is the path of a NetCDF file with a
        ``swe`` variable holding exactly one time step (``.time.item()`` fails
        otherwise). ``watersheds`` is a GeoDataFrame with a ``Paikka_Id``
        column, a geometry column and a CRS.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by the file's timestamp (index name ``date``)
        with one column per ``Paikka_Id``; each value is the mean SWE of the
        grid cells clipped to that catchment, rounded to two decimals.
    """
    file, watersheds = args

    with xr.open_dataset(file, mask_and_scale=True, decode_coords='all') as dataset:
        data_array = dataset['swe']
        # Named `timestamp` (not `time`) so the imported `time` module is not shadowed.
        timestamp = pd.to_datetime(data_array.time.item())
        # Label-based subset of the grid.
        # NOTE(review): presumably the dims are ordered (time, lat, lon), i.e.
        # latitude 59-71 and longitude 20-32 — confirm against the dataset.
        data_array = data_array.loc[:, 59:71, 20:32]
        # Negative values are treated as missing data.
        data_array = xr.where(data_array < 0, np.nan, data_array)
        # Fill small gaps along each horizontal axis before clipping.
        data_array = data_array.interpolate_na('lon', limit=3, max_gap=1)
        data_array = data_array.interpolate_na('lat', limit=2, max_gap=1)
        data_array = data_array.rio.write_crs("epsg:4326")
        data_array = data_array.rename(lon='longitude', lat='latitude')

        swe_list = []
        for i in range(len(watersheds)):
            # Positional selection: `i` is a row position, not an index label,
            # so this also works when the GeoDataFrame has a non-default index.
            watershed = watersheds.iloc[[i]]
            mean_swe = data_array.rio.clip(
                watershed.geometry.values, crs=watershed.crs, all_touched=True
            ).mean().item()
            swe_list.append(round(mean_swe, 2))

    # Build the one-row result in a single step instead of growing an empty frame.
    swe = pd.DataFrame(
        [swe_list],
        columns=watersheds.Paikka_Id,
        index=pd.DatetimeIndex([timestamp], name='date'),
    )
    return swe

In [6]:
# Root directory containing SWE data NetCDF files
root = "/path/to/snow/data/swe/MERGED/v3.1"
glob_root = root + '/**/**/*.nc'
# set() drops any duplicate paths the recursive pattern may yield; sorting makes
# the file order (and therefore the chunk composition) reproducible between runs.
files = sorted(set(glob.iglob(glob_root, recursive=True)))
assert len(files) > 0, f"No files found with glob {glob_root}, check path"

# One (file, watersheds) argument tuple per file: each worker call opens its
# file once and processes every catchment while the file is open.
# (The previous loop incremented an undefined `counter`, which raised NameError
# on a fresh kernel; the counter was never used, so it has been removed.)
arg_list = [(file, watersheds) for file in files]

# Because of a memory leak, these need to be divided into chunks
chunk_size = 108
chunks = [arg_list[i:i + chunk_size] for i in range(0, len(arg_list), chunk_size)]

In [None]:
def _process_chunk(chunk_args):
    """Map swe_catchment_mean over one chunk using a fresh 18-worker pool."""
    with Pool(18) as workers:
        return workers.map(swe_catchment_mean, chunk_args)


# One list of per-file result frames for every chunk
swe_list = [_process_chunk(chunk_args) for chunk_args in tqdm(chunks)]

# Flatten the per-chunk lists into a single list of frames
swe_list_flat = [frame for chunk_frames in swe_list for frame in chunk_frames]

# Stack all frames, order the rows by date and write the table to disk
swe = pd.concat(swe_list_flat).sort_index()
swe.to_csv("/path/to/timeseries_by_attribute/swe.csv")

In [None]:
# Display the resulting SWE table as the cell output:
# rows are indexed by date, with one column per catchment (Paikka_Id).
swe