In [1]:
import glob
import multiprocessing as mp
import os
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import xarray as xr

def load_and_organize_precip_file(path):
    """Read the ``MTPR`` variable from one file and flatten it onto a single time axis.

    The dataset is sorted by ascending latitude, cropped to longitude 180-330
    and latitude 20-80, and its ``forecast_initial_time`` / ``forecast_hour``
    dimensions are stacked into one ``time`` dimension. Each time value is the
    initial time plus the forecast hour, interpreted as a number of hours.

    Parameters
    ----------
    path : str or path-like
        File that ``xarray.open_dataset`` can read and that contains ``MTPR``.

    Returns
    -------
    xarray.DataArray
        Array with dims ``('time', 'lat', 'lon')`` holding the values read from
        the file, with the attributes of the source variable.
    """
    min_lon, max_lon, min_lat, max_lat = (180, 330, 20, 80)

    # The context manager releases the file handle as soon as the values have
    # been pulled into memory; everything returned below is built from
    # ``.values`` arrays, so nothing refers to the file afterwards.
    with xr.open_dataset(path) as dataset_temp:
        # Sorting first makes latitude ascending, which the ascending
        # slice(min_lat, max_lat) selection below relies on.
        precip = dataset_temp.sortby('latitude').MTPR
        precip = precip.sel(
            longitude=slice(min_lon, max_lon),
            latitude=slice(min_lat, max_lat),
        )
        stacked = precip.stack(time=('forecast_initial_time', 'forecast_hour'))

        # Read the stacked (initial_time, forecast_hour) pairs once; evaluating
        # ``.values`` inside the loop would rebuild the whole array per element.
        time_pairs = stacked.time.values
        dates = pd.to_datetime(np.array([
            init_time + pd.to_timedelta(lead_hours, unit='h')
            for init_time, lead_hours in time_pairs
        ]))

        data = xr.DataArray(
            stacked.transpose('time', 'latitude', 'longitude').values,
            coords={
                'time': dates,
                'lat': stacked.latitude.values,
                'lon': stacked.longitude.values,
            },
            dims=['time', 'lat', 'lon'],
            attrs=stacked.attrs,
        )
    return data

def process_and_save(file):
    """Reduce one input file to daily means and write the result as NetCDF.

    The file is loaded with ``load_and_organize_precip_file``, resampled to
    1-day bins with a mean, wrapped in a dataset under the variable name
    ``PrecipitationRate`` and written into the directory named by the
    module-level ``path_out`` (read at call time).

    Parameters
    ----------
    file : str
        Path of the input file. The second-to-last dot-separated token of its
        base name becomes the suffix of the output file name.

    Returns
    -------
    str
        Path of the NetCDF file that was written.
    """
    data_temp = load_and_organize_precip_file(file)
    # NOTE(review): if a file's time span does not start and end on a day
    # boundary, the first and last daily means come from partial days, and
    # neighbouring files may each emit a value for the same day -- confirm
    # this is handled when the outputs are combined.
    data_temp = data_temp.resample(time='1D').mean()
    data_temp = data_temp.to_dataset(name='PrecipitationRate')

    # Take the tag from the base name so dots in directory names cannot leak
    # into it, and join with os.path.join so path_out works with or without a
    # trailing separator.
    file_tag = os.path.basename(file).split('.')[-2]
    output_path = os.path.join(path_out, f"PRECIP_{file_tag}.nc")
    data_temp.to_netcdf(output_path)
    return output_path

In [4]:
# Root directory of the ERA5 mean-flux forecast files
path_era5 = '/glade/campaign/collections/rda/data/d633000/e5.oper.fc.sfc.meanflux/'

# Collect every MTPR file found under directories whose names start with a
# year from 1981 through 2020, then sort the paths into a NumPy array.
file_paths = np.sort([
    found
    for year in range(1981, 2021)
    for found in glob.glob(
        f'{path_era5}{year}*/e5.oper.fc.sfc.meanflux.235_055_mtpr.ll025sc.*.nc'
    )
])

In [5]:
# Directory that process_and_save writes its daily NetCDF files into
path_out = '/glade/derecho/scratch/jhayron/Data4Predictability/PRECIP_ERA5_Daily/'

# Use multiprocessing Pool
if __name__ == '__main__':
    # Create the target directory up front so the workers' writes do not fail
    # on a missing directory (e.g. on a fresh scratch area).
    os.makedirs(path_out, exist_ok=True)

    # One worker per CPU reported by the OS; each worker handles whole files.
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(process_and_save, file_paths)

    print("Processing complete.")

Processing complete.
