In [7]:
import os
import xarray as xr
import xesmf as xe
import pickle
from datetime import datetime

In [8]:
# Directory containing netCDF files for specific year
input_dir = '/Users/lilydonaldson/Downloads/examples/data/IMERG/IMERG_subdaily_raw/jan2020'
# Regrid file path
regrid_file_path = '/Users/lilydonaldson/Downloads/examples/regrid_files/regrid_2x2-5.nc'
# Output directory for year folders
output_dir = '/Users/lilydonaldson/Downloads/examples/data/IMERG/IMERG_regrid_xESMF'

In [9]:
# Function to regrid a dataset
def regrid_dataset(original_dataset, regrid_file_path):
    regrid_dataset = xr.open_dataset(regrid_file_path)
    target_grid = {
        'lat': regrid_dataset['lat'],
        'lon': regrid_dataset['lon']
    }
    regridder = xe.Regridder(original_dataset, target_grid, method='bilinear')
    return regridder(original_dataset)

# Function to extract precipitation values and group by month
def extract_precipitation_and_save(dataset, month_dict, year):
    time_str = str(dataset['time'].values[0])
    date = datetime.strptime(time_str[:10], '%Y-%m-%d')
    month = date.month
    precipitation_values = dataset['precipitation'].values
    if month not in month_dict[year]:
        month_dict[year][month] = []
    month_dict[year][month].extend(precipitation_values.flatten())

In [10]:
# Main processing
def process_files(input_dir, regrid_file_path, output_dir):
    month_dict = {}
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.nc4'):
            file_path = os.path.join(input_dir, file_name)
            original_dataset = xr.open_dataset(file_path)
            regridded_dataset = regrid_dataset(original_dataset, regrid_file_path)
            
            # Extract year from file name (assuming the format given)
            date_str = file_name.split('.')[4]
            year = datetime.strptime(date_str[:8], '%Y%m%d').year
            if year not in month_dict:
                month_dict[year] = {i: [] for i in range(1, 13)}
            
            extract_precipitation_and_save(regridded_dataset, month_dict, year)

    # Save each month's data to a .pkl file in the respective year folder
    for year, months in month_dict.items():
        year_folder = os.path.join(output_dir, str(year))
        os.makedirs(year_folder, exist_ok=True)
        for month, values in months.items():
            file_path = os.path.join(year_folder, f'precipitation_{year}_month_{month}.pkl')
            with open(file_path, 'wb') as f:
                pickle.dump(values, f)

In [None]:
# Run the processing
process_files(input_dir, regrid_file_path, output_dir)



In [2]:
import os
import xarray as xr
import xesmf as xe
from datetime import datetime
import re

# Function to regrid a dataset
def regrid_dataset(original_dataset, regrid_file_path):
    regrid_dataset = xr.open_dataset(regrid_file_path)
    target_grid = {
        'lat': regrid_dataset['lat'],
        'lon': regrid_dataset['lon']
    }
    regridder = xe.Regridder(original_dataset, target_grid, method='bilinear')
    return regridder(original_dataset)

# Function to extract date and time from the filename
def extract_date_time(filename):
    pattern = r'\.(\d{8})-S(\d{6})'
    match = re.search(pattern, filename)
    if match:
        date_str = match.group(1)
        time_str = match.group(2)
        return date_str, time_str
    else:
        raise ValueError(f"Filename {filename} does not match the expected pattern.")

# Main processing
def process_files(input_dir, regrid_file_path, output_dir):
    # Get sorted list of all NetCDF files
    file_names = sorted([f for f in os.listdir(input_dir) if f.endswith('.nc4')])

    current_month = None
    datasets_to_combine = []

    for file_name in file_names:
        file_path = os.path.join(input_dir, file_name)
        original_dataset = xr.open_dataset(file_path)
        regridded_dataset = regrid_dataset(original_dataset, regrid_file_path)

        # Convert precipitation values from mm/hour to mm/day
        regridded_dataset['precipitation'] = regridded_dataset['precipitation'] * 24
        
        # Extract date and time from the filename
        date_str, time_str = extract_date_time(file_name)
        date_time = datetime.strptime(date_str + time_str, '%Y%m%d%H%M%S')
        
        # Update the time coordinate
        regridded_dataset = regridded_dataset.assign_coords(time=("time", [date_time]))
        
        # Get the month
        month = date_time.strftime('%Y-%m')
        
        if current_month is None:
            current_month = month
        
        if month != current_month:
            # Save the current combined dataset for the month
            combined_dataset = xr.concat(datasets_to_combine, dim='time')
            output_filename = f"combined_{current_month}.nc"
            output_path = os.path.join(output_dir, output_filename)
            combined_dataset.to_netcdf(output_path)
            print(f"Saved {output_path}")
            
            # Reset for the new month
            datasets_to_combine = []
            current_month = month
        
        # Append the current dataset
        datasets_to_combine.append(regridded_dataset)
    
    # Save the remaining datasets for the last month
    if datasets_to_combine:
        combined_dataset = xr.concat(datasets_to_combine, dim='time')
        output_filename = f"xcombined_{current_month}.nc"
        output_path = os.path.join(output_dir, output_filename)
        combined_dataset.to_netcdf(output_path)
        print(f"Saved {output_path}")
        
# Directory containing netCDF files for specific year
input_dir = '/Users/lilydonaldson/Downloads/examples/data/IMERG/IMERG_subdaily_raw/jan2020'
# Regrid file path
regrid_file_path = '/Users/lilydonaldson/Downloads/examples/regrid_files/regrid_2x2-5.nc'
# Output directory for year folders
output_dir = '/Users/lilydonaldson/Downloads/examples/data/IMERG/IMERG_regrid_xESMF'
# Example usage
process_files(input_dir, regrid_file_path, output_dir)




Saved /Users/lilydonaldson/Downloads/examples/data/IMERG/IMERG_regrid_xESMF/xcombined_2020-01.nc


  int_num = np.asarray(num, dtype=np.int64)
