In [3]:
import numpy as np
import pandas as pd
import netCDF4 as nc
import matplotlib.pyplot as plt
import xarray as xr
import re
from datetime import datetime
import glob
import os

In [4]:
test = xr.open_dataset(r"C:\Users\lv299\OneDrive\A_Melbourne-Uni\A_Weather_for_21st_Century_RA_Internship\Local_Remote_Influences_on_Coastal_Rainfall\Data_preparation\TRMM\3B42_Daily.19980102.7.nc4")

In [5]:
test

In [6]:
# test_all = xr.open_mfdataset(r"C:\Users\lv299\OneDrive\A_Melbourne-Uni\A_Weather_for_21st_Century_RA_Internship\Local_Remote_Influences_on_Coastal_Rainfall\Data_preparation\TRMM\*.nc4", parallel=True)
# test_all

In [7]:
data_dir = r"C:\Users\lv299\OneDrive\A_Melbourne-Uni\A_Weather_for_21st_Century_RA_Internship\Local_Remote_Influences_on_Coastal_Rainfall\Data_preparation\TRMM"  # Update this path

# Pattern to match your nc files
file_pattern = '3B42_Daily.*.7.nc4'

# Get a list of all files matching the pattern
file_list = sorted(glob.glob(os.path.join(data_dir, file_pattern)))

print(f"Found {len(file_list)} files to process")

Found 8032 files to process


In [8]:
def extract_date(filename):
    # Extract the date part from the filename (e.g., '19980101' from '3B42_Daily.19980101.7.nc4')
    match = re.search(r'3B42_Daily\.(\d{8})\.7\.nc4', os.path.basename(filename))
    if match:
        date_str = match.group(1)
        # Convert to datetime
        return datetime.strptime(date_str, '%Y%m%d')
    return None

In [9]:
datasets = []
dates = []

# Process files in batches to avoid memory issues
batch_size = 400  # Adjust based on your available memory
num_batches = (len(file_list) + batch_size - 1) // batch_size

print(f"Processing files in {num_batches} batches of {batch_size}")

Processing files in 21 batches of 400


In [10]:
combined_dataset = None

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(file_list))
    
    batch_files = file_list[start_idx:end_idx]
    batch_datasets = []
    batch_dates = []
    
    print(f"Processing batch {batch_idx+1}/{num_batches} ({len(batch_files)} files)")
    
    for file_path in batch_files:
        # Extract date from filename
        date = extract_date(file_path)
        if date is None:
            print(f"Warning: Could not extract date from {file_path}, skipping")
            continue
        
        # Open the dataset
        try:
            ds = xr.open_dataset(file_path)
            # Add the file's date as a coordinate
            ds = ds.expand_dims(time=[date])
            batch_datasets.append(ds)
            batch_dates.append(date)
        except Exception as e:
            print(f"Error opening {file_path}: {e}")
    
    if batch_datasets:
        # Combine all datasets in the current batch
        print(f"Combining {len(batch_datasets)} datasets in batch {batch_idx+1}")
        batch_combined = xr.concat(batch_datasets, dim='time')
        
        # Close all individual datasets to free memory
        for ds in batch_datasets:
            ds.close()
        
        if combined_dataset is None:
            combined_dataset = batch_combined
        else:
            # Merge with the previous batches
            combined_dataset = xr.concat([combined_dataset, batch_combined], dim='time')
            batch_combined.close()
        
        print(f"Completed batch {batch_idx+1}")

if combined_dataset is not None:
    # Sort the dataset by time if needed
    combined_dataset = combined_dataset.sortby('time')
    
    # Add attributes to the time dimension
    combined_dataset.time.attrs['standard_name'] = 'time'
    combined_dataset.time.attrs['long_name'] = 'time'
    
    # Save the combined dataset to a new NetCDF file
    output_file = os.path.join(data_dir, 'combined_3B42_Daily.nc')
    print(f"Saving combined dataset to {output_file}")
    
    # Set compression for all variables to save space
    encoding = {var: {'zlib': True, 'complevel': 5} for var in combined_dataset.data_vars}
    
    # Add encoding for the time dimension
    encoding['time'] = {'units': 'days since 1990-01-01', 'calendar': 'standard'}
    
    # Save the file
    combined_dataset.to_netcdf(
        output_file, 
        format='NETCDF4', 
        encoding=encoding
    )
    
    print("Done! Combined file created successfully.")
    print(f"Time range: {combined_dataset.time.values.min()} to {combined_dataset.time.values.max()}")
    print(f"Shape of combined dataset: {combined_dataset.dims}")
else:
    print("No datasets were processed successfully.")

Processing batch 1/21 (400 files)
Combining 400 datasets in batch 1
Completed batch 1
Processing batch 2/21 (400 files)
Combining 400 datasets in batch 2
Completed batch 2
Processing batch 3/21 (400 files)
Combining 400 datasets in batch 3
Completed batch 3
Processing batch 4/21 (400 files)
Combining 400 datasets in batch 4
Completed batch 4
Processing batch 5/21 (400 files)
Combining 400 datasets in batch 5
Completed batch 5
Processing batch 6/21 (400 files)
Combining 400 datasets in batch 6
Completed batch 6
Processing batch 7/21 (400 files)
Combining 400 datasets in batch 7
Completed batch 7
Processing batch 8/21 (400 files)
Combining 400 datasets in batch 8
Completed batch 8
Processing batch 9/21 (400 files)
Combining 400 datasets in batch 9
Completed batch 9
Processing batch 10/21 (400 files)
Combining 400 datasets in batch 10
Completed batch 10
Processing batch 11/21 (400 files)
Combining 400 datasets in batch 11
Completed batch 11
Processing batch 12/21 (400 files)
Combining 400

MemoryError: Unable to allocate 10.3 GiB for an array with shape (4800, 1440, 400) and data type float32