# In [None]:
import netCDF4 as nc
import numpy as np
import pandas as pd
from multiprocessing import Pool

# Load California coastal grid cell coordinates.
# The table must provide 'Latitude' and 'Longitude' columns; extract_ar_events()
# reads both from every row.
# NOTE(review): this is an absolute, machine-specific path -- confirm it before
# running the script on another machine.
california_coords = pd.read_csv(r"C:\Users\jayin\Downloads\era5.csv")

# Define function to extract AR events for a specific year
def extract_ar_events(file_path):
    """Extract atmospheric-river (AR) timestamps for each California grid point.

    For every row of the module-level ``california_coords`` table, the nearest
    grid cell of the NetCDF file is located and every time step whose
    ``ar_binary_tag`` value equals 1 is recorded.

    Args:
        file_path: Path to a NetCDF file providing the ``time``, ``lat``,
            ``lon`` and ``ar_binary_tag`` variables. ``ar_binary_tag`` is
            indexed as ``[time, lat, lon]``.

    Returns:
        A DataFrame with the columns ``latitude`` and ``longitude`` (copied
        unchanged from ``california_coords``) and ``timestamp``, one row per
        flagged time step. The frame is empty, but still carries those three
        columns, when nothing is flagged or when the file cannot be
        processed. In the latter case the error is printed, not raised.
    """
    # A schema-carrying empty frame lets callers access 'timestamp' safely
    # even when no event was found or the file failed to load.
    empty_result = pd.DataFrame({
        'latitude': pd.Series(dtype='float64'),
        'longitude': pd.Series(dtype='float64'),
        'timestamp': pd.Series(dtype='datetime64[ns]'),
    })

    try:
        # The context manager closes the NetCDF handle even if an error occurs.
        with nc.Dataset(file_path) as dataset:
            time_var = dataset.variables['time']
            ar_var = dataset.variables['ar_binary_tag']
            lat_vals = dataset.variables['lat'][:]
            lon_vals = dataset.variables['lon'][:]

            # Decode the numeric time axis; fall back to the standard
            # calendar when the variable does not declare one.
            time_cal = getattr(time_var, 'calendar', 'standard')
            raw_times = nc.num2date(
                time_var[:], units=time_var.units, calendar=time_cal
            )

            # cftime objects are converted via their string form. This is
            # done once here instead of once per coordinate.
            time_index = pd.DatetimeIndex(
                [pd.Timestamp(t.strftime('%Y-%m-%d %H:%M:%S')) for t in raw_times]
            )

            frames = []
            for _, row in california_coords.iterrows():
                lat = row['Latitude']
                lon = row['Longitude']

                lat_idx = np.abs(lat_vals - lat).argmin()

                # Circular distance, so a -180..180 coordinate still finds its
                # cell on a 0..360 grid (and vice versa).
                lon_diff = np.abs(lon_vals - lon) % 360
                lon_idx = np.minimum(lon_diff, 360 - lon_diff).argmin()

                # Time series of the AR flag at the nearest grid cell.
                ar_data = ar_var[:, lat_idx, lon_idx]

                # Masked (missing) entries are treated as "no AR".
                is_ar = np.asarray(np.ma.filled(ar_data == 1, False), dtype=bool)
                if not is_ar.any():
                    continue

                # Scalars broadcast against the selected timestamps.
                frames.append(pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'timestamp': time_index[is_ar],
                }))

        if not frames:
            return empty_result
        return pd.concat(frames, ignore_index=True)

    except Exception as e:
        # Best effort: report the failure and let the caller carry on with
        # an empty result instead of aborting the whole run.
        print(f"Error processing file {file_path}: {str(e)}")
        return empty_result

# Worker entry point handed to the multiprocessing pool
def process_files(file_path):
    """Return the AR events extracted from the file at ``file_path``.

    Delegates to ``extract_ar_events`` and passes its result back unchanged.
    """
    events = extract_ar_events(file_path)
    return events

def main(min_ar_count=12, output_path='california_ar_events_1980_with_counts.csv'):
    """Prompt for a NetCDF file, extract AR events and save the busy days.

    The file is processed in a worker pool. The resulting events are counted
    per (latitude, longitude, date), and only the groups with at least
    ``min_ar_count`` flagged time steps are written to ``output_path``.

    Args:
        min_ar_count: Minimum number of AR-flagged time steps a location must
            have on a date to be kept. The default of 12 matches the original
            "50% or more AR hours" rule, which presumes 24 time steps per day.
        output_path: Destination CSV file for the filtered daily counts.

    Returns:
        None. Problems are reported on stdout; no file is written when the
        pool fails or when no AR event was extracted.
    """
    # Prompt user for file path. Paths pasted from a file manager are often
    # wrapped in quotes or carry stray whitespace.
    file_path = input("Enter the file path for ERA5 data: ").strip().strip('"\'')
    file_paths = [file_path]

    # Only the parallel step lives inside the pool context and its error
    # handler, so later failures are not misreported as multiprocessing errors.
    try:
        # One worker per file; a bare Pool() would start one process per CPU
        # for a single task.
        with Pool(processes=len(file_paths)) as pool:
            results = pool.map(process_files, file_paths)
    except Exception as e:
        print(f"Error in multiprocessing: {str(e)}")
        return

    all_ar_events = pd.concat(results, ignore_index=True)

    # A failed or event-free file yields an empty frame, possibly without a
    # 'timestamp' column. Report that plainly instead of raising a KeyError.
    if all_ar_events.empty or 'timestamp' not in all_ar_events.columns:
        print("No AR events were extracted; no output file was written.")
        return

    # Add 'date' column
    all_ar_events['date'] = pd.to_datetime(all_ar_events['timestamp']).dt.date

    # Count flagged time steps per location and day, then keep the busy days.
    ar_daily = (
        all_ar_events
        .groupby(['latitude', 'longitude', 'date'])
        .size()
        .reset_index(name='count')
    )
    ar_daily_filtered = ar_daily[ar_daily['count'] >= min_ar_count]

    # Save the results to a CSV file
    try:
        ar_daily_filtered.to_csv(output_path, index=False)
    except OSError as e:
        print(f"Error writing {output_path}: {str(e)}")

# Run the pipeline only when this file is executed as a script. The guard is
# also needed by multiprocessing: with the "spawn" start method the worker
# processes re-import this module, and without the guard each of them would
# call main() again.
if __name__ == "__main__":
    main()
