In [None]:
# -----********************-----

# Created Time: 2024/09/21

# Author: Tara Liu, Yiyi He

### Use Case

# This notebook reads downloaded GRIB files and join weather attributes by location to each station,
#  assigning grid data (df) to stations from station_df based on the nearest grid point.

### Climate variables:
# t2m: Temperature of air at 2m above the surface of land, sea or in-land waters.
# u10: Eastward component of the 10m wind.
# v10: Northward component of the 10m wind.
# tp: Total precipitation. Accumulated liquid and frozen water, including rain and snow, that falls to the Earth's surface.

### Method:
# scipy.spatial.KDTree for nearest neighbor searches

# -----********************-----

# Libraries

In [1]:
# Import libraries
import pandas as pd
import pygrib
import os
import numpy as np
import cfgrib 
import xarray as xr
import datetime
from datetime import datetime
from scipy.spatial import KDTree

# ERA5-land

In [2]:
def read_grib_file(file_path):
	grbs = pygrib.open(file_path)
	for grb in grbs:
		print(grb)
	grbs.close()
	
	return grbs

In [3]:
def build_kdtree(df):
    """
    Build a KDTree from the gridded dataset.
    
    Args:
        df (pd.DataFrame): DataFrame containing the grib data with lat and lon as the index.

    Returns:
        KDTree: A KDTree built on lat/lon coordinates.
        np.array: Grid points as an array of lat/lon pairs.
    """
    # Ensure lat/lon values are floats and create an array of lat/lon pairs
    grid_points = np.array(list(zip(df.index.get_level_values('latitude').astype(float), df.index.get_level_values('longitude').astype(float))))
    
    # Build and return the KDTree
    return KDTree(grid_points), grid_points

In [4]:
# Import station locations
station_file='/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station.csv'
station_df = pd.read_csv(station_file)

In [38]:
for year in range(2024,2026):
    input_dir = f'/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/{year}'
    output_dir = f'/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_by_datetime_csv/{year}'
    print(year)
    # Check if file already exists
    existing_files = set()
    for filename in os.listdir(output_dir):
        if filename.endswith('.csv'):
            # Extract the date and time from the filename (assuming format: era5_land_YYYYMMDD_HHMM.grib)
            date_part = filename.split('_')[0]  # Extract YYYYMMDD part
            time_part = filename.split('_')[1]  # Extract HHMM part
            existing_files.add(f"{date_part}_{time_part}")
    
    for file in os.listdir(input_dir):
        if file.endswith('.grib'):
            # print(file)
            grbs = read_grib_file(os.path.join(input_dir, file))
            
            date = file.split('_')[2]  # Extract YYYYMMDD part
            time = file.split('_')[3].replace('.grib', '')  # Extract HHMM part
            date_time = date+'_'+time
            
            if date_time in existing_files:
                continue

            else:
                
                with xr.open_dataset(os.path.join(input_dir, file)) as ds:
                    df_raw = ds.to_dataframe()
                    df = df_raw.drop(columns=['number', 'time', 'step', 'surface', 'valid_time'])
                    # get date and time
                    date = file.split('_')[2]
                    time = file.split('_')[3].split('.')[0]
                    
                    df['date'] = date
                    df['time'] = time
                    # build KDTree
                    tree, grid_points = build_kdtree(df)
        
                    station_coords = station_df[['Lat', 'Lon']].to_numpy()
        
                    _, nearest_grid_idx = tree.query(station_coords)
        
                    nearest_grid_points = grid_points[nearest_grid_idx]
                    
                    nearest_grid_points = [(round(lat, 3), round(lon, 3)) for lat, lon in nearest_grid_points]
        
                    df.index = pd.MultiIndex.from_tuples([(round(lat, 3), round(lon, 3)) for lat, lon in df.index], names=['lat', 'lon'])
        
                    nearest_grid_values = df.loc[nearest_grid_points].reset_index()
        
                    station_df_final = pd.concat([station_df.reset_index(drop=True), nearest_grid_values[['t2m', 'u10', 'v10','tp','date','time']]], axis=1)
        
                    station_df_final.to_csv(f'{output_dir}/{date}_{time}_station.csv') 

Can't create file '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/2024/era5_land_20241103_0500.grib.5b7b6.idx'
Traceback (most recent call last):
  File "/Applications/anaconda3/envs/india0/lib/python3.9/site-packages/cfgrib/messages.py", line 273, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
  File "/Applications/anaconda3/envs/india0/lib/python3.9/site-packages/cfgrib/messages.py", line 340, in message_from_file
    return Message.from_file(file, offset, **kwargs)
  File "/Applications/anaconda3/envs/india0/lib/python3.9/site-packages/cfgrib/messages.py", line 104, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/2024/era5_land_20241103_0500.grib'>

During handling of the above exception, another e

2024
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 16-17 hrs (accum):from 202407310000
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 6-7 hrs (accum):from 202403190000
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 14 hrs:from 202403020000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level

EOFError: No valid message found: '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/2024/era5_land_20241103_0500.grib'

In [27]:
my_path = "/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/2025/era5_land_20250101_0000.grib"

with xr.open_dataset(my_path) as ds:
    print(ds)

<xarray.Dataset> Size: 674kB
Dimensions:     (latitude: 187, longitude: 224)
Coordinates:
    number      int64 8B ...
    time        datetime64[ns] 8B ...
    step        timedelta64[ns] 8B ...
    surface     float64 8B ...
  * latitude    (latitude) float64 1kB 31.03 30.93 30.83 ... 12.63 12.53 12.43
  * longitude   (longitude) float64 2kB 72.47 72.57 72.67 ... 94.57 94.67 94.78
    valid_time  datetime64[ns] 8B ...
Data variables:
    t2m         (latitude, longitude) float32 168kB ...
    u10         (latitude, longitude) float32 168kB ...
    v10         (latitude, longitude) float32 168kB ...
    tp          (latitude, longitude) float32 168kB ...
Attributes:
    GRIB_edition:            1
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 202