## Use Case

This notebook reads downloaded GRIB files and join weather attributes by location to each station, assigning grid data (df) to stations from station_df based on the nearest grid point.

Main function: assign_nearest_grid_values(df, station_df, grid_points, nearest_grid_idx)

Method: scipy.spatial.KDTree for nearest neighbor searches

In [10]:
year=2013

In [3]:
import pandas as pd
import pygrib
import os
import numpy as np
import cfgrib 
import xarray as xr
from scipy.spatial import KDTree

## Main Function

In [5]:
def build_kdtree(df):
    """
    Build a KDTree from the gridded dataset.
    
    Args:
        df (pd.DataFrame): DataFrame containing the grib data with lat and lon as the index.

    Returns:
        KDTree: A KDTree built on lat/lon coordinates.
        np.array: Grid points as an array of lat/lon pairs.
    """
    # Ensure lat/lon values are floats and create an array of lat/lon pairs
    grid_points = np.array(list(zip(df.index.get_level_values('latitude').astype(float), df.index.get_level_values('longitude').astype(float))))
    
    # Build and return the KDTree
    return KDTree(grid_points), grid_points


In [6]:
def find_nearest_grid_points(tree, station_df):
    """
    Find the nearest grid points for each station using the KDTree.
    
    Args:
        tree (KDTree): The KDTree built from grid points.
        station_df (pd.DataFrame): DataFrame containing station coordinates with 'station_lat' and 'station_lon' columns.

    Returns:
        np.array: Nearest grid points for each station in station_df.
    """
    # Extract station coordinates as numpy array
    station_coords = station_df[['Lat', 'Lon']].to_numpy()
    
    # Query the KDTree for nearest grid points
    _, nearest_grid_idx = tree.query(station_coords)
    
    return nearest_grid_idx


In [7]:
def assign_nearest_grid_values(df, station_df, grid_points, nearest_grid_idx):
    """
    Assign the nearest grid data (t2m, wind_u, wind_v) to each station in station_df.
    
    Args:
        df (pd.DataFrame): The grib data with lat/lon index and variables like t2m, wind_u, wind_v.
        station_df (pd.DataFrame): DataFrame containing station coordinates.
        grid_points (np.array): Grid points from the KDTree.
        nearest_grid_idx (np.array): Indices of the nearest grid points for each station.

    Returns:
        pd.DataFrame: station_df with grid data (t2m, wind_u, wind_v) appended.
    """
    # Get the nearest grid points as lat/lon tuples
    nearest_grid_points = [(round(lat, 3), round(lon, 3)) for lat, lon in grid_points[nearest_grid_idx]]
    
    # Ensure the index of df is also rounded and in tuple form
    df.index = pd.MultiIndex.from_tuples(
        [(round(lat, 3), round(lon, 3)) for lat, lon in df.index], 
        names=['lat', 'lon']
    )
    
    # Extract the grid data for the nearest points
    nearest_grid_values = df.loc[nearest_grid_points].reset_index()
    
    # Join the nearest grid values with the station data
    station_df = pd.concat([station_df.reset_index(drop=True), nearest_grid_values[['t2m', 'u10', 'v10','tp']]], axis=1)
    
    return station_df


In [13]:
def read_grib_file(file_path):
	grbs = pygrib.open(file_path)
	for grb in grbs:
		print(grb)
	grbs.close()
	
	return grbs

In [55]:
input_dir = f'{year}'
station_file='STATION.csv'
station_df = pd.read_csv(station_file)
output_dir = f'station_by_datetime_csv/{year}'

for file in os.listdir(input_dir):
    if file.endswith('.grib'):
        print(file)
        grbs = read_grib_file(os.path.join(input_dir, file))
        
        with xr.open_dataset(os.path.join(input_dir, file)) as ds:
            df_raw = ds.to_dataframe()
            df = df_raw.drop(columns=['number', 'time', 'step', 'surface', 'valid_time'])
            # get date and time
            date = file.split('_')[2]
            time = file.split('_')[3].split('.')[0]
            
            df['date'] = date
            df['time'] = time
            # build KDTree
            tree, grid_points = build_kdtree(df)

            station_coords = station_df[['Lat', 'Lon']].to_numpy()

            _, nearest_grid_idx = tree.query(station_coords)

            nearest_grid_points = grid_points[nearest_grid_idx]
            
            nearest_grid_points = [(round(lat, 3), round(lon, 3)) for lat, lon in nearest_grid_points]

            df.index = pd.MultiIndex.from_tuples([(round(lat, 3), round(lon, 3)) for lat, lon in df.index], names=['lat', 'lon'])

            nearest_grid_values = df.loc[nearest_grid_points].reset_index()

            station_df_final = pd.concat([station_df.reset_index(drop=True), nearest_grid_values[['t2m', 'u10', 'v10','tp','date','time']]], axis=1)

            station_df_final.to_csv(f'{output_dir}/{date}_{time}_station.csv')

era5_land_20130802_2100.grib
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 21 hrs:from 201308020000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 21 hrs:from 201308020000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 21 hrs:from 201308020000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 20-21 hrs (accum):from 201308020000
era5_land_20130706_1200.grib
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 12 hrs:from 201307060000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 12 hrs:from 201307060000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 12 hrs:from 201307060000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 11-12 hrs (accum):from 201307060000
era5_land_20131229_2300.grib
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 23 hrs:f

In [47]:
station_df[station_df['station_id'] == 489]

Unnamed: 0,station_id,Location name,ESMI_ID,From date,To date,District,State,Category,Connection Type,Lat,Lon,t2m,u10,v10,tp
455,489,Vanaparty,,10/23/2015 0:00,8/25/2018 21:59,Warangal,Telangana,District Headquarters,Domestic,17.956816,79.602123,297.868408,2.616627,0.989978,0.002155


In [39]:
print(tree)

<scipy.spatial._kdtree.KDTree object at 0x3de33c3c0>


In [41]:
grid_points.shape

(41888, 2)

In [44]:
nearest_grid_idx.shape

(538,)

In [35]:
print(df.head(10))

                           t2m       u10       v10        tp      date  time
latitude longitude                                                          
31.032   72.475     300.255127 -1.978100 -1.211438  0.006685  20130802  2100
         72.575     300.247314 -2.019360 -1.190198  0.006008  20130802  2100
         72.675     300.222900 -2.108715 -1.147107  0.005398  20130802  2100
         72.775     300.167236 -2.232494 -1.089002  0.004810  20130802  2100
         72.875     300.144775 -2.368969 -1.029065  0.004218  20130802  2100
         72.975     300.111572 -2.486401 -0.975110  0.003846  20130802  2100
         73.075     299.937744 -2.575024 -0.948255  0.003750  20130802  2100
         73.175     299.839111 -2.652905 -0.917249  0.003808  20130802  2100
         73.275     299.814697 -2.727612 -0.905530  0.004418  20130802  2100
         73.375     299.803955 -2.799145 -0.903333  0.006137  20130802  2100


In [26]:
input_dir = f'{year}'
station_file='STATION.csv'
station_df = pd.read_csv(station_file)
output_dir = f'station_by_datetime_csv/{year}'

for file in os.listdir(input_dir):
    
    if  file.endswith('.grib'):
        print(file)

		grbs = read_grib_file(os.path.join(input_dir, file))
		
		with xr.open_dataset(os.path.join(input_dir, file)) as ds:
			df = ds.to_dataframe()
		
			df = df.drop(columns=['number', 'time', 'step', 'surface', 'valid_time'])
			# get date and time
			date = file.split('_')[2]
			time = file.split('_')[3].split('.')[0]
			# add date and time to df
			
			df['date'] = date
			df['time'] = time

			print(df.head(10))

			
			# build KDTree
			tree, grid_points = build_kdtree(df)
			# find nearest grid points
			nearest_grid_idx = find_nearest_grid_points(tree, station_df)
			# assign nearest grid values

			station_df = assign_nearest_grid_values(df, station_df, grid_points, nearest_grid_idx)

			# save to csv
			station_df.to_csv(f'{output_dir}/{date}_{time}_station.csv')

			print(f'{output_dir}/{date}_{time}_station.csv')

			print('done')

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 43)

In [17]:
file

'era5_land_20130802_2100.grib'

In [None]:
xr.open_dataset(file)

In [None]:

			tree, grid_points = build_kdtree(df)
			# find nearest grid points
			nearest_grid_idx = find_nearest_grid_points(tree, station_df)
			# assign nearest grid values

			station_df = assign_nearest_grid_values(df, station_df, grid_points, nearest_grid_idx)

			# save to csv
			station_df.to_csv(f'{output_dir}/{date}_{time}_station.csv')

			print(f'{output_dir}/{date}_{time}_station.csv')

			print('done')


In [12]:
import os
import pandas as pd

# Define the input and output directories
input_base_dir = '../../data/GRIB'
output_base_dir = '../../data/csv'

# Function to process each file
def process_grib_file(grib_file, station_df, df):
    """
    Process a single GRIB file: find nearest grid points and save station data to CSV.
    
    Args:
        grib_file (str): The path to the GRIB file.
        station_df (pd.DataFrame): The station data DataFrame.
        df (pd.DataFrame): The grid data DataFrame extracted from the GRIB file.
    
    Returns:
        pd.DataFrame: Station data with grid attributes for the given file.
    """
    # Here you should load the GRIB data into `df`
    # Placeholder for GRIB loading logic
    # df = load_grib_file(grib_file)  # You need to implement this function
    
    # Build the KDTree and process the stations
    tree, grid_points = build_kdtree(df)
    nearest_grid_idx = find_nearest_grid_points(tree, station_df)
    station_df_with_grid_data = assign_nearest_grid_values(df, station_df, grid_points, nearest_grid_idx)
    
    return station_df_with_grid_data

# Loop through each year folder
for year in range(2022, 2022):
    input_year_dir = os.path.join(input_base_dir, str(year))
    output_year_dir = os.path.join(output_base_dir, str(year))
    
    # Create the output directory if it doesn't exist
    os.makedirs(output_year_dir, exist_ok=True)
    
    # Loop through all GRIB files in the current year folder
    for grib_file in os.listdir(input_year_dir):
        if grib_file.endswith('.grib'):
            # Extract the timestamp (YYYYMMDD_HHMM) from the filename
            timestamp = grib_file.replace('era5_land_', '').replace('.grib', '')
            
            # Construct the output CSV file path
            output_csv_file = os.path.join(output_year_dir, f"era5_land_{timestamp}.csv")
            
            # Full path to the current GRIB file
            grib_file_path = os.path.join(input_year_dir, grib_file)
            
            # Placeholder for station_df (you need to load station data before the loop)
            station_df = load_station_data()  # Assuming you have station data loaded here
            
            # Placeholder for grib_df (grid data loaded from GRIB file)
            grib_df = load_grib_data(grib_file_path)  # You need to implement this function
            
            # Process the GRIB file and get the output
            station_df_with_grid_data = process_grib_file(grib_file_path, station_df, grib_df)
            
            # Save the resulting DataFrame to CSV
            station_df_with_grid_data.to_csv(output_csv_file, index=False)

            print(f"Processed and saved: {output_csv_file}")


In [None]:
# Step 1: Extract the grid coordinates from the grib data
grid_points = np.array(list(zip(df.index.get_level_values('latitude'), df.index.get_level_values('longitude'))))

# Step 2: Build the KDTree
tree = KDTree(grid_points)

# Step 3: For each station coordinate, find the nearest grid point
station_coords = station_df[['Lat', 'Lon']].to_numpy()

# Query the nearest point in the KDTree
_, nearest_grid_idx = tree.query(station_coords)

# Step 4: Use the index to get the corresponding grid data
nearest_grid_points = grid_points[nearest_grid_idx]
print(nearest_grid_points)
print(type(nearest_grid_points))

df.index = pd.MultiIndex.from_tuples(
    [(round(lat, 3), round(lon, 3)) for lat, lon in df.index], 
)

df.index

# Step 5: Convert nearest_grid_points to tuples (lat, lon)
nearest_grid_points = [(round(lat, 3), round(lon, 3)) for lat, lon in nearest_grid_points]

# Step 6: Make sure the index of the df is also rounded and in the same format
df.index = pd.MultiIndex.from_tuples(
    [(round(lat, 3), round(lon, 3)) for lat, lon in df.index], 
    names=['lat', 'lon']
)

# Step 7: Use the nearest grid points to get corresponding values from df
# Ensure nearest_grid_points is a list of tuples
nearest_grid_values = df.loc[nearest_grid_points].reset_index()

# Now you can join this with the station_df
station_df = pd.concat([station_df.reset_index(drop=True), nearest_grid_values[['t2m', 'u10', 'v10','tp','date','time']]], axis=1)

station_df.to_csv(f'{output_dir}/{date}_{time}_station.csv')

print(f'{output_dir}/{date}_{time}_station.csv')

print('done')