In [None]:
# -----********************-----

# Created Time: 2024/09/21
# Updated Time: 2025/07/12

# Author: Tara Liu, Yiyi He

### Use Case

# This notebook completes the following tasks:
# 1. Reads downloaded GRIB files and join weather attributes by location to each station,
# 2. Assigns grid data (df) to stations from station_df based on the nearest grid point.

# Then we processes hourly ERA5 climate data and hourly ESMI voltage data and created merged datasets
# 3. Combine hourly ERA5 climate data at all stations between 2013 to 2024
# 4. Combine hourly voltage data at all stations
# 5. Merge climate and voltage data (inner and outer)

### Climate variables:
# t2m: Temperature of air at 2m above the surface of land, sea or in-land waters.
# u10: Eastward component of the 10m wind.
# v10: Northward component of the 10m wind.
# tp: Total precipitation. Accumulated liquid and frozen water, including rain and snow, that falls to the Earth's surface.

### Method:
# scipy.spatial.KDTree for nearest neighbor searches

# -----********************-----

# Libraries

In [3]:
# Import libraries
import pandas as pd
import pygrib
import os
import numpy as np
import cfgrib 
import xarray as xr
import datetime
from datetime import datetime
from scipy.spatial import KDTree
from tqdm import tqdm

# ERA5-land

## Get ERA5-land info at station location

In [None]:
#
def read_grib_file(file_path):
	grbs = pygrib.open(file_path)
	for grb in grbs:
		print(grb)
	grbs.close()
	
	return grbs

In [35]:
def build_kdtree(df):
    """
    Build a KDTree from the gridded dataset.
    
    Args:
        df (pd.DataFrame): DataFrame containing the grib data with lat and lon as the index.

    Returns:
        KDTree: A KDTree built on lat/lon coordinates.
        np.array: Grid points as an array of lat/lon pairs.
    """
    # Ensure lat/lon values are floats and create an array of lat/lon pairs
    grid_points = np.array(list(zip(df.index.get_level_values('latitude').astype(float), df.index.get_level_values('longitude').astype(float))))
    
    # Build and return the KDTree
    return KDTree(grid_points), grid_points

## Climate data at station by datetime

In [39]:
# Import station locations
station_file='/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station.csv'
station_df = pd.read_csv(station_file)

In [40]:
for year in range(2024,2026):
    input_dir = f'/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/raw/ERA5_GRIB/{year}'
    output_dir = f'/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_by_datetime_csv/{year}'
    print(year)
    # Check if file already exists
    existing_files = set()
    for filename in os.listdir(output_dir):
        if filename.endswith('.csv'):
            # Extract the date and time from the filename (assuming format: era5_land_YYYYMMDD_HHMM.grib)
            date_part = filename.split('_')[0]  # Extract YYYYMMDD part
            time_part = filename.split('_')[1]  # Extract HHMM part
            existing_files.add(f"{date_part}_{time_part}")
    
    for file in os.listdir(input_dir):
        if file.endswith('.grib'):
            # print(file)
            grbs = read_grib_file(os.path.join(input_dir, file))
            
            date = file.split('_')[2]  # Extract YYYYMMDD part
            time = file.split('_')[3].replace('.grib', '')  # Extract HHMM part
            date_time = date+'_'+time
            
            if date_time in existing_files:
                continue

            else:
                
                with xr.open_dataset(os.path.join(input_dir, file)) as ds:
                    df_raw = ds.to_dataframe()
                    df = df_raw.drop(columns=['number', 'time', 'step', 'surface', 'valid_time'])
                    # get date and time
                    date = file.split('_')[2]
                    time = file.split('_')[3].split('.')[0]
                    
                    df['date'] = date
                    df['time'] = time
                    # build KDTree
                    tree, grid_points = build_kdtree(df)
        
                    station_coords = station_df[['Lat', 'Lon']].to_numpy()
        
                    _, nearest_grid_idx = tree.query(station_coords)
        
                    nearest_grid_points = grid_points[nearest_grid_idx]
                    
                    nearest_grid_points = [(round(lat, 3), round(lon, 3)) for lat, lon in nearest_grid_points]
        
                    df.index = pd.MultiIndex.from_tuples([(round(lat, 3), round(lon, 3)) for lat, lon in df.index], names=['lat', 'lon'])
        
                    nearest_grid_values = df.loc[nearest_grid_points].reset_index()
        
                    station_df_final = pd.concat([station_df.reset_index(drop=True), nearest_grid_values[['t2m', 'u10', 'v10','tp','date','time']]], axis=1)
        
                    station_df_final.to_csv(f'{output_dir}/{date}_{time}_station.csv') 

2024
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 17 hrs:from 202407310000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 16-17 hrs (accum):from 202407310000
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
3:10 metre V wind component:m s**-1 (instant):regular_ll:surface:level 0:fcst time 7 hrs:from 202403190000
4:Total precipitation:m (accum):regular_ll:surface:level 0:fcst time 6-7 hrs (accum):from 202403190000
1:2 metre temperature:K (instant):regular_ll:surface:level 0:fcst time 14 hrs:from 202403020000
2:10 metre U wind component:m s**-1 (instant):regular_ll:surface:level

## Station's climate data by year

In [None]:
# Create output directory/folders
output_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_climate_by_year'
for year in range(2024, 2026):
    os.makedirs(os.path.join(output_dir, str(year)), exist_ok=True)  # exist_ok=True avoids error if the folder already exists

In [41]:
# Input directory
input_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_by_datetime_csv'
# Output directory
output_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_climate_by_year'
# Initiate column names for the output dataframe
column_names = ['Location name', 'ESMI_ID', 'From date', 'To date',
       'District', 'State', 'Category', 'Connection Type', 'Lat', 'Lon', 't2m',
       'u10', 'v10', 'tp', 'date', 'time']

# Iterate through years
for year_folder in os.listdir(input_dir):
    if year_folder in [str(year) for year in range(2024, 2026)]:
        year = int(year_folder)
        print(f'I am working on year {year}')
        # Initiate a dictionary that will store hourly climate data for each station
        station_climate = {}
        # Iterate through hours in a year
        for hour_csv in tqdm(os.listdir(os.path.join(input_dir, year_folder))):
            if hour_csv.endswith('.csv'): # making sure it is a csv file
                # Read csv as pandas dataframe
                df_raw = pd.read_csv(os.path.join(input_dir, year_folder, hour_csv), index_col=0)
                # Extract data from each row and populate the station_climate dictionary. Key: Station id; Value: nd array of hourly climate variables
                for row in range(df_raw.shape[0]):
                    # Extract station id
                    station_id = df_raw.iloc[row].values[0]
                    # Check if the station ID exists in the dictionary as key
                    if station_id in station_climate:
                        existing_climate_array = station_climate[station_id]
                        station_climate[station_id] = np.concatenate(
                            (
                                existing_climate_array,
                            df_raw.iloc[row].values[1:].reshape(1, 16)
                            ),
                            axis=0
                        )
                    else:
                        station_climate[station_id] = df_raw.iloc[row].values[1:].reshape(1, 16)

        for station in station_climate.keys():
            station_year_df = pd.DataFrame(station_climate[station], columns=column_names)
            station_year_df.to_csv(os.path.join(output_dir, str(year), f'station_{station}_{year}.csv'))
    else:
        continue

I am working on year 2025


100%|██████████| 4467/4467 [12:03<00:00,  6.17it/s]


I am working on year 2024


100%|██████████| 8785/8785 [46:13<00:00,  3.17it/s]


# Combine ERA5 at station

In [42]:
# Combine ERA5 climate data into one csv file
input_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/station_climate_by_year/'
folders = os.listdir(input_dir)
df_climate = pd.DataFrame()
for folder in tqdm(folders):
    if not folder.startswith('.'):
        files = os.listdir(input_dir + '/' + folder)
        for file in files:
            if file.endswith('.csv'):
                df = pd.read_csv(input_dir + '/' + folder + '/' + file, index_col=0)
                # add station_id, which is in the file name
                df['station_id'] = file.split('_')[1]
                df_climate = pd.concat([df_climate, df])
            else:
                continue
    else:
        continue
df_climate.drop(['From date', 'To date'], axis=1, inplace=True)
df_climate['date'] = df_climate['date'].astype(str)
df_climate['time'] = df_climate['time'].astype(str).apply(lambda x: x.zfill(4))
df_climate['datetime'] = pd.to_datetime(df_climate['date'] + df_climate['time'], format='%Y%m%d%H%M')
df_climate.set_index('datetime', inplace=True)
df_climate.to_csv("/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/csv/df_climate.csv")

100%|██████████| 14/14 [5:47:30<00:00, 1489.31s/it]  


# Combine ERA5 and ESMI Outage data

## Hourly dataset

Combine hourly outage csv files into one:

In [43]:
# Process hourly voltage data into one csv file
input_dir = "/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/india_hourly"
files = os.listdir(input_dir)
df_blackout = pd.DataFrame()

for file in tqdm(files):
    if file.endswith('.csv'):
        df = pd.read_csv(input_dir + '/' + file)
        df['station_id'] = file.split('_')[-1].split('.')[0]
        df_blackout = pd.concat([df_blackout, df])
    else:
        continue

df_blackout['hour'] = pd.to_datetime(df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')
df_blackout.set_index('hour', inplace=True)
# Save combined voltage data into one csv
df_blackout.to_csv("/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/processed/csv/df_blackout.csv")

100%|██████████| 537/537 [00:24<00:00, 21.95it/s]


Now we are ready to combine the climate dataframe with hourly outage dataframe

In [4]:
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'
df_blackout_537 = pd.read_csv(home_dir +'01_data/processed/csv/df_blackout.csv')
df_climate_538 = pd.read_csv(home_dir +'01_data/processed/csv/df_climate.csv')

In [5]:
# Some stations fall outside the bounds of the ERA5 dataset, we therefore remove them
# Note: these stations does not appear in the GIS shapefiles so none of the visualizations would include them
station_ids_out_bnds = [7, 143, 156, 159, 160, 261, 328, 353, 358, 450, 461, 472, 475, 476, 501, 506, 536, 546]
df_blackout_519 = df_blackout_537[~df_blackout_537.station_id.isin(station_ids_out_bnds)]
df_climate_520 = df_climate_538[~df_climate_538.station_id.isin(station_ids_out_bnds)]

# Double check the unique number of stations after stations
print('The number of unique stations in df_blackout_519 is ', len(df_blackout_519.station_id.unique()))
print('The number of unique stations in df_climate_520 is ', len(df_climate_520.station_id.unique()))

The number of unique stations in df_blackout_519 is  519
The number of unique stations in df_climate_520 is  520


In [6]:
# Merge Voltage dataframe with Climate dataframe
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'
# First, reset the index to make the datetime index a regular column temporarily.
df_climate_reset = df_climate_520.reset_index()
df_blackout_reset = df_blackout_519.reset_index()

# Rename column
df_blackout_reset.rename(columns={"hour": "datetime"}, inplace=True)

# Merge based on 'station_id' and 'datetime' (which was previously the index).
df_merged_inner_519 = pd.merge(df_climate_reset, df_blackout_reset, on=['station_id', 'datetime'], how='inner')

# Calculate wind speed based on the two wind vectors u and v
df_merged_inner_519['wind_speed'] = df_merged_inner_519.apply(lambda row: np.sqrt(row['u10']**2 + row['v10']**2), axis=1)

# Save to csv file
# df_merged_inner_519.to_csv(home_dir + "01_data/processed/csv/df_merged_inner_519.csv")

Prepare hourly data with blackout percentage and 3 weather variables

In [8]:
# Clean and create hourly data in preparation for model
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'
# Select key variables
hourly_519station_3weather = df_merged_inner_519[['station_id', 'datetime', 'pct_blackout', 't2m', 'tp', 'wind_speed']]
# Save to new csv
hourly_519station_3weather.to_csv(home_dir + "01_data/processed/csv/hourly_519station_3weather.csv")

## Daily dataset

In [23]:
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'
hourly_df = pd.read_csv(home_dir + "01_data/processed/csv/hourly_519station_3weather.csv", index_col=0)

hourly_df['datetime'] = pd.to_datetime(hourly_df['datetime'])
hourly_df['date'] = hourly_df['datetime'].dt.date

hourly_df['blackout_time_minutes'] = hourly_df['pct_blackout'] * 60

daily_agg = (
    hourly_df.groupby(['station_id', 'date'])
    .agg(
        t2m_mean=('t2m', 'mean'),
        t2m_max=('t2m', 'max'),
        t2m_min=('t2m', 'min'),
        t2m_median=('t2m', 'median'),
        tp_mean=('tp', 'mean'),
        tp_max=('tp', 'max'),
        tp_min=('tp', 'min'),
        tp_median=('tp', 'median'),
        tp_sum=('tp', 'sum'),
        wind_speed_mean=('wind_speed', 'mean'),
        wind_speed_max=('wind_speed', 'max'),
        wind_speed_min=('wind_speed', 'min'),
        wind_speed_median=('wind_speed', 'median'),
        daily_blackout_minutes=('blackout_time_minutes', 'sum')
    )
    .reset_index()
)
daily_agg.to_csv(home_dir + "01_data/processed/csv/daily_519station_13weather.csv")