In [35]:
# -----********************-----

# Created Time: 2024/10/09

# Last updated: 2024/10/17

# Author: Tara Liu, Yiyi He

### Use Case

# This notebook processes hourly ERA5 climate data and hourly ESMI voltage data and creates merged datasets
# 1. Combine hourly ERA5 climate data at all stations from 2013 to 2024
# 2. Combine hourly voltage data at all stations
# 3. Merge climate and voltage data (inner and outer)

# -----********************-----

In [1]:
# Import libraries
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# Combine ERA5 climate data into one csv file
#
# Walks every sub-folder of `input_dir`, reads each station CSV, tags it with the
# station id taken from the file name, and writes the combined, datetime-indexed
# table to "df_climate.csv" in the current working directory.
input_dir = 'data/station_climate_by_year/'
folders = sorted(os.listdir(input_dir))  # sorted so the output row order is reproducible

# Collect the per-file frames and concatenate once at the end: calling pd.concat
# inside the loop re-copies every accumulated row for each new file.
climate_frames = []
for folder in tqdm(folders):
    folder_path = os.path.join(input_dir, folder)
    # Skip hidden entries (names starting with '.') and anything that is not a directory.
    if folder.startswith('.') or not os.path.isdir(folder_path):
        continue
    files = sorted(os.listdir(folder_path))
    for file in files:
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, file), index_col=0)
        # add station_id, which is in the file name (second '_'-separated token)
        df['station_id'] = file.split('_')[1]
        climate_frames.append(df)

if not climate_frames:
    # Fail with a clear message instead of a KeyError from the column drop below.
    raise FileNotFoundError(f"No climate CSV files found under {input_dir!r}")

df_climate = pd.concat(climate_frames).drop(['From date', 'To date'], axis=1)

# Build the timestamp from the separate date and time columns. `time` is
# left-padded to four characters so it matches the %H%M part of the format.
df_climate['date'] = df_climate['date'].astype(str)
df_climate['time'] = df_climate['time'].astype(str).str.zfill(4)
df_climate['datetime'] = pd.to_datetime(df_climate['date'] + df_climate['time'], format='%Y%m%d%H%M')
df_climate = df_climate.set_index('datetime')
df_climate.to_csv("df_climate.csv")

In [57]:
# Process hourly voltage data into one csv file
#
# Reads every CSV in `input_dir`, tags each with the station id taken from the
# file name, and writes the combined table, indexed by the parsed `hour`
# timestamp, to "df_blackout.csv" in the current working directory.
# NOTE(review): absolute, machine-specific path — the cell only runs on the
# original author's machine; consider a configurable data directory.
input_dir = "/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/final_verification/india_processing/india_hourly"
files = sorted(os.listdir(input_dir))  # sorted so the output row order is reproducible

# Collect the per-file frames and concatenate once at the end: calling pd.concat
# inside the loop re-copies every accumulated row for each new file.
blackout_frames = []
for file in tqdm(files):
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(input_dir, file))
    # station id is the last '_'-separated token of the file name, without the extension
    df['station_id'] = file.split('_')[-1].split('.')[0]
    blackout_frames.append(df)

if not blackout_frames:
    # Fail with a clear message instead of a KeyError on the 'hour' column below.
    raise FileNotFoundError(f"No voltage CSV files found under {input_dir!r}")

df_blackout = pd.concat(blackout_frames)
df_blackout['hour'] = pd.to_datetime(df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')
df_blackout = df_blackout.set_index('hour')
# Save combined voltage data into one csv
df_blackout.to_csv("df_blackout.csv")

100%|█████████████████████████████████████████| 536/536 [01:51<00:00,  4.79it/s]


In [5]:
# Read climate and voltage dataframes
#
# NOTE(review): absolute, machine-specific path. The combine cells write
# df_climate.csv / df_blackout.csv to the working directory, so these files were
# presumably moved here by hand — confirm, or point this at a shared data directory.
processed_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed'

# Parse the timestamp columns while reading so both frames carry real datetime64
# values; otherwise they come back as plain strings and the later merge on
# 'datetime' only matches when both files format timestamps identically.
df_blackout_536 = pd.read_csv(os.path.join(processed_dir, 'df_blackout.csv'), parse_dates=['hour'])
df_climate_538 = pd.read_csv(os.path.join(processed_dir, 'df_climate.csv'), parse_dates=['datetime'])

In [11]:
# Station ids to exclude from both datasets.
# NOTE(review): the name suggests these stations are "out of bounds"; the
# selection criteria are not shown in this notebook — confirm the source of the list.
station_ids_out_bnds = [
    7, 143, 156, 159, 160, 261, 328, 353, 358,
    450, 461, 472, 475, 476, 501, 506, 536, 546,
]

# Keep only the rows whose station_id is not in the exclusion list.
df_blackout_518 = df_blackout_536.loc[lambda d: ~d['station_id'].isin(station_ids_out_bnds)]
df_climate_520 = df_climate_538.loc[lambda d: ~d['station_id'].isin(station_ids_out_bnds)]

In [14]:
# Merge Voltage dataframe with Climate dataframe


def _with_datetime_column(frame, source_name):
    """Return a copy of ``frame`` with its timestamps in a 'datetime' column.

    The timestamps may live in a column called ``source_name`` (or already
    'datetime'), or in an index carrying one of those names. Any other index,
    such as the integer index produced by ``pd.read_csv`` and row filtering, is
    dropped so it does not leak into the merged output as a spurious 'index'
    column. The resulting column is converted to datetime64 so the merge
    compares timestamps rather than text.

    Raises:
        KeyError: if no timestamp column or index can be found.
    """
    index_name = frame.index.name
    index_holds_timestamp = (
        index_name in (source_name, 'datetime') and index_name not in frame.columns
    )
    # drop=True discards a meaningless positional index; drop=False keeps a named timestamp index.
    out = frame.reset_index(drop=not index_holds_timestamp)
    out = out.rename(columns={source_name: 'datetime'})
    if 'datetime' not in out.columns:
        raise KeyError(f"No '{source_name}' or 'datetime' column/index found to merge on")
    out['datetime'] = pd.to_datetime(out['datetime'])
    return out


# Put the timestamps of both frames into a regular, identically named and typed column.
df_climate_reset = _with_datetime_column(df_climate_520, 'datetime')
df_blackout_reset = _with_datetime_column(df_blackout_518, 'hour')

# Merge based on 'station_id' and 'datetime'; the inner join keeps only the
# station-hours present in both datasets.
df_merged_inner_518 = pd.merge(df_climate_reset, df_blackout_reset, on=['station_id', 'datetime'], how='inner')

# Save outputs to csv files
df_merged_inner_518.to_csv("df_merged_inner_518.csv")