In [35]:
import os
import pandas as pd
from tqdm import tqdm

In [46]:
def read_blackout_data(input_dir):
    """Load every blackout CSV found in ``input_dir`` into one DataFrame.

    The station id is derived from each file name: the text after the last
    underscore, up to the first dot that follows (``foo_103.csv`` -> ``'103'``).

    Parameters
    ----------
    input_dir : str
        Directory holding the CSV files. Each file must contain an ``hour``
        column formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked together, with an added string
        ``station_id`` column, indexed by the parsed ``hour`` timestamps.

    Raises
    ------
    ValueError
        If ``input_dir`` contains no ``.csv`` files.
    """
    # Only parse CSV files; directory listings may also contain hidden or
    # unrelated entries (e.g. .DS_Store) that would corrupt the result.
    files = [name for name in os.listdir(input_dir) if name.endswith('.csv')]
    print(files)
    if not files:
        raise ValueError(f'No .csv files found in {input_dir!r}')

    frames = []
    for file in files:
        df = pd.read_csv(os.path.join(input_dir, file))
        df['station_id'] = file.split('_')[-1].split('.')[0]
        frames.append(df)

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously read rows on every iteration. The per-file row index is
    # discarded because 'hour' becomes the index below.
    df_blackout = pd.concat(frames, ignore_index=True)

    df_blackout['hour'] = pd.to_datetime(
        df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')
    df_blackout = df_blackout.set_index('hour')

    print('Finished reading data...info below:')
    # info() writes its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_blackout.info()
    print('index:', df_blackout.index)
    return df_blackout

# Process climate dataset into one csv file

In [40]:
# Gather every per-station climate CSV stored under the per-year folders and
# combine them into a single DataFrame indexed by observation datetime.
input_dir = 'data/station_climate_by_year/'
folders = os.listdir(input_dir)

# Frames are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every file.
climate_frames = []
for folder in tqdm(folders):
    if folder.startswith('.'):
        continue  # skip hidden entries
    folder_path = os.path.join(input_dir, folder)
    for file in os.listdir(folder_path):
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, file), index_col=0)
        # add station_id, which is in the file name
        df['station_id'] = file.split('_')[1]
        climate_frames.append(df)

if not climate_frames:
    raise ValueError(f'No .csv files found under {input_dir!r}')

df_climate = pd.concat(climate_frames)
del climate_frames  # release the per-file frames once they are combined

df_climate = df_climate.drop(['From date', 'To date'], axis=1)
df_climate['date'] = df_climate['date'].astype(str)
# Left-pad the time to four characters so that e.g. '200' parses as 02:00.
df_climate['time'] = df_climate['time'].astype(str).str.zfill(4)
df_climate['datetime'] = pd.to_datetime(df_climate['date'] + df_climate['time'], format='%Y%m%d%H%M')
df_climate = df_climate.set_index('datetime')

100%|████████████████████████████████████████| 13/13 [2:03:59<00:00, 572.30s/it]


In [41]:
# Persist the combined climate table, including its datetime index, to disk.
df_climate.to_csv(path_or_buf="df_climate.csv")

In [45]:
# Preview the first three rows of the combined climate table.
df_climate.head(n=3)

Unnamed: 0_level_0,Location name,ESMI_ID,District,State,Category,Connection Type,Lat,Lon,t2m,u10,v10,tp,date,time,station_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-11-15 10:00:00,Tiwaripur (E),,Sultanpur,Uttar Pradesh,Gram Panchayat,Domestic,26.178897,82.265947,298.70874,-0.164597,-0.954286,0.0,20131115,1000,526
2013-12-09 02:00:00,Tiwaripur (E),,Sultanpur,Uttar Pradesh,Gram Panchayat,Domestic,26.178897,82.265947,284.7827,1.009763,-0.009255,0.0,20131209,200,526
2013-07-14 04:00:00,Tiwaripur (E),,Sultanpur,Uttar Pradesh,Gram Panchayat,Domestic,26.178897,82.265947,302.07202,-1.178657,0.284906,0.000858,20130714,400,526


# Process hourly voltage data into one csv file

In [57]:
# NOTE(review): machine-specific absolute path; this cell only runs where this
# directory exists. Consider moving it to a configurable data-directory setting.
input_dir = "/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/final_verification/india_processing/india_hourly"
files = os.listdir(input_dir)

# Read each station's hourly CSV, collecting the frames in a list so they can
# be concatenated once; calling pd.concat inside the loop re-copies every
# previously read row on each iteration.
blackout_frames = []
for file in tqdm(files):
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(input_dir, file))
    # station id = text after the last underscore, up to the following dot
    df['station_id'] = file.split('_')[-1].split('.')[0]
    blackout_frames.append(df)

# Fall back to an empty frame when the directory holds no CSV files.
df_blackout = pd.concat(blackout_frames) if blackout_frames else pd.DataFrame()
del blackout_frames  # release the per-file frames once they are combined

100%|█████████████████████████████████████████| 536/536 [01:51<00:00,  4.79it/s]


In [58]:
# Parse the 'hour' strings into timestamps and make them the index.
# The guard makes the cell safe to re-run: once 'hour' is already the index,
# the column no longer exists and the unguarded version raised a KeyError.
if df_blackout.index.name != 'hour':
    df_blackout['hour'] = pd.to_datetime(df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')
    df_blackout.set_index('hour', inplace=True)

In [63]:
# Persist the combined blackout table, including its 'hour' index, to disk.
df_blackout.to_csv(path_or_buf="df_blackout.csv")

# Merge Voltage dataframe with Climate dataframe

In [65]:
# Turn each frame's time index back into an ordinary column so that it can be
# used as a merge key; the original indexed frames are left untouched.
df_climate_reset = df_climate.reset_index(drop=False)
df_blackout_reset = df_blackout.reset_index(drop=False)

Unnamed: 0,hour,pct_blackout,station_id
0,2015-10-08 00:00:00,0.0,103
1,2015-10-08 01:00:00,0.0,103
2,2015-10-08 02:00:00,0.0,103
3,2015-10-08 03:00:00,0.95,103
4,2015-10-08 04:00:00,1.0,103


In [71]:
# Give the blackout timestamp column the same name as the climate one so both
# frames share the 'datetime' merge key.
df_blackout_reset = df_blackout_reset.rename(columns={"hour": "datetime"})

In [75]:
# Outer join on station and timestamp: keeps every row from both frames,
# leaving missing values where one side has no matching record.
df_merged_outer = df_climate_reset.merge(
    df_blackout_reset,
    how='outer',
    on=['station_id', 'datetime'],
)

In [76]:
# Inner join on station and timestamp: keeps only the rows present in both
# the climate and the blackout frames.
df_merged_inner = df_climate_reset.merge(
    df_blackout_reset,
    how='inner',
    on=['station_id', 'datetime'],
)

In [89]:
# Write the outer-joined table to disk.
df_merged_outer.to_csv(path_or_buf="df_merged_outer.csv")

In [90]:
# Write the inner-joined table to disk.
df_merged_inner.to_csv(path_or_buf="df_merged_inner.csv")