In [1]:

import os
import pandas as pd


In [127]:

def read_climate_data(input_dir):
	folders = os.listdir(input_dir)
	df_climate = pd.DataFrame()

	for folder in folders:
		files = os.listdir(input_dir + '/' + folder)

		for file in files:
			df = pd.read_csv(input_dir + '/' + folder + '/' + file)

			df['station_id'] = file.split('_')[1]

			df_climate = pd.concat([df_climate, df])

	df_climate.drop(['From date', 'To date'], axis=1, inplace=True)
	df_climate['date'] = df_climate['date'].astype(str)
	df_climate['time'] = df_climate['time'].astype(str).apply(lambda x: x.zfill(4))
	df_climate['datetime'] = pd.to_datetime(df_climate['date'] + df_climate['time'], format='%Y%m%d%H%M')
	df_climate.set_index('datetime', inplace=True)



	print('Finished reading data...info below:')
	print(df_climate.info())

	print('index:', df_climate.index)
	return df_climate

In [128]:

def read_blackout_data(input_dir):
	files = os.listdir(input_dir)
	print(files)
	df_blackout = pd.DataFrame()

	for file in files:
		df = pd.read_csv(input_dir + '/' + file)
		df['station_id'] = file.split('_')[-1].split('.')[0]
		df_blackout = pd.concat([df_blackout, df])

	df_blackout['hour'] = pd.to_datetime(
		df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')
	
	# rename hour to datetime
	df_blackout.rename(columns={'hour': 'datetime'}, inplace=True)

	df_blackout.set_index('datetime', inplace=True)
	print('Finished reading data...info below:')
	print(df_blackout.info())
	print('index:', df_blackout.index)
	return df_blackout

In [129]:
df_climate = read_climate_data('../../data/station_climate_by_year')

# print columns
print(df_climate.columns)

Finished reading data...info below:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 22296 entries, 2013-11-15 10:00:00 to 2014-03-18 04:00:00
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       22296 non-null  int64  
 1   Location name    22296 non-null  object 
 2   ESMI_ID          0 non-null      float64
 3   District         22296 non-null  object 
 4   State            22296 non-null  object 
 5   Category         22296 non-null  object 
 6   Connection Type  22296 non-null  object 
 7   Lat              22296 non-null  float64
 8   Lon              22296 non-null  float64
 9   t2m              22296 non-null  float64
 10  u10              22296 non-null  float64
 11  v10              22296 non-null  float64
 12  tp               22296 non-null  float64
 13  date             22296 non-null  object 
 14  time             22296 non-null  object 
 15  station_id       22296 non-null  

In [130]:
df_blackout = read_blackout_data('../../data/india_hourly')

# print(df_blackout['station_id'].unique())
# print columns name

print('columns:',df_blackout.columns)

['hourly_station_217.csv', 'hourly_station_563.csv', 'hourly_station_445.csv']
Finished reading data...info below:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 34200 entries, 2014-11-24 13:00:00 to 2018-10-01 08:00:00
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pct_blackout  34200 non-null  float64
 1   station_id    34200 non-null  object 
dtypes: float64(1), object(1)
memory usage: 801.6+ KB
None
index: DatetimeIndex(['2014-11-24 13:00:00', '2014-11-24 14:00:00',
               '2014-11-24 15:00:00', '2014-11-24 16:00:00',
               '2014-11-24 17:00:00', '2014-11-24 18:00:00',
               '2014-11-24 19:00:00', '2014-11-24 20:00:00',
               '2014-11-24 21:00:00', '2014-11-24 22:00:00',
               ...
               '2018-08-22 22:00:00', '2018-08-22 23:00:00',
               '2018-08-23 00:00:00', '2018-08-23 01:00:00',
               '2018-08-23 02:00:00', '2018-08-23 03:00:00

In [131]:
print(df_climate.index)
print(df_blackout.index)

DatetimeIndex(['2013-11-15 10:00:00', '2013-12-09 02:00:00',
               '2013-07-14 04:00:00', '2013-11-13 18:00:00',
               '2013-09-14 03:00:00', '2013-07-10 14:00:00',
               '2013-10-30 12:00:00', '2013-11-11 00:00:00',
               '2013-08-09 09:00:00', '2013-11-17 08:00:00',
               ...
               '2014-06-26 23:00:00', '2014-11-27 18:00:00',
               '2014-02-07 16:00:00', '2014-08-01 11:00:00',
               '2014-04-25 23:00:00', '2014-01-25 04:00:00',
               '2014-11-19 18:00:00', '2014-09-20 03:00:00',
               '2014-06-18 23:00:00', '2014-03-18 04:00:00'],
              dtype='datetime64[ns]', name='datetime', length=22296, freq=None)
DatetimeIndex(['2014-11-24 13:00:00', '2014-11-24 14:00:00',
               '2014-11-24 15:00:00', '2014-11-24 16:00:00',
               '2014-11-24 17:00:00', '2014-11-24 18:00:00',
               '2014-11-24 19:00:00', '2014-11-24 20:00:00',
               '2014-11-24 21:00:00', '2014-11

In [132]:
# First, reset the index to make the datetime index a regular column temporarily.
df_climate_reset = df_climate.reset_index()
df_blackout_reset = df_blackout.reset_index()

df_blackout_reset.head()

Unnamed: 0,datetime,pct_blackout,station_id
0,2014-11-24 13:00:00,0.0,217
1,2014-11-24 14:00:00,0.0,217
2,2014-11-24 15:00:00,0.066667,217
3,2014-11-24 16:00:00,0.066667,217
4,2014-11-24 17:00:00,0.016667,217


In [133]:

# Now, group by 'station_id' on both dataframes.
# This step isn't strictly necessary for the merge but helps in understanding how data aligns per station.
climate_grouped = df_climate_reset.groupby('station_id')
blackout_grouped = df_blackout_reset.groupby('station_id')

climate_grouped.head()

Unnamed: 0.1,datetime,Unnamed: 0,Location name,ESMI_ID,District,State,Category,Connection Type,Lat,Lon,t2m,u10,v10,tp,date,time,station_id
0,2013-11-15 10:00:00,0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,300.76343,-2.888229,-1.29535,0.0,20131115,1000,217
1,2013-12-09 02:00:00,1,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,285.26416,-1.690433,0.210471,0.0,20131209,200,217
2,2013-07-14 04:00:00,2,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,295.84448,4.12603,1.337641,0.001095867,20130714,400,217
3,2013-11-13 18:00:00,3,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,293.23718,-1.077642,-0.137814,1.724326e-06,20131113,1800,217
4,2013-09-14 03:00:00,4,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,297.41895,-0.517675,0.295947,3.697351e-07,20130914,300,217
4776,2014-07-16 03:00:00,0,Sikandarabad,,Sitapur,Uttar Pradesh,Gram Panchayat,Domestic,27.954139,80.491162,301.31006,-5.47834,2.275894,6.176531e-06,20140716,300,445
4777,2014-11-17 17:00:00,1,Sikandarabad,,Sitapur,Uttar Pradesh,Gram Panchayat,Domestic,27.954139,80.491162,288.14508,2.081586,-0.421135,1.706016e-06,20141117,1700,445
4778,2014-05-15 03:00:00,2,Sikandarabad,,Sitapur,Uttar Pradesh,Gram Panchayat,Domestic,27.954139,80.491162,304.364,4.959698,-2.010355,0.0,20140515,300,445
4779,2014-02-09 19:00:00,3,Sikandarabad,,Sitapur,Uttar Pradesh,Gram Panchayat,Domestic,27.954139,80.491162,284.62054,3.14785,-0.851257,8.663768e-07,20140209,1900,445
4780,2014-06-09 11:00:00,4,Sikandarabad,,Sitapur,Uttar Pradesh,Gram Panchayat,Domestic,27.954139,80.491162,316.88672,4.568093,-1.199785,8.523463e-07,20140609,1100,445


In [134]:

# Merge based on 'station_id' and 'datetime' (which was previously the index).
df_merged = pd.merge(df_climate_reset, df_blackout_reset, on=[
                     'station_id', 'datetime'], how='outer')


In [135]:
df_merged.to_csv('merged_data1.csv')
df_merged.head()


Unnamed: 0.1,datetime,Unnamed: 0,Location name,ESMI_ID,District,State,Category,Connection Type,Lat,Lon,t2m,u10,v10,tp,date,time,station_id,pct_blackout
0,2013-11-15 10:00:00,0.0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,300.76343,-2.888229,-1.29535,0.0,20131115,1000,217,
1,2013-12-09 02:00:00,1.0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,285.26416,-1.690433,0.210471,0.0,20131209,200,217,
2,2013-07-14 04:00:00,2.0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,295.84448,4.12603,1.337641,0.001095867,20130714,400,217,
3,2013-11-13 18:00:00,3.0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,293.23718,-1.077642,-0.137814,1.724326e-06,20131113,1800,217,
4,2013-09-14 03:00:00,4.0,Kalpataru Nagar,,Nashik,Maharashtra,District Headquarters,Domestic,19.977226,73.798872,297.41895,-0.517675,0.295947,3.697351e-07,20130914,300,217,
