In [1]:

import os
import pandas as pd


In [50]:

def read_climate_data(input_dir):
	folders = os.listdir(input_dir)
	df_climate = pd.DataFrame()

	for folder in folders:
		files = os.listdir(input_dir + '/' + folder)

		for file in files:
			df = pd.read_csv(input_dir + '/' + folder + '/' + file)
			# add station_id which is in the file name
			df['station_id'] = file.split('_')[0]
			df_climate = pd.concat([df_climate, df])

	df_climate.drop(['From date', 'To date'], axis=1, inplace=True)
	df_climate['date'] = df_climate['date'].astype(str)
	df_climate['time'] = df_climate['time'].astype(str).apply(lambda x: x.zfill(4))
	df_climate['datetime'] = pd.to_datetime(df_climate['date'] + df_climate['time'], format='%Y%m%d%H%M')
	df_climate.set_index('datetime', inplace=True)



	print('Finished reading data...info below:')
	print(df_climate.info())

	print('index:', df_climate.index)
	return df_climate

In [38]:


def read_blackout_data(input_dir):
	files = os.listdir(input_dir)
	print(files)
	df_blackout = pd.DataFrame()

	for file in files:
		df = pd.read_csv(input_dir + '/' + file)
		df['station_id'] = file.split('_')[-1].split('.')[0]
		df_blackout = pd.concat([df_blackout, df])

	df_blackout['hour'] = pd.to_datetime(
		df_blackout['hour'], format='%Y-%m-%d %H:%M:%S')

	df_blackout.set_index('hour', inplace=True)
	print('Finished reading data...info below:')
	print(df_blackout.info())
	print('index:', df_blackout.index)
	return df_blackout

In [51]:
df_climate = read_climate_data('../../data/station_climate_by_year')

# print columns
print(df_climate.columns)

Finished reading data...info below:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13536 entries, 2013-11-15 10:00:00 to 2014-03-18 04:00:00
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       13536 non-null  int64  
 1   Location name    13536 non-null  object 
 2   ESMI_ID          0 non-null      float64
 3   District         13536 non-null  object 
 4   State            13536 non-null  object 
 5   Category         13536 non-null  object 
 6   Connection Type  13536 non-null  object 
 7   Lat              13536 non-null  float64
 8   Lon              13536 non-null  float64
 9   t2m              13536 non-null  float64
 10  u10              13536 non-null  float64
 11  v10              13536 non-null  float64
 12  tp               13536 non-null  float64
 13  date             13536 non-null  object 
 14  time             13536 non-null  object 
 15  station_id       13536 non-null  

In [52]:
df_blackout = read_blackout_data('../../data/india_hourly')

# print(df_blackout['station_id'].unique())
# print columns name
print('columns:',df_blackout.columns)



['hourly_station_217.csv', 'hourly_station_563.csv', 'hourly_station_445.csv']
Finished reading data...info below:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 34200 entries, 2014-11-24 13:00:00 to 2018-10-01 08:00:00
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pct_blackout  34200 non-null  float64
 1   station_id    34200 non-null  object 
dtypes: float64(1), object(1)
memory usage: 801.6+ KB
None
index: DatetimeIndex(['2014-11-24 13:00:00', '2014-11-24 14:00:00',
               '2014-11-24 15:00:00', '2014-11-24 16:00:00',
               '2014-11-24 17:00:00', '2014-11-24 18:00:00',
               '2014-11-24 19:00:00', '2014-11-24 20:00:00',
               '2014-11-24 21:00:00', '2014-11-24 22:00:00',
               ...
               '2018-08-22 22:00:00', '2018-08-22 23:00:00',
               '2018-08-23 00:00:00', '2018-08-23 01:00:00',
               '2018-08-23 02:00:00', '2018-08-23 03:00:00

In [53]:

print(df_climate.index)
print(df_blackout.index)


DatetimeIndex(['2013-11-15 10:00:00', '2013-12-09 02:00:00',
               '2013-07-14 04:00:00', '2013-11-13 18:00:00',
               '2013-09-14 03:00:00', '2013-07-10 14:00:00',
               '2013-10-30 12:00:00', '2013-11-11 00:00:00',
               '2013-08-09 09:00:00', '2013-11-17 08:00:00',
               ...
               '2014-06-26 23:00:00', '2014-11-27 18:00:00',
               '2014-02-07 16:00:00', '2014-08-01 11:00:00',
               '2014-04-25 23:00:00', '2014-01-25 04:00:00',
               '2014-11-19 18:00:00', '2014-09-20 03:00:00',
               '2014-06-18 23:00:00', '2014-03-18 04:00:00'],
              dtype='datetime64[ns]', name='datetime', length=13536, freq=None)
DatetimeIndex(['2014-11-24 13:00:00', '2014-11-24 14:00:00',
               '2014-11-24 15:00:00', '2014-11-24 16:00:00',
               '2014-11-24 17:00:00', '2014-11-24 18:00:00',
               '2014-11-24 19:00:00', '2014-11-24 20:00:00',
               '2014-11-24 21:00:00', '2014-11

In [55]:


# merge df_climate, df_blackout with datetime and station_id: 
# in df_climate, the merge index is datetime and station_id
# in df_blackout, index is hour and station_id

df = pd.merge(df_climate, df_blackout, left_index=True, right_index=True, how='inner', suffixes=('_climate', '_blackout'))

print(df.columns)



Index(['Unnamed: 0', 'Location name', 'ESMI_ID', 'District', 'State',
       'Category', 'Connection Type', 'Lat', 'Lon', 't2m', 'u10', 'v10', 'tp',
       'date', 'time', 'station_id_climate', 'pct_blackout',
       'station_id_blackout'],
      dtype='object')


In [56]:
print(df.info())
print(df.head())

df.to_csv('../../data/merged_data.csv')

# fix: id seems not the same, need to check the data

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 655 entries, 2014-11-24 13:00:00 to 2014-12-31 23:00:00
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           655 non-null    int64  
 1   Location name        655 non-null    object 
 2   ESMI_ID              0 non-null      float64
 3   District             655 non-null    object 
 4   State                655 non-null    object 
 5   Category             655 non-null    object 
 6   Connection Type      655 non-null    object 
 7   Lat                  655 non-null    float64
 8   Lon                  655 non-null    float64
 9   t2m                  655 non-null    float64
 10  u10                  655 non-null    float64
 11  v10                  655 non-null    float64
 12  tp                   655 non-null    float64
 13  date                 655 non-null    object 
 14  time                 655 non-null    object 
 15  sta

In [45]:
# select the t2m, u10, v10, and station_id for datetime 2014-11-24 13:00:00, station Sikandarabad

print(df.loc['2014-11-24 13:00:00', ['t2m', 'u10', 'v10', 'station_id']])

# select the t2m, u10, v10, and station_id for datetime 2014-11-24 13:00:00, station Sikandarabad


t2m           292.49493
u10            2.014306
v10           -1.304936
station_id          217
Name: 2014-11-24 13:00:00, dtype: object
