In [9]:
# Here we combine energy-use.csv and energy-production.csv into the proper time-series format and output energy-use-prod-final.csv

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Raw input tables (read from the shared dataset-generation folder).
df_use = pd.read_csv(
    '../../SharedData/dataset-generation/energy-use.csv'
)
df_prod = pd.read_csv(
    '../../SharedData/dataset-generation/energy-production.csv'
)

# Empty output frame: one (initially empty) column per field of the final table.
d = {
    column: []
    for column in (
        'state',
        'date',
        'monthly_energy_prod',
        'monthly_energy_use',
        'yearly_energy_prod',
        'yearly_energy_use',
    )
}
new_df = pd.DataFrame(d)

In [10]:
# Two-letter state codes used to filter the input tables
# (50 entries; "DC" is not part of the list).
abbreviations = [
    "AL", "AK", "AZ", "AR", "CA",
    "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA",
    "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO",
    "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH",
    "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT",
    "VA", "WA", "WV", "WI", "WY",
]

In [11]:
# Keep only the rows whose 'State' value is one of the listed abbreviations;
# every other row of the raw tables is dropped.
df_use, df_prod = (
    frame.loc[frame['State'].isin(abbreviations)]
    for frame in (df_use, df_prod)
)

In [12]:
# Expand the yearly tables into one row per state, year and month.
#
# Column 0 of df_prod holds the state code and every later column is treated
# as one year (its header is used as the year label). For each state/year,
# 12 rows dated "<year>-01" ... "<year>-12" are produced:
#   * yearly_energy_prod / yearly_energy_use carry the raw yearly value,
#     unchanged (any thousands separators are still present at this point);
#   * monthly_energy_prod / monthly_energy_use are set only on the "-12" row
#     (yearly total / 12) and are NaN on the other eleven rows so they can be
#     filled in afterwards.
# For the 1960-2022 data this was written for, that is 12*63*50 rows.
#
# The rows are collected in a list and converted to a DataFrame once, instead
# of enlarging new_df one row at a time with `.loc[idx] = ...`.

# df_use is read by position, so it has to list the same states in the same
# order as df_prod; fail loudly rather than silently mixing up states.
if df_prod['State'].tolist() != df_use['State'].tolist():
    raise ValueError('df_prod and df_use must contain the same states in the same order')

records = []
for state in range(len(df_prod)):
    prod_row = df_prod.iloc[state]
    # assumes df_use has the same column layout as df_prod — TODO confirm
    use_row = df_use.iloc[state]
    # .iloc keeps these lookups positional; a plain integer key on a
    # label-indexed Series only works through a deprecated pandas fallback.
    state_name = prod_row.iloc[0]

    for year in range(1, len(prod_row)):
        year_label = df_prod.columns[year]
        yearly_prod = prod_row.iloc[year]
        yearly_use = use_row.iloc[year]

        # The yearly values are strings with thousands separators; the
        # December row gets one twelfth of the yearly total.
        december_prod = int(yearly_prod.replace(',', '')) / 12
        december_use = int(yearly_use.replace(',', '')) / 12

        for month in range(1, 13):
            is_december = month == 12
            records.append([
                state_name,
                year_label + '-' + '{:02d}'.format(month),
                # NaN (not None) keeps the monthly columns numeric.
                december_prod if is_december else float('nan'),
                december_use if is_december else float('nan'),
                yearly_prod,
                yearly_use,
            ])

new_df = pd.DataFrame(
    records,
    columns=['state', 'date', 'monthly_energy_prod', 'monthly_energy_use',
             'yearly_energy_prod', 'yearly_energy_use'],
)

In [13]:
# Strip the thousands separators from the yearly totals and store them as integers.
for yearly_col in ('yearly_energy_use', 'yearly_energy_prod'):
    without_commas = new_df[yearly_col].str.replace(',', '')
    new_df[yearly_col] = without_commas
    new_df[yearly_col] = new_df[yearly_col].astype(int)

In [14]:
# Fill in the missing monthly values by linear interpolation between the
# end-of-year data points, one state at a time so that values are never
# interpolated across two different states.
for abbr in abbreviations:
    state_rows = new_df.state == abbr
    for monthly_col in ('monthly_energy_prod', 'monthly_energy_use'):
        filled = new_df[state_rows][monthly_col].interpolate(method='linear')
        new_df.loc[state_rows, monthly_col] = filled


In [15]:
# Net energy flow = production minus use, at both monthly and yearly resolution.
monthly_flow = new_df['monthly_energy_prod'].sub(new_df['monthly_energy_use'])
yearly_flow = new_df['yearly_energy_prod'].sub(new_df['yearly_energy_use'])

new_df['monthly_energy_flow'] = monthly_flow
new_df['yearly_energy_flow'] = yearly_flow

In [None]:
# Write the finished table to disk as CSV, without the row index.
output_path = '../../SharedData/dataset-generation-final/energy-use-prod-final.csv'
new_df.to_csv(output_path, index=False)