In [2]:
# Here, we combine energy_use.csv and energy_production.csv into the proper time series format and output energy_final.csv

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the raw consumption and production tables, in that order.
df_use, df_prod = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../SharedData/dataset-generation/energy_use.csv',
        '../../SharedData/dataset-generation/energy_production.csv',
    )
)

# Empty skeleton of the output table: one (initially empty) list per output column.
d = {
    column_name: []
    for column_name in (
        'state',
        'date',
        'monthly_energy_prod',
        'monthly_energy_use',
        'yearly_energy_prod',
        'yearly_energy_use',
    )
}
new_df = pd.DataFrame(data=d)

In [3]:
# Two-letter postal codes of the 50 US states, ten per row.
abbreviations = (
    "AL AK AZ AR CA CO CT DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

In [4]:
# Keep only the rows whose 'State' value is one of the listed state abbreviations.
use_is_state = df_use['State'].isin(abbreviations)
prod_is_state = df_prod['State'].isin(abbreviations)

df_use = df_use[use_is_state]
df_prod = df_prod[prod_is_state]

In [5]:
# Peek at the first five rows of the filtered consumption table as a sanity check.
df_use.head(n=5)

Unnamed: 0,State,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,59303,70020,76642,78488,82793,85319,100481,112625,119992,...,587460,586053,602906,582922,590212,593292,595231,629029,688282,724059
1,AL,842283,806682,853424,885811,949411,1011125,1055347,1100904,1208038,...,1882362,1929314,1880544,1895744,1867484,1913464,1883941,1793090,1924516,1902374
2,AR,417153,423192,446994,474761,515874,515759,553146,581814,620382,...,1092848,1108345,1043455,1036998,1043543,1103282,1078044,1010754,1054467,1052517
3,AZ,273431,294660,313581,337213,357961,362858,390820,400280,442172,...,1419110,1422839,1439724,1462217,1462699,1474472,1493085,1435329,1481293,1526882
4,CA,3360697,3513822,3623382,3825677,4112125,4267121,4525141,4703411,4979463,...,7162622,7073527,7169881,7168226,7176823,7317326,7258664,6462830,6808107,6882442


In [6]:
# Expand the wide yearly tables (one row per state, one column per year) into a
# long monthly table: every (state, year) pair becomes 12 rows of new_df, one per
# month, for a total of 12*63*50 entries on the 1960-2022 data.
#
# Only the December row of each year gets a monthly value (yearly total / 12);
# months 1-11 are left empty so that they can be interpolated afterwards.
# The yearly columns keep the raw comma-formatted strings from the CSVs.

OUTPUT_COLUMNS = ['state', 'date', 'monthly_energy_prod', 'monthly_energy_use',
                  'yearly_energy_prod', 'yearly_energy_use']


def _monthly_average(raw_total):
    """Turn a comma-formatted yearly total such as '1,234,567' into total / 12."""
    return int(raw_total.replace(',', '')) / 12


def _expand_to_monthly(prod, use):
    """Build the long monthly table from the wide production and consumption tables.

    Parameters
    ----------
    prod, use : pandas.DataFrame
        Wide tables whose first column holds the state code and whose remaining
        columns are years containing comma-formatted string totals.

    Returns
    -------
    pandas.DataFrame
        One row per (state, year, month) with the columns in OUTPUT_COLUMNS.

    Raises
    ------
    ValueError
        If `use` has no value for a state/year that is present in `prod`, or if
        `use` lists the same state more than once (raised by reindex).
    """
    state_col = prod.columns[0]
    year_cols = list(prod.columns[1:])

    # Match consumption to production by state code and year label instead of by
    # row/column position, so a different ordering in the two CSVs cannot
    # silently pair one state's production with another state's consumption.
    use_aligned = use.set_index(state_col).reindex(index=prod[state_col], columns=year_cols)
    if use_aligned.isna().to_numpy().any():
        raise ValueError('energy_use has no value for some state/year present in energy_production')

    # Collect plain tuples and build the frame once at the end; enlarging a
    # DataFrame one row at a time re-allocates it on every insertion.
    records = []
    for row_pos, state in enumerate(prod[state_col]):
        for col_pos, year in enumerate(year_cols):
            yearly_prod = prod.iat[row_pos, col_pos + 1]   # +1 skips the state column
            yearly_use = use_aligned.iat[row_pos, col_pos]
            december_prod = _monthly_average(yearly_prod)
            december_use = _monthly_average(yearly_use)
            for month in range(1, 13):
                is_december = month == 12
                records.append((
                    state,
                    f'{year}-{month:02d}',
                    december_prod if is_december else None,
                    december_use if is_december else None,
                    yearly_prod,
                    yearly_use,
                ))

    return pd.DataFrame(records, columns=OUTPUT_COLUMNS)


new_df = _expand_to_monthly(df_prod, df_use)

In [7]:
# The yearly totals are still strings with thousands separators (e.g. "1,234,567");
# strip the commas and cast each column to int.

for yearly_col in ('yearly_energy_use', 'yearly_energy_prod'):
    without_commas = new_df[yearly_col].str.replace(',', '')
    new_df[yearly_col] = without_commas.astype(int)

In [8]:
# Fill in the missing months by linear interpolation, separately for each state.
#
# Only the December row of each year carries a monthly value at this point, so
# interpolating within a state draws a straight line from one December to the next.
# NOTE: interpolate() fills forward by default, so the rows before a state's first
# December have no earlier anchor and remain NaN.

for monthly_col in ('monthly_energy_prod', 'monthly_energy_use'):
    # The column may arrive as object dtype (None mixed with floats); interpolating
    # an object-dtype Series is deprecated in pandas, so coerce to float first.
    new_df[monthly_col] = pd.to_numeric(new_df[monthly_col])
    # Group by state so values are never interpolated across a state boundary.
    new_df[monthly_col] = new_df.groupby('state')[monthly_col].transform(
        lambda series: series.interpolate(method='linear')
    )


In [9]:
# Net energy flow = production minus consumption, at both monthly and yearly resolution.
for flow_col, prod_col, use_col in (
    ('monthly_energy_flow', 'monthly_energy_prod', 'monthly_energy_use'),
    ('yearly_energy_flow', 'yearly_energy_prod', 'yearly_energy_use'),
):
    new_df[flow_col] = new_df[prod_col] - new_df[use_col]

In [10]:
# Write the finished table to disk without the positional row index.
output_csv = '../../SharedData/dataset-generation-final/energy_final.csv'
new_df.to_csv(output_csv, index=False)