In [16]:
# Convert gdp-1997-2023.csv into the proper time series format and write the result to gdp-1997-2023-final-final.csv.
# gdp-1997-2023.csv must be placed in the ./Data directory.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Raw GDP table, read from the ./Data directory.
df = pd.read_csv('./Data/gdp-1997-2023.csv')

# Column layout shared by the two (initially empty) output frames:
# new_df is filled with one row per state and year, new_new_df with one row
# per state and month.
d = {
    column: []
    for column in ('state', 'date', 'gdp_rel_2017', 'yearly_gdp_rel_2017')
}
new_df, new_new_df = pd.DataFrame(data=d), pd.DataFrame(data=d)

In [18]:
# Full state name -> two-letter abbreviation for the 50 states, kept in the
# same order as the original parallel lists.
name_dict = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO",
    "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}

# Parallel lists derived from the mapping (dicts preserve insertion order).
state_names = list(name_dict.keys())
abbreviations = list(name_dict.values())

In [19]:
# Keep only the LineCode == 1.0 rows that belong to one of the 50 named states,
# drop the descriptive columns that are not used later on, and renumber the
# remaining rows from 0.
# NOTE(review): LineCode 1.0 is presumably the all-industry total -- confirm against the data source.
df = (
    df.loc[(df['LineCode'] == 1.0) & df['GeoName'].isin(state_names)]
    .drop(
        columns=['GeoFIPS', 'Region', 'IndustryClassification',
                 'TableName', 'Description', 'Unit'],
    )
    .reset_index(drop=True)
)

In [20]:
# Reshape the wide table (one row per state, one column per year) into a long
# table with one row per (state, year).
#
# The rows are collected in a plain list and the frame is built once. The
# previous version wrote each value through chained indexing
# (new_df.loc[k]['col'] = value), which assigns into a temporary object and is
# silently a no-op under pandas Copy-on-Write; it also re-cast the whole frame
# on every iteration and appended duplicate rows when the cell was re-run.
#
# Assumes df holds at least 50 rows (one per state) and has a string-named
# column for every year 1997..2023 -- TODO confirm against the input file.
records = []
for i in range(50):
    state_row = df.iloc[i]
    for year in range(1997, 2024):
        records.append({
            'state': state_row['GeoName'],
            'date': str(year),
            'gdp_rel_2017': state_row[str(year)],
            'yearly_gdp_rel_2017': None,   # not populated at this stage
        })

new_df = pd.DataFrame(
    records,
    columns=['state', 'date', 'gdp_rel_2017', 'yearly_gdp_rel_2017'],
).astype('object')

# Keep the 1-based row labels the row-by-row construction used to produce.
new_df.index = range(1, len(new_df) + 1)

# Replace full state names with their two-letter abbreviations.
new_df['state'] = new_df['state'].map(name_dict)

In [22]:
# Loop through each of the 27 years of each state in new_df and copy that
# year's GDP into 12 monthly rows of new_new_df, for a total of 12*27*50 entries.
#
# Per monthly row:
#   * yearly_gdp_rel_2017 always carries the yearly value;
#   * gdp_rel_2017 is yearly/12 in December and missing (NaN) in every other
#     month, so that it can be filled by interpolation afterwards.
#
# The rows are gathered in a list and the frame is created once instead of
# being enlarged one .loc assignment at a time. The state abbreviation is read
# with new_df['state'].iloc[...]: the former new_df.iloc[...][0] used an integer
# key on a Series with string labels, which pandas has deprecated and will
# treat as a label lookup.

monthly_rows = []
idx = 0             # position of the current (state, year) row in new_df

for state in range(50):
    state_abbr = new_df['state'].iloc[state * 27]   # first row of this state's 27-year block
    for year in range(1997, 2024):
        yearly_gdp = new_df['gdp_rel_2017'].iloc[idx]
        for month in range(1, 13):
            monthly_gdp = yearly_gdp / 12 if month == 12 else np.nan
            monthly_rows.append([
                state_abbr,
                '{}-{:02d}'.format(year, month),
                monthly_gdp,
                yearly_gdp,
            ])
        idx += 1

new_new_df = pd.DataFrame(
    monthly_rows,
    columns=['state', 'date', 'gdp_rel_2017', 'yearly_gdp_rel_2017'],
)

In [23]:
# The frame construction above is far from efficient, but it works; the steps
# below tidy up its result.
#
# Parse every date string, write it back in 'YYYY-MM' form, then order the rows
# by state and date and renumber them from 0.
new_new_df = (
    new_new_df
    .assign(date=new_new_df['date'].apply(pd.to_datetime).dt.strftime('%Y-%m'))
    .sort_values(by=['state', 'date'])
    .reset_index(drop=True)
)

In [24]:
# Linear interpolation, state by state, to fill in the months between the
# values present in gdp_rel_2017; the result is stored in the new column
# gdp_rel_2017_interp (gdp_rel_2017 itself is left untouched).
#
# The column is coerced to a numeric dtype first: interpolating an object-dtype
# Series is deprecated in pandas, and the column can end up as object when it
# was filled with a mix of None and floats.
# NOTE: with interpolate's default direction, rows before a state's first
# non-missing value are expected to stay NaN -- confirm this is intended.

gdp_numeric = pd.to_numeric(new_new_df['gdp_rel_2017'], errors='coerce')
new_new_df['gdp_rel_2017_interp'] = np.nan

for state in abbreviations:
    in_state = new_new_df['state'] == state
    new_new_df.loc[in_state, 'gdp_rel_2017_interp'] = gdp_numeric[in_state].interpolate(method='linear')


In [25]:
# Write the finished monthly table to a .csv file in ./Data.
# No index argument is passed, so to_csv's default applies and the row index
# is written as the first column.

output_path = './Data/gdp-1997-2023-final-final.csv'
new_new_df.to_csv(output_path)