In [1]:
# Here, we convert gdp-1997-2023.csv into the proper time series format and output gdp-1997-2023-final.csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw GDP table.
df = pd.read_csv('../../SharedData/dataset-generation/gdp-1997-2023.csv')

# Column template shared by both output frames: every column starts out empty.
d = {
    column: []
    for column in ('state', 'date', 'monthly_gdp_rel_2017', 'yearly_gdp_rel_2017', 'interp_col')
}

# new_df is filled with one row per (state, year); new_new_df with one row
# per (state, month). Both start empty with the same columns.
new_df = pd.DataFrame(d)
new_new_df = pd.DataFrame(d)

In [3]:
# Full state names, in the same order as `abbreviations` below.
state_names = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

# Two-letter codes; entry k is the abbreviation of state_names[k].
abbreviations = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]

# Lookup table: full state name -> two-letter abbreviation.
name_dict = dict(zip(state_names, abbreviations))

In [4]:
# Keep only rows with LineCode 1.0 whose GeoName is one of the 50 state names,
# drop the descriptive columns that are not used afterwards, and renumber the
# rows from 0.
# NOTE(review): LineCode 1.0 is presumably the all-industry total line — confirm
# against the source table's documentation.
unused_columns = ['GeoFIPS', 'Region', 'IndustryClassification', 'TableName', 'Description', 'Unit']
keep_row = (df['LineCode'] == 1.0) & df['GeoName'].isin(state_names)
df = (
    df.loc[keep_row]
    .drop(columns=unused_columns)
    .reset_index(drop=True)
)

In [5]:
# Reshape the wide GDP table (one row per state, one column per year) into a
# long table with one row per (state, year), ordered state by state.
#
# The rows are collected in a plain list and the frame is built once. Growing a
# frame row by row is quadratic, and writing through chained indexing
# (frame.loc[k]['col'] = value) assigns into a temporary row object, which is
# not guaranteed to reach the frame and is a silent no-op under pandas
# Copy-on-Write.
year_columns = [str(year) for year in range(1997, 2024)]   # '1997' ... '2023'

yearly_records = [
    {
        'state': row['GeoName'],
        'date': year,
        'monthly_gdp_rel_2017': None,
        'yearly_gdp_rel_2017': None,
        'interp_col': row[year],       # that state's value for that year
    }
    # Every row of df is used, rather than assuming there are exactly 50.
    for _, row in df.iterrows()
    for year in year_columns
]

new_df = pd.DataFrame(
    yearly_records,
    columns=['state', 'date', 'monthly_gdp_rel_2017', 'yearly_gdp_rel_2017', 'interp_col'],
    dtype='object',
)
# Row labels run 1..len(new_df); positional (.iloc) access is unaffected.
new_df.index = pd.RangeIndex(1, len(new_df) + 1)

# Replace full state names with their two-letter abbreviations.
new_df['state'] = new_df['state'].map(name_dict)

In [6]:
# Expand every (state, year) row of new_df into 12 monthly rows of new_new_df,
# for a total of 12*27*50 entries when new_df holds 27 years for 50 states.
#
# Each monthly row holds:
#   state                 - copied from the yearly row
#   date                  - 'YYYY-MM'
#   monthly_gdp_rel_2017  - left empty here
#   yearly_gdp_rel_2017   - the year's value, repeated on all 12 rows
#   interp_col            - the year's value / 12 on month 1, empty on months 2-12
#
# Rows are accumulated in a list and the frame is constructed once; appending
# thousands of rows one at a time through .loc enlargement is quadratic.
months_per_year = 12

monthly_rows = []
for state_abbr, year, yearly_gdp in zip(new_df['state'], new_df['date'], new_df['interp_col']):
    for month in range(1, months_per_year + 1):
        # Only the first month of each year gets a value in interp_col.
        anchor = yearly_gdp / months_per_year if month == 1 else None
        monthly_rows.append([
            state_abbr,
            '{}-{:02d}'.format(year, month),
            None,
            yearly_gdp,
            anchor,
        ])

new_new_df = pd.DataFrame(
    monthly_rows,
    columns=['state', 'date', 'monthly_gdp_rel_2017', 'yearly_gdp_rel_2017', 'interp_col'],
).astype({
    # Explicit float columns (None becomes NaN) so numeric operations such as
    # interpolation run on a numeric dtype rather than on object.
    'monthly_gdp_rel_2017': 'float64',
    'yearly_gdp_rel_2017': 'float64',
    'interp_col': 'float64',
})

In [7]:
# Tidy-up after the frame construction above: round-trip the date strings
# through datetime so they are all formatted as 'YYYY-MM', then order the rows
# by state and date and renumber them from 0.
new_new_df = (
    new_new_df
    .assign(date=lambda frame: frame['date'].map(pd.to_datetime).dt.strftime('%Y-%m'))
    .sort_values(by=['state', 'date'])
    .reset_index(drop=True)
)

In [8]:
# Fill monthly_gdp_rel_2017 by linearly interpolating interp_col, one state at
# a time so that values never blend across two different states.
# NOTE(review): this was described as interpolating from end-of-year data, but
# the populated interp_col entries sit on the first month of each year —
# confirm which placement is intended.
for abbr in abbreviations:
    in_state = new_new_df['state'] == abbr
    filled = new_new_df.loc[in_state, 'interp_col'].interpolate(method='linear')
    new_new_df.loc[in_state, 'monthly_gdp_rel_2017'] = filled



In [None]:
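# Sanity check (disabled): print a few yearly values next to sums of the interpolated monthly values.
# NOTE: a slice such as [0:11] covers only 11 rows; a full 12-month year would be [0:12], [12:24], ...
# print(new_new_df.yearly_gdp_rel_2017.iloc[[0,12,24,36,48]])
# print(np.sum(new_new_df.monthly_gdp_rel_2017[0:11]))
# print(np.sum(new_new_df.monthly_gdp_rel_2017[12:23]))
# print(np.sum(new_new_df.monthly_gdp_rel_2017[24:35]))
# print(np.sum(new_new_df.monthly_gdp_rel_2017[36:47]))
# print(np.sum(new_new_df.monthly_gdp_rel_2017[48:59]))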

0     41071.0
12    40263.7
24    39783.1
36    38428.1
48    40014.4
Name: yearly_gdp_rel_2017, dtype: float64
37340.072916666664
36724.82916666666
35950.30694444444
35831.63680555556
37401.856250000004


In [9]:
# interp_col only fed the interpolation; leave it out of the final table.
new_new_df = new_new_df.drop(columns=['interp_col'])

In [10]:
# Show new_new_df as this cell's output.
new_new_df

Unnamed: 0,state,date,monthly_gdp_rel_2017,yearly_gdp_rel_2017
0,AK,1997-01,3422.583333,41071.0
1,AK,1997-02,3416.977083,41071.0
2,AK,1997-03,3411.370833,41071.0
3,AK,1997-04,3405.764583,41071.0
4,AK,1997-05,3400.158333,41071.0
...,...,...,...,...
16195,WY,2023-08,3350.9,40210.8
16196,WY,2023-09,3350.9,40210.8
16197,WY,2023-10,3350.9,40210.8
16198,WY,2023-11,3350.9,40210.8


In [11]:
# Write new_new_df to a .csv; index=False keeps the row labels out of the file.
final_csv_path = '../../SharedData/dataset-generation-final/gdp-1997-2023-final.csv'
new_new_df.to_csv(final_csv_path, index=False)