In [1]:
## --- script to scrape data from BEA ---------------------------------

# import library
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
# auxiliary function to create "quarter names" for url using numpy
def q_names(startyear=2005,endyear=2015):
    """Build the quarter labels used in the url, e.g. ``"2005Q1"``.

    Parameters
    ----------
    startyear : int
        First year to include.
    endyear : int
        Last year to include (inclusive).

    Returns
    -------
    numpy.ndarray
        1-D array of strings holding four labels per year in chronological
        order ("2005Q1", "2005Q2", ...). Empty when ``endyear < startyear``.
    """
    # clamp at zero so an inverted range yields an empty array, not an error
    n_years = max(endyear - startyear + 1, 0)
    # 2005, 2005, 2005, 2005, 2006 and so on
    years = np.repeat(np.arange(startyear, startyear + n_years), 4).astype("str")
    # Q1, Q2, Q3, Q4, Q1 and so on
    quarters = np.tile(np.array(["Q1", "Q2", "Q3", "Q4"]), n_years)
    # element wise string "addition"; np.char is the public location of this
    # routine (np.core is a private namespace, deprecated since NumPy 2.0)
    return np.char.add(years, quarters)

In [3]:
# example: the four quarter labels generated for the single year 2005
list(q_names(2005, 2005))

['2005Q1', '2005Q2', '2005Q3', '2005Q4']

In [None]:
# auxiliary function to get the urls for the gdp data
def finishurl(quarter):
    """Return the bea.gov iTable drilldown url for one quarter.

    ``quarter`` (e.g. "2005Q1") is inserted as the value of the ``YearGdp``
    query parameter; every other query parameter is fixed.
    """
    head = "http://www.bea.gov/iTable/drilldown.cfm?reqid=70&stepnum=11&AreaTypeKeyGdp=5&GeoFipsGdp=XX&ClassKeyGdp=NAICS&ComponentKey=200&IndustryKey=1&YearGdp="
    tail = "&YearGdpBegin=-1&YearGdpEnd=-1&UnitOfMeasureKeyGdp=Levels&RankKeyGdp=1&Drill=1&nRange=5"
    return head + quarter + tail

In [None]:
# Each quarter is scraped into its own pandas Series named after that quarter,
# and all the Series are then joined side by side on the state index in one
# step. This avoids growing an empty dataframe and avoids repeated merges of
# identically named columns, which recent pandas rejects with a MergeError
# once the automatic "_x"/"_y" suffixes start to collide.

def _quarter_label(quarter):
    """Turn "2005Q1" into "2005.0", "2005Q2" into "2005.25", ... for easy plotting later."""
    year, number = quarter.split("Q")
    # (quarter number - 1) / 4 is 0.0, 0.25, 0.5 or 0.75; drop its leading "0"
    return year + str((int(number) - 1) / 4)[1:]


def _scrape_quarter(quarter):
    """Download the table for one quarter and return gdp per state as a Series.

    The Series is indexed by state name and named with the plotting-friendly
    label of the quarter (see ``_quarter_label``).
    """
    resp = requests.get(finishurl(quarter), timeout=60)
    resp.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(resp.content, "html.parser")
    rows = soup.find_all("tr")
    if len(rows) < 53:
        # same exception type the bare indexing would raise, with context added
        raise IndexError(
            "expected at least 53 table rows for " + str(quarter)
            + ", found " + str(len(rows))
        )

    state, num = [], []
    for row in rows[2:53]:  # remove US total
        # get state name
        state.append(row.contents[1].string)
        # get state gdp (thousands separators stripped before conversion)
        num.append(int(row.contents[2].string.replace(",", "")))
    return pd.Series(num, index=state, name=_quarter_label(quarter))


def bea_scrap(startyear, endyear, default_name = True):
    """Scrape quarterly state gdp for ``startyear``..``endyear`` and save it as csv.

    Parameters
    ----------
    startyear, endyear : int
        First and last year (inclusive) to download; four quarters per year.
    default_name : bool
        If true the file is named "<startyear>_<endyear>_quarterly_gdp.csv",
        otherwise "quarterly_gdp.csv".

    Returns
    -------
    None
        The result is written to disk: one row per state (sorted by name,
        index label "state") and one column per quarter labelled "2005.0",
        "2005.25", "2005.5", "2005.75", ...

    Raises
    ------
    ValueError
        If the year range contains no quarters.
    requests.HTTPError
        If a request returns an error status.
    IndexError
        If a page holds fewer table rows than expected.
    """
    # quarters to loop over
    quarters = q_names(startyear, endyear)
    if len(quarters) == 0:
        raise ValueError("no quarters between " + str(startyear) + " and " + str(endyear))

    # one named column per quarter, aligned on the state index in a single step
    columns = [_scrape_quarter(quarter) for quarter in quarters]
    df = pd.concat(columns, axis=1).sort_index(ascending=True)

    if default_name:
        filename = str(startyear) + "_" + str(endyear) + "_quarterly_gdp.csv"
    else:
        filename = "quarterly_gdp.csv"

    df.to_csv(filename, index_label = "state")

In [None]:
# run function to scrape and save the data as 2005_2006_quarterly_gdp.csv
bea_scrap(2005, 2006, True)

In [4]:
# preview the first five rows of the saved file, using the state column as index
pd.read_csv("2005_2006_quarterly_gdp.csv",index_col="state").head(5)

Unnamed: 0_level_0,2005.0,2005.25,2005.5,2005.75,2006.0,2006.25,2006.5,2006.75
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,154589,157060,158533,161473,162683,164718,166121,167003
Alaska,37797,39175,40901,43235,43084,44849,45519,46019
Arizona,218140,224181,231192,234675,241447,244356,250499,255996
Arkansas,88042,89049,90418,92940,94175,96047,96336,96349
California,1715019,1741361,1781265,1804388,1846592,1855351,1876489,1897445


In [None]:
# run function to scrape and save the data as quarterly_gdp.csv
bea_scrap(2005, 2015, False)

In [5]:
# preview the first five rows of quarterly_gdp.csv, using the state column as index
pd.read_csv("quarterly_gdp.csv",index_col="state").head(5)

Unnamed: 0_level_0,2005.0,2005.25,2005.5,2005.75,2006.0,2006.25,2006.5,2006.75,2007.0,2007.25,...,2013.5,2013.75,2014.0,2014.25,2014.5,2014.75,2015.0,2015.25,2015.5,2015.75
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,154589,157060,158533,161473,162683,164718,166121,167003,167471,169730,...,192492,193387,193223,196565,200215,200135,202486,202821,205292,206341
Alaska,37797,39175,40901,43235,43084,44849,45519,46019,46897,49160,...,60396,58849,59040,59078,57776,56373,53209,54203,52244,51560
Arizona,218140,224181,231192,234675,241447,244356,250499,255996,257950,261885,...,271592,275806,277139,280117,284216,284763,286401,290311,291429,294170
Arkansas,88042,89049,90418,92940,94175,96047,96336,96349,95956,97869,...,117378,118215,118742,120967,121792,122758,120847,122817,124370,124792
California,1715019,1741361,1781265,1804388,1846592,1855351,1876489,1897445,1908009,1936751,...,2223967,2292221,2274103,2312827,2352770,2360282,2403342,2459749,2474341,2496710


In [None]:
## --- function to aggregate quarterly data to annual data ----------------------------------------------

def annual_csv(endyear=2015):
    """Aggregate quarterly_gdp.csv to annual data and save it as annual_gdp.csv.

    Reads "quarterly_gdp.csv" from the working directory (a "state" column
    plus one column per quarter labelled "2005.0", "2005.25", ...), sums the
    quarters belonging to each year and writes one column per year.

    Parameters
    ----------
    endyear : int
        Last year (inclusive) to keep; quarters of later years are ignored.

    Returns
    -------
    None
        The result is written to "annual_gdp.csv" with "state" as index
        label and the years as column labels.

    Raises
    ------
    ValueError
        If no quarterly column falls in or before ``endyear``.
    """
    # read relevant quarterly csv file
    quarterly = pd.read_csv("quarterly_gdp.csv", index_col="state")

    # the integer part of a column label is its year ("2005.25" -> 2005), so
    # the years covered follow the file instead of being hard-coded
    years = np.array([int(float(label)) for label in quarterly.columns])
    keep = years <= endyear
    if not keep.any():
        raise ValueError("quarterly_gdp.csv has no quarters up to " + str(endyear))

    # sum the quarters of each year; transposing lets groupby work on rows
    # NOTE(review): if the quarterly levels are annualized rates, a mean
    # rather than a sum may be intended -- confirm against the data source
    df_annual = quarterly.loc[:, keep].T.groupby(years[keep]).sum().T

    df_annual.to_csv("annual_gdp.csv", index_label="state")

In [None]:
# aggregate the quarterly data in quarterly_gdp.csv to annual data, saved as annual_gdp.csv
annual_csv(2015)

In [6]:
# preview the first five rows of annual_gdp.csv, using the state column as index
pd.read_csv("annual_gdp.csv", index_col="state").head(5)

Unnamed: 0_level_0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,631655,660525,681597,694082,678345,704883,727692,749133,766421,790138,816940
Alaska,161108,179471,197141,221744,202055,216880,237273,246454,239563,232267,211216
Arizona,908188,992298,1048854,1035910,972408,989332,1022482,1064523,1084290,1126235,1162311
Arkansas,360449,382907,396841,408401,400391,419715,437513,446164,466606,484259,492826
California,7042033,7475877,7807989,7974356,7654698,7843740,8125391,8486409,8860926,9299982,9834142
