In [1]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Download the BLS CPI ("cu") time-series data files listed below and stack
# them into a single DataFrame, `all_items`.
#
# To refresh the list of available files, print the hrefs on the directory page:
# req = requests.get('https://download.bls.gov/pub/time.series/cu/')
# soup = bs(req.text)
# for link in soup.findAll('a'):
#     print(link.get('href'))

list_of_links = [
# '/pub/time.series/cu/cu.data.1.AllItems',
'/pub/time.series/cu/cu.data.11.USFoodBeverage',
'/pub/time.series/cu/cu.data.12.USHousing',
'/pub/time.series/cu/cu.data.13.USApparel',
'/pub/time.series/cu/cu.data.14.USTransportation',
'/pub/time.series/cu/cu.data.15.USMedical',
'/pub/time.series/cu/cu.data.16.USRecreation',
'/pub/time.series/cu/cu.data.17.USEducationAndCommunication',
'/pub/time.series/cu/cu.data.18.USOtherGoodsAndServices',
'/pub/time.series/cu/cu.data.20.USCommoditiesServicesSpecial',
]


def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
sub_datasets = []
for link in list_of_links:
    # all timeseries data from bls.
    sub_dataset = pd.read_csv(f'https://download.bls.gov{link}', sep='\t')
    # Strip whitespace from every string cell. Column-wise Series.map is used
    # instead of DataFrame.applymap, which is deprecated since pandas 2.1.
    sub_dataset = sub_dataset.apply(lambda column: column.map(_strip_if_str))
    # The header names are stripped as well so later merges can use clean keys.
    sub_dataset.columns = [x.strip() for x in sub_dataset.columns]
    sub_datasets.append(sub_dataset)

# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no links are configured.
all_items = pd.concat(sub_datasets, axis=0) if sub_datasets else pd.DataFrame()

In [3]:
# Load the BLS lookup tables that describe the codes used in the time-series data.


def _read_bls_table(url):
    """Read a tab-separated BLS file and strip whitespace from cells and headers.

    Parameters
    ----------
    url : str
        Location of the tab-separated file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with surrounding whitespace removed from every string
        cell and from every column name; non-string cells are left unchanged.
    """
    table = pd.read_csv(url, sep='\t')
    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1.
    table = table.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )
    table.columns = [x.strip() for x in table.columns]
    return table


# series description or meta information data from BLS.
series_descriptions = _read_bls_table('https://download.bls.gov/pub/time.series/cu/cu.series')

# period description from BLS.
period_description = _read_bls_table('https://download.bls.gov/pub/time.series/cu/cu.period')

# periodicity description from BLS: monthly, or other. We only care about monthly.
periodicity_description = _read_bls_table('https://download.bls.gov/pub/time.series/cu/cu.periodicity')

# seasonality description from BLS: whether data is seasonally adjusted or not.
seasonal_description = _read_bls_table('https://download.bls.gov/pub/time.series/cu/cu.seasonal')

In [4]:
# Enrich the time-series rows with the metadata held in the lookup tables.
# Every join is a left join, so rows of `all_items` without a match are kept
# and simply get NaN in the added columns.
all_items_enhanced = (
    all_items
    .merge(series_descriptions, how='left', on='series_id')
    .merge(period_description, how='left', on='period')
    .merge(periodicity_description, how='left', on='periodicity_code')
    # The key is named differently on each side here, so both columns are kept.
    .merge(seasonal_description, how='left', left_on='seasonal', right_on='seasonal_code')
)

In [5]:
# 1. filter to only the monthly data.
# 2. add a datetime column
# 3. sort rows by dates.
# 4. set multi-index to series_id and date for easy access.
# 5. write the result to disk as pickle and CSV.

periods_wanted = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]

# NOTE(review): reset_index() without drop=True keeps the previous row index as
# a regular column (presumably named 'index'), which also ends up in the saved
# files. Left as-is so the output schema does not change — confirm whether any
# consumer relies on it before switching to drop=True.
all_items_enhanced = all_items_enhanced[ all_items_enhanced.period_name.isin(periods_wanted)].reset_index()

# Build 'MM/YYYY' from the last two characters of the period code and the year,
# parse it, then roll each timestamp forward to the last day of its month.
month_year = all_items_enhanced['period'].str[-2:] + '/' + all_items_enhanced['year'].astype(str)
all_items_enhanced['date'] = pd.to_datetime(month_year, format='%m/%Y') + pd.offsets.MonthEnd(0)

all_items_enhanced = all_items_enhanced.sort_values('date')
all_items_enhanced = all_items_enhanced.set_index(['series_id', 'date'])

# Create the output directory first so a fresh checkout does not fail on write.
output_dir = Path('datastore/processed_data')
output_dir.mkdir(parents=True, exist_ok=True)

all_items_enhanced.to_pickle(output_dir / 'all_items_enhanced.pickle')
all_items_enhanced.to_csv(output_dir / 'all_items_enhanced.csv')
# series_descriptions.to_csv('datastore/processed_data/series_descriptions.csv')