In [1]:
import re
import itertools
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def read_all_links(url, timeout=30):
    """Scrape the "<Month> <Year>" links from one index page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of dict
        One ``{'title': ..., 'url': ...}`` entry per kept anchor, in page
        order. ``title`` is the anchor text and ``url`` is the raw ``href``
        value exactly as it appears in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot connect, or returns an HTTP error
        status.
    """
    # Without a timeout a stalled server hangs "Run All" forever, and without
    # the status check an error page would be parsed as if it were the index.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # 2014 has 4 months of a discontinued series; easiest to remove them manually.
    bad_urls = {
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/15894',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/15707',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/18535',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/14456',
    }

    all_links = []
    for anchor in soup.find_all('a', href=True):
        title = anchor.string
        # .string can be None (e.g. an anchor holding several children);
        # such anchors can never be a "<Month> <Year>" link.
        if title is None:
            continue
        # Everything wanted is "<Month> <Year>": exactly two words, the second
        # all digits. This rejects titles such as "Earth Sciences".
        words = title.split()
        if len(words) != 2 or not words[1].isdigit():
            continue
        href = anchor.get('href')
        if href in bad_urls:
            continue
        # Store a plain str: a NavigableString keeps a reference to the whole
        # parse tree, which would otherwise stay alive with the result.
        all_links.append({'title': str(title), 'url': href})
    return all_links

In [3]:
def read_all_pages(urls=None):
    """Collect the monthly links from several index pages into one flat list.

    Parameters
    ----------
    urls : iterable of str, optional
        Index pages to scrape. When omitted, the main page plus the 2015 and
        2014 archive pages are used.

    Returns
    -------
    list of dict
        The results of ``read_all_links`` for each page, concatenated in the
        order the pages were given. An empty ``urls`` yields an empty list.
    """
    if urls is None:
        urls = [
            'https://www.nrcan.gc.ca/energy/oil-sands/18087', # main page
            'https://www.nrcan.gc.ca/energy/fuel-prices/crude/18122', # 2015
            'https://www.nrcan.gc.ca/energy/fuel-prices/crude/16993' # 2014
            # Can add more here if NRCAN fixes their archive links or I find where they're archived
        ]
    page_lists = [read_all_links(url) for url in urls]
    return list(itertools.chain.from_iterable(page_lists))

In [4]:
# Scrape every configured index page once; this cell performs the HTTP requests.
links = read_all_pages()

In [5]:
# Bucket the scraped links by the year their title ends with (2014 through 2018).
year_dict = dict.fromkeys(range(2014, 2019))
for year in year_dict:
    year_dict[year] = list(filter(lambda lnk: lnk['title'].endswith(str(year)), links))

def normalize_cols(df):
    """Return a copy of ``df`` whose column labels are snake_case.

    Each label is split on whitespace, re-joined with underscores, and
    lower-cased. The input frame is left untouched.
    """
    result = df.copy()
    new_labels = []
    for label in result.columns:
        words = label.split()
        new_labels.append('_'.join(words).lower())
    result.columns = new_labels
    return result

In [6]:
# Use one 2018 page as a worked example: the entry at position 8 of that
# year's scraped links (presumably an arbitrary pick).
eg_url = year_dict[2018][8]['url']
# NOTE(review): with IPython's default settings only the last expression of a
# cell is rendered, so this bare name displays nothing when more code follows.
eg_url
def df_2018(link):
    """Load one monthly price page into a date-indexed numeric DataFrame.

    Reads the first HTML table found at ``link`` (first row as header), drops
    rows whose ``Date`` cell is the literal "Average", parses ``Date`` into
    datetimes and uses it as a sorted index, converts column labels to
    snake_case, coerces every value to a number (unparseable cells become
    NaN), and orders the columns alphabetically.
    """
    first_table = pd.read_html(link, header=0)[0]
    # Presumably the "Average" row is a summary line rather than a daily observation.
    daily_rows = first_table.query('Date != "Average"')
    parsed_dates = pd.to_datetime(daily_rows['Date'])
    by_date = daily_rows.assign(Date=parsed_dates).set_index('Date').sort_index()
    numeric = normalize_cols(by_date).apply(pd.to_numeric, errors='coerce')
    # A fixed column order keeps frames from different months easy to compare.
    return numeric.reindex(columns=sorted(numeric.columns))
# Parse the example 2018 page and preview its first rows.
df = df_2018(eg_url)
df.head()

Unnamed: 0_level_0,"brent_sullom_voe,_uk",cdn_light_sweet_edmonton,exchange_rate,implied_bitumen_hardisty,synthetic_edmonton,western_canada_select_hardisty,wti_cushing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-02-01,534.0,454.0,1.2288,175.0,489.0,273.0,509.0
2018-02-02,534.0,456.0,1.238,173.0,494.0,272.0,510.0
2018-02-03,,,,,,,
2018-02-04,,,,,,,
2018-02-05,531.0,452.0,1.2483,163.0,489.0,262.0,504.0


In [7]:
# Parse every 2018 monthly page into its own frame (one page read per link).
dfs_2018 = list(map(df_2018, (entry['url'] for entry in year_dict[2018])))

In [8]:
# Union of column names across all 2018 monthly frames, to see which series appear.
{col for frame in dfs_2018 for col in frame.columns}

{'brent_sullom_voe,_uk',
 'cdn_light_sweet_edmonton',
 'exchange_rate',
 'implied_bitumen_hardisty',
 'mixed_sweet_blend_edmonton',
 'synthetic_edmonton',
 'western_canada_select_hardisty',
 'wti_cushing'}

In [9]:
# Stack the monthly frames into one table. Columns missing from a given month
# are filled with NaN, and the rows are put in index (date) order.
dfs_2018_concat = (
    pd.concat(dfs_2018, sort=True)
    .sort_index()
)
print(dfs_2018_concat.shape)
dfs_2018_concat.tail()

(299, 8)


Unnamed: 0_level_0,"brent_sullom_voe,_uk",cdn_light_sweet_edmonton,exchange_rate,implied_bitumen_hardisty,mixed_sweet_blend_edmonton,synthetic_edmonton,western_canada_select_hardisty,wti_cushing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-10-27,,,,,,,,
2018-10-28,,,,,,,,
2018-10-29,607.0,,1.3067,104.0,304.0,321.0,206.0,551.0
2018-10-30,589.0,,1.2998,117.0,271.0,292.0,210.0,541.0
2018-10-31,590.0,,1.2998,82.0,251.0,285.0,182.0,534.0


In [10]:
# Example page for 2017: the entry at position 8 of that year's scraped links.
# This rebinds `eg_url`, which previously pointed at the 2018 example.
eg_url = year_dict[2017][8]['url']

def df_2017(link):
    """Load one 2017 monthly price page into a date-indexed numeric DataFrame.

    The 2017 pages are parsed with exactly the same steps as the 2018 ones, so
    this delegates to ``df_2018`` instead of repeating the pipeline: first HTML
    table of the page, rows with ``Date == "Average"`` dropped, ``Date`` parsed
    and used as a sorted index, snake_case column labels, values coerced to
    numbers (NaN where unparseable), and columns sorted alphabetically.

    Keeping a separate name leaves room for 2017-specific handling later
    without touching callers.
    """
    return df_2018(link)
# Parse the example 2017 page and preview its first rows.
# Note this rebinds `df`, replacing the 2018 example frame.
df = df_2017(eg_url)
df.head()

Unnamed: 0_level_0,brent_montreal,cdn_light_sweet_chicago,cdn_light_sweet_edmonton,exchange_rate,western_canada_select_chicago,western_canada_select_hardisty,wti_chicago
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-01,,,,,,,
2017-04-02,,,,,,,
2017-04-03,461.0,456.0,417.0,1.3384,378.0,335.0,439.0
2017-04-04,474.0,453.0,415.0,1.3425,388.0,345.0,447.0
2017-04-05,475.0,461.0,422.0,1.3409,392.0,349.0,447.0
