In [1]:
import re
import itertools
import datetime as dt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def read_all_links(url, timeout=30):
    """Scrape the "<Month> <Year>" links from one index page.

    Parameters
    ----------
    url : str
        Address of the page whose anchors should be scanned.
    timeout : float or tuple, optional
        Passed straight to ``requests.get`` so a stalled server cannot hang
        the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'url': ...}`` entry per kept anchor, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be parsed and silently produce an empty list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # 2014 has 4 months of a discontinued series; easiest to remove them manually.
    discontinued_urls = {
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/15894',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/15707',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/18535',
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/14456',
    }

    month_links = []
    for anchor in soup.find_all('a', href=True):
        title = anchor.string
        # .string is None when the anchor wraps more than one child node.
        if title is None:
            continue
        # Everything wanted is titled "<Month> <Year>": exactly two words, the
        # second one numeric. That rejects e.g. "Earth Sciences".
        words = title.split()
        if len(words) != 2 or not words[1].isdigit():
            continue
        href = anchor.get('href')
        if href in discontinued_urls:
            continue
        # str() detaches the title from the soup; a NavigableString keeps a
        # reference to the whole parse tree alive.
        month_links.append({'title': str(title), 'url': href})
    return month_links

In [60]:
def read_all_pages():
    """Gather the monthly links from every known index page.

    Returns a dict mapping each link title to its URL. When a title appears
    on more than one index page, the last one scanned wins.
    """
    index_pages = [
        'https://www.nrcan.gc.ca/energy/oil-sands/18087', # main page
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/18122', # 2015
        'https://www.nrcan.gc.ca/energy/fuel-prices/crude/16993' # 2014
        # Can add more here if NRCAN fixes their archive links or I find where they're archived
    ]
    title_to_url = {}
    for index_url in index_pages:
        for entry in read_all_links(index_url):
            title_to_url[entry['title']] = entry['url']
    # Hard-coded entry; presumably this month is not reachable from the
    # scraped index pages -- TODO confirm.
    title_to_url['December 2015'] = 'https://www.nrcan.gc.ca/energy/fuel-prices/crude/17963'
    return title_to_url

In [65]:
def normalize_cols(df):
    """Return a copy of ``df`` with snake_case column names in sorted order.

    Commas and asterisks are stripped from each name, runs of whitespace are
    collapsed to a single underscore, and the result is lower-cased. The
    input frame is left untouched.
    """
    renamed = df.copy()
    tidy_names = []
    for original_name in renamed.columns:
        without_marks = re.sub(r"[,\*]", '', original_name)
        tidy_names.append('_'.join(without_marks.split()).lower())
    renamed.columns = tidy_names
    # Alphabetical column order keeps the monthly frames aligned with each other.
    return renamed.reindex(sorted(renamed.columns), axis=1)


def reindex_dates(df):
    """Parse the ``Date`` column and return a copy of ``df`` indexed by it.

    Unparseable dates are coerced to ``NaT`` and those rows are dropped, so
    the resulting index only ever contains real timestamps.

    One table is patched by hand: when the second row is dated 2017-06-02,
    the first row is stamped 2017-06-01 (presumably its date cell is
    malformed on the source page -- confirm) and the frame is re-indexed
    onto a complete daily range, leaving all-NaN rows for missing days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column holding date-like values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a datetime index. Sorted by date on the normal
        path; on the June 2017 path the index is the daily date range.
    """
    parsed = df.copy()
    parsed['Date'] = pd.to_datetime(parsed['Date'], errors='coerce')

    june_second = dt.datetime(2017, 6, 2)
    # Guard the positional lookup: frames with fewer than two rows used to
    # raise IndexError here.
    if len(parsed) > 1 and parsed['Date'].iloc[1] == june_second:
        # Positional write to match the positional check above; a label-based
        # .loc[0] would append a new row if label 0 were missing.
        parsed.iloc[0, parsed.columns.get_loc('Date')] = dt.datetime(2017, 6, 1)
        parsed = parsed.dropna(subset=['Date'])
        every_day = pd.date_range(
            start=parsed['Date'].min(),
            end=parsed['Date'].max(),
            freq='1D',
        )
        return parsed.set_index('Date').reindex(every_day)

    # Drop rows whose date could not be parsed; otherwise they end up as NaT
    # entries in the index of the combined frame.
    return parsed.dropna(subset=['Date']).set_index('Date').sort_index()

def read_df(link):
    """Load the first HTML table at ``link`` as a numeric, date-indexed frame.

    Rows whose ``Date`` cell is one of the non-data labels are removed, the
    dates become the index, column names are normalised, and every column is
    converted to numbers (anything non-numeric becomes NaN).
    """
    # Labels that appear in the Date column but are not daily observations.
    badrows = ['Average', '$ Cdn/m3']
    first_table = pd.read_html(link, header=0)[0]
    data_rows = first_table.query('Date not in @badrows')
    dated = reindex_dates(data_rows)
    tidy = normalize_cols(dated)
    return tidy.apply(pd.to_numeric, errors='coerce')

def read_all_dfs():
    """Download every monthly table and stack them into one frame.

    Returns the concatenation of all per-page frames, with columns sorted
    and rows ordered by index.
    """
    monthly_frames = []
    for page_url in read_all_pages().values():
        monthly_frames.append(read_df(page_url))
    stacked = pd.concat(monthly_frames, sort=True)
    return stacked.sort_index()

In [61]:
# links = read_all_pages()

In [66]:
# Build the combined frame from every monthly page (this hits the network),
# then preview its first rows.
df = read_all_dfs()
df.head()

Unnamed: 0,brent_chicago,brent_montreal,brent_sarnia,brent_sullom_voe_uk,cdn_heavy_chicago,cdn_heavy_hardisty,cdn_light_chicago,cdn_light_sweet,cdn_light_sweet_chicago,cdn_light_sweet_edmonton,...,cdn_par_edmonton,exchange_rate,implied_bitumen_hardisty,mixed_sweet_blend_edmonton,synthetic_edmonton,western_canada_select_chicago,western_canada_select_hardisty,wti_chicago,wti_cushing,wti_nymex_chicago
2014-01-01,,,,,460.0,431.0,,,,,...,627.24,1.0636,,,,,,,,
2014-01-02,752.74,746.65,766.8,,485.0,456.0,,,,,...,610.24,1.0633,,,,,,,,647.98
2014-01-03,658.99,741.17,761.33,,484.97,456.0,,,,,...,610.24,1.0614,,,,,,,,636.94
2014-01-04,,,,,479.97,451.0,,,,,...,627.24,1.0614,,,,,,,,
2014-01-05,,,,,479.97,451.0,,,,,...,627.24,1.0614,,,,,,,,


In [68]:
# Persist the combined frame to nrcan.csv, relative to the working directory.
df.to_csv('nrcan.csv')

In [63]:
# Debugging aid: try each monthly page in turn, printing its title and URL
# first so the page that fails to parse is the last one shown.
# `links` is built here rather than taken from an earlier cell, so this cell
# also works after Restart & Run All.
links = read_all_pages()
for title, url in links.items():
    print(title)
    print(url)
    # Only whether the page parses matters here; the result is not bound to
    # `df`, so the combined frame built elsewhere is not overwritten.
    read_df(url)


October 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21532
September 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21452
August 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21418
July 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21302
June 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21231
May 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21160
April 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/21098
March 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20910
February 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20729
January 2018
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20609
December 2017
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20485
November 2017
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20353
October 2017
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20222
September 2017
https://www.nrcan.gc.ca/energy/fuel-prices/crude/20083
August 2017
https://www.nrcan.gc.ca/energy/fuel-prices/c

In [53]:
# Re-read a single page in isolation. The traceback recorded below comes from
# execution 53, which is earlier than the execution (65) that defined the
# functions shown above, so it may not reflect the current code.
# NOTE(review): presumably this is the June 2017 page that reindex_dates
# special-cases -- confirm.
# NOTE(review): this rebinds the global `df` to a single page's frame.
df = read_df('https://www.nrcan.gc.ca/energy/fuel-prices/crude/19805')

ValueError: ('String does not contain a date:', '-')

In [None]:
# year_dict = {}
# for year in range(2014, 2019):
#     year_dict[year] = [lnk for lnk in links if lnk['title'].endswith(str(year))]

# eg_url = year_dict[2018][8]['url']
# eg_url
# def df_2018(link):
#     df = (
#         pd.read_html(link, header=0)[0]
#         .query('Date != "Average"')
#         .assign(Date=lambda df: pd.to_datetime(df['Date']))
#         .set_index('Date')
#         .sort_index()
#         .pipe(normalize_cols)
#         .apply(pd.to_numeric, errors='coerce')
#     )
#     return df
# df = df_2018(eg_url)
# df.head()
# dfs_2018 = [df_2018(yd['url']) for yd in year_dict[2018]]
# set(list(itertools.chain.from_iterable([list(df.columns) for df in dfs_2018])))

# dfs_2018_concat = pd.concat(dfs_2018, sort=True).sort_index()
# print(dfs_2018_concat.shape)
# dfs_2018_concat.tail()


# eg_url = year_dict[2017][8]['url']

# def df_2017(link):
#     df = (
#         pd.read_html(link, header=0)[0]
#         .query('Date != "Average"')
#         .assign(Date=lambda df: pd.to_datetime(df['Date']))
#         .set_index('Date')
#         .sort_index()
#         .pipe(normalize_cols)
#         .apply(pd.to_numeric, errors='coerce')
#     )
#     return df
# df = df_2017(eg_url)
# df.head()


# eg_url = year_dict[2016][8]['url']

# def df_2016(link):
#     badrows = ['Average', '$ Cdn/m3']
#     df = (
#         pd.read_html(link, header=0)[0]
#         .query('Date not in @badrows')
#         .assign(Date=lambda df: pd.to_datetime(df['Date']))
#         .set_index('Date')
#         .sort_index()
#         .pipe(normalize_cols)
#         .apply(pd.to_numeric, errors='coerce')
#     )
#     df = df.reindex(sorted(df.columns), axis=1)
#     return df
# df = df_2016(eg_url)
# df.head()


# eg_url = year_dict[2015][8]['url']

# def df_2015(link):
#     badrows = ['Average', '$ Cdn/m3']
#     df = (
#         pd.read_html(link, header=0)[0]
#         .query('Date not in @badrows')
#         .assign(Date=lambda df: pd.to_datetime(df['Date']))
#         .set_index('Date')
#         .sort_index()
#         .pipe(normalize_cols)
#         .apply(pd.to_numeric, errors='coerce')
#     )
#     df = df.reindex(sorted(df.columns), axis=1)
#     return df
# df = df_2015(eg_url)
# df.head()


# eg_url = year_dict[2014][8]['url']

# def df_2014(link):
#     badrows = ['Average', '$ Cdn/m3']
#     df = (
#         pd.read_html(link, header=0)[0]
#         .query('Date not in @badrows')
#         .assign(Date=lambda df: pd.to_datetime(df['Date']))
#         .set_index('Date')
#         .sort_index()
#         .pipe(normalize_cols)
#         .apply(pd.to_numeric, errors='coerce')
#     )
#     df = df.reindex(sorted(df.columns), axis=1)
#     return df
# df = df_2014(eg_url)
# df.head()

