In [1]:
import collections
import multiprocessing
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root directory for the downloaded CSV files; load_data() writes each file
# into a sub-directory of this path named after the province code.
save_dir = Path('raw-data')

In [3]:
# Two-letter province/territory codes to crawl, in alphabetical order.
# Each one is substituted into `baseurl` to locate that region's listing page.
provincies = "AB BC MB NB NL NS NT NU ON PE QC SK YT".split()

In [4]:
# URL template for one region's page of hourly observation CSVs.
# url_generator() fills `{provence}` with a code from `provincies` and scrapes
# the resulting page for links ending in `.csv`.
baseurl = 'https://dd.weather.gc.ca/climate/observations/hourly/csv/{provence}/'

In [5]:
# response = requests.get(baseurl.format(provence='MB'))
# html_body = response.content.decode()

In [6]:
def url_generator(timeout=60):
    '''
    Yield ``(provence, full_url)`` for every CSV linked from each listing page.

    For each code in ``provincies`` the page at ``baseurl`` is fetched and
    parsed, and every anchor whose ``href`` ends in ``.csv`` is yielded in
    sorted ``href`` order.  Pages that cannot be fetched, or that do not
    answer with HTTP 200, are reported on stdout and skipped.

    Only plain strings are yielded: many BeautifulSoup trees can't be
    pickled, so parsed nodes must not be passed on to worker processes.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up on a listing page.

    Yields
    ------
    tuple of (str, str)
        The province code and the URL of one CSV file.
    '''
    for provence in provincies:
        url = baseurl.format(provence=provence)
        print(url)

        try:
            # without a timeout a stalled connection would block forever
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f'skipping {url}: {e}')
            continue

        if response.status_code != 200:
            # say which listing was dropped instead of skipping it silently
            print(f'skipping {url}: HTTP {response.status_code}')
            continue

        html_body = response.content.decode()
        soup = BeautifulSoup(html_body, 'html.parser')

        # a document without a <body> element leaves soup.body as None
        root = soup.body if soup.body is not None else soup

        # anchors without an href give None from .get(), which would break
        # both the sort and the .endswith() check, so drop them first
        hrefs = (link.get('href') for link in root.find_all('a'))
        csv_uris = sorted(uri for uri in hrefs if uri and uri.endswith('.csv'))

        for uri in csv_uris:
            yield provence, f'{url}{uri}'

In [7]:
def load_data(data):
    '''
    Download one CSV into ``save_dir / provence`` unless it is already there.

    The CSV is parsed with pandas and written back out without the index.
    Files that already exist are left untouched, so an interrupted run can
    simply be restarted.

    Parameters
    ----------
    data : tuple of (str, str)
        ``(provence, url)`` as yielded by ``url_generator``.

    Returns
    -------
    None
        The outcome is the file on disk.  URLs that do not end in ``.csv``
        are ignored.  If reading/parsing fails, the URL and the error are
        printed and no file is written.
    '''
    provence, url = data
    if not url.endswith('.csv'):
        return

    save_subdir = save_dir / provence
    save_subdir.mkdir(parents=True, exist_ok=True)

    # the last component of the URL is used as the local file name
    filepath = save_subdir / Path(url).name

    if filepath.is_file():
        # file already exists
        return

    try:
        df = pd.read_csv(url, encoding='unicode_escape')
    except Exception as e:
        # best effort: report the failing URL and carry on with the rest
        print(url)
        print(str(e))
        return

    # Write under a temporary name and rename once complete, so that an
    # interrupted write never leaves a truncated file that the is_file()
    # check above would treat as finished on the next run.
    partial = filepath.with_name(filepath.name + '.part')
    try:
        # Passing the path (not an already-open text handle) lets pandas
        # open the file itself with consistent newline handling and UTF-8.
        df.to_csv(partial, index=False)
        partial.replace(filepath)
    finally:
        # only still present if the write or the rename failed
        if partial.exists():
            partial.unlink()

In [8]:
# Fetch in parallel for speed, but keep the worker count low so the server
# is not thrashed and we do not get throttled.
with multiprocessing.Pool(processes=4) as pool:
    for idx, _ in enumerate(
        pool.imap_unordered(load_data, url_generator(), chunksize=32)
    ):
        # progress marker every 10000 processed URLs
        if not idx % 10000:
            print(idx)
    print('complete')

https://dd.weather.gc.ca/climate/observations/hourly/csv/AB/
0
https://dd.weather.gc.ca/climate/observations/hourly/csv/BC/
https://dd.weather.gc.ca/climate/observations/hourly/csv/MB/
https://dd.weather.gc.ca/climate/observations/hourly/csv/NB/
https://dd.weather.gc.ca/climate/observations/hourly/csv/NL/
10000
https://dd.weather.gc.ca/climate/observations/hourly/csv/NS/
https://dd.weather.gc.ca/climate/observations/hourly/csv/NT/
https://dd.weather.gc.ca/climate/observations/hourly/csv/NU/
https://dd.weather.gc.ca/climate/observations/hourly/csv/ON/
https://dd.weather.gc.ca/climate/observations/hourly/csv/NU/climate_hourly_NU_2300902_1992_P1H.csv
Error tokenizing data. C error: EOF inside string starting at row 4982
https://dd.weather.gc.ca/climate/observations/hourly/csv/NT/climate_hourly_NT_2201579_2009_P1H.csv
Error tokenizing data. C error: EOF inside string starting at row 7573
https://dd.weather.gc.ca/climate/observations/hourly/csv/PE/
20000
https://dd.weather.gc.ca/climate/obs