# Automatic Source Retrieval

Done:
- Ontario
- Manitoba
- British Columbia
- New Brunswick
- Nova Scotia
- Northwest Territories
- Saskatchewan
- Nunavut
- Yukon
- Prince Edward Island
- Quebec
- Alberta

TODO:
- Newfoundland and Labrador

In [1]:
!python3 -m pip install bs4
!python3 -m pip install feedparser
!python3 -m pip install lxml



In [2]:
import requests
import urllib.request
import datetime
from bs4 import BeautifulSoup
import pandas as pd
import feedparser
from datetime import date
from datetime import datetime
import re

In [3]:
# Values shared by every row the scrapers in this notebook produce.
country = 'Canada'
src_cat = 'Government Website'

# Column order of the DataFrames built by the scrapers.
columns = [
    'start_date',
    'country',
    'region',
    'subregion',
    'source_url',
    'source_category',
    'source_title',
    'source_full_text',
]

# Ontario

Ontario lists the most recent news on the first page, so the scraper walks the result pages in order
until it reaches a release older than `since` (or a page with no articles). As a rule of thumb, ~4 pages cover ~2 weeks of releases.

In [4]:
def _load_ontario(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Ontario newsroom search pages.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Ontario.
    """
    region = 'Ontario'
    subregion = ''

    # The server-side window keeps 2020-01-01 as its lower bound for recent
    # `since` values and is only widened when `since` is older, so a request
    # for pre-2020 releases is no longer silently truncated.
    range_start = min(since, datetime(2020, 1, 1)).strftime('%Y%%2F%m%%2F%d')
    range_end = str(date.today()).replace('-', '%2F')
    base_url = (
        'https://news.ontario.ca/en/search?content_type=all&utf8=%E2%9C%93'
        '&date_range_end=' + range_end
        + '&date_range_start=' + range_start
        + '&date_select=desc&page='
    )

    # Result pages of news.ontario.ca hold one <article> element per release.
    rows = []
    page = 1
    while True:
        if verbose:
            print('Searching page ', page)

        response = requests.get(base_url + str(page))
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all('article')

        # An empty page means we ran past the last page of results.
        if not articles:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for article in articles:
            anchor = article.find_all('a')[0]
            link = anchor['href']
            title = anchor.string
            # Timestamps look like "July 10, 2020 12:30 p.m."; strip the dots so %p matches.
            pub_date = datetime.strptime(article.time.string.replace('.', ''), "%B %d, %Y %I:%M %p")

            # Results are requested in descending date order, so the first
            # release older than `since` ends the scrape.
            if pub_date < since:
                return pd.DataFrame(rows, columns=columns)

            article_page = requests.get(link)
            linksoup = BeautifulSoup(article_page.text, "html.parser")
            if linksoup.article is not None:
                full_text = linksoup.article.text
            else:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                full_text = ""

            rows.append([pub_date, country, region, subregion, link, src_cat, title, full_text])

        page += 1
            

# Manitoba

Retrieve all news releases in 2020 for the Province of Manitoba.

In [5]:
def _load_manitoba(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape 2020 news releases from the Province of Manitoba monthly archive pages.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates
    
    Returns: a DataFrame containing news releases from the government of Manitoba.
    """
    region = 'Manitoba'
    subregion = ''

    url_base = 'https://news.gov.mb.ca'
    # Archive pages are requested newest month first, 12 down to 1 inclusive;
    # a stop value of 1 would leave the month=1 page unrequested.
    # NOTE: the year is fixed to 2020 in the query string, so other years are never listed.
    targets = [
        url_base + '/news/index.html?month=' + str(month) + '&year=2020&day=01&bgnG=GO&d='
        for month in range(12, 0, -1)
    ]

    rows = []
    for target in targets:
        if verbose:
            print(target)

        response = requests.get(target)
        soup = BeautifulSoup(response.text, "html.parser")

        for content in soup.find_all("div", {"class": "maincontent"}):
            for heading in content.find_all('h2'):
                a = heading.a
                if a is None:
                    # Heading without a link is not a release entry.
                    continue

                # Archive hrefs are relative ("../..."); keep only the part after the last "..".
                link = url_base + a['href'].split('..')[-1]
                title = a.string

                article_page = requests.get(link)
                linksoup = BeautifulSoup(article_page.text, "html.parser")

                # The publication date is only read from the article page itself.
                date_text = linksoup.find_all("span", {"class": "article_date"})[0].string
                pub_date = datetime.strptime(date_text, '%B %d, %Y')  # January 31, 2020

                # assumes each archive page lists newest releases first — TODO confirm
                if pub_date < since:
                    return pd.DataFrame(rows, columns=columns)

                full_text = linksoup.find_all("div", {"class": ""})[0].text

                rows.append([pub_date, country, region, subregion, link, src_cat, title, full_text])

    return pd.DataFrame(rows, columns=columns)

# British Columbia

In [6]:
def _load_british_columbia(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of British Columbia news search.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of British Columbia.
    """
    region = 'British Columbia'
    subregion = ''

    # NOTE: FromDate is fixed in the query, so a `since` earlier than
    # 2020-01-01 cannot return anything older than that date.
    query_url = 'https://news.gov.bc.ca/Search?FromDate=01/01/2020&Page='
    rows = []
    page = 1

    while True:
        if verbose:
            print("Page ", page)

        response = requests.get(query_url + str(page))
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all("div", {"class": "article"})

        # An empty page means we ran past the last page of results.
        if not items:
            return pd.DataFrame(rows, columns=columns)

        for article in items:
            # Query the result tag directly instead of re-parsing its HTML.
            title = article.a.string

            date_text = article.find_all("div", {"class": "item-date"})[0].string
            pub_date = datetime.strptime(date_text, '%A, %B %d, %Y %I:%M %p')  # Friday, July 10, 2020 12:30 PM

            # Stop at the first release older than the requested cut-off.
            if pub_date < since:
                return pd.DataFrame(rows, columns=columns)

            link = article.a['href']

            article_page = requests.get(link)
            linksoup = BeautifulSoup(article_page.text, "html.parser")
            get_article = linksoup.find_all("article")
            if get_article:
                full_text = get_article[0].text
            else:
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                full_text = ""

            rows.append([pub_date, country, region, subregion, link, src_cat, title, full_text])

        page += 1

# New Brunswick

In [7]:
def _load_new_brunswick(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of New Brunswick "recent news" listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of New Brunswick.
    """
    region = 'New Brunswick'
    sub_region = ''

    url_base = "https://www2.gnb.ca/"
    url = url_base + "content/gnb/en/news/recent_news.html?mainContent_par_newslist_start="
    page_size = 25  # the listing offset advances 25 entries per page
    start = 0
    rows = []

    while True:
        if verbose:
            print("Page {}".format(str(start // page_size + 1)))

        response = requests.get(url + str(start))
        soup = BeautifulSoup(response.content, "html.parser")

        article_div = soup.find('div', class_="none padded")
        articles = article_div.find_all('li') if article_div is not None else []

        found_dated = False
        for article in articles:
            ar_date_str = article.find('span', class_="post_date")
            if not ar_date_str:
                # List entry is not a dated article.
                continue
            found_dated = True

            ar_date = datetime.strptime(ar_date_str.text, "%d %B %Y")

            # Only collect data published on or after the requested date.
            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            a = article.a
            title = a.text
            link = url_base + a['href']

            article_page = requests.get(link)
            body_soup = BeautifulSoup(article_page.content, 'html.parser')
            body_div = body_soup.find('div', class_="articleBody")
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        # A page without any dated entry means the listing is exhausted (or its
        # layout changed); without this check the loop would never terminate.
        if not found_dated:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        start += page_size

# Nova Scotia

In [8]:
def _load_nova_scotia(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape English news releases from the Government of Nova Scotia news search.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Nova Scotia. 
    """
    region = 'Nova Scotia'
    sub_region = ''

    url_base = "https://novascotia.ca/news"
    page = 1
    rows = []

    while True:
        url = url_base + "/search/?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each release is a <dt class="RelTitle"> paired with a <dd class="RelSummary">.
        titles = soup.find_all('dt', class_="RelTitle")
        summaries = soup.find_all('dd', class_="RelSummary")

        # Nothing to pair up: we ran past the last page. Without this check
        # the loop would keep requesting pages forever.
        if not titles or not summaries:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for title, summary in zip(titles, summaries):
            # Skip French releases; .get() tolerates titles without a lang attribute.
            if title.get('lang') == "fr":
                continue

            ar_date = datetime.strptime(summary.time.text, "%B %d, %Y - %I:%M %p")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            # hrefs are relative ("../..."); keep what follows the first "..".
            relative_link = title.a['href'].split('..', 1)[1]
            link = url_base + relative_link

            ar_response = requests.get(link)
            ar_soup = BeautifulSoup(ar_response.content, 'html.parser')
            body_div = ar_soup.find('div', {'id' : 'releaseBody'})
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title.text, body])

        page += 1

# Northwest Territories

In [9]:
def _load_northwest_territories(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of the Northwest Territories newsroom.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of the Northwest Territories.    
    """
    region = 'Northwest Territories'
    sub_region = ''

    url_base = "https://www.gov.nt.ca/"
    page = 0  # the newsroom's page parameter is zero-based
    rows = []

    while True:
        url = url_base + "en/newsroom?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page + 1))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # regex accounts for inconsistent `div` class names
        ar_boxes = soup.find_all('div', class_=re.compile('views-row'))

        # No entries: we ran past the last page. Without this check the loop
        # would keep requesting pages forever.
        if not ar_boxes:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for box in ar_boxes:
            # The first <span> of an entry holds its date, e.g. "July 10, 2020".
            date_str = box.find('span').text
            ar_date = datetime.strptime(date_str, "%B %d, %Y")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            title_a = box.find('a')
            title = title_a.text
            link = url_base + title_a['href']

            ar_res = requests.get(link)
            ar_soup = BeautifulSoup(ar_res.content, 'html.parser')
            body_div = ar_soup.find('div', class_="field-item even")
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        page += 1

# Saskatchewan

In [10]:
def _load_saskatchewan(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Saskatchewan news-and-media listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Saskatchewan.
    """
    region = 'Saskatchewan'
    sub_region = ''

    url_base = "https://www.saskatchewan.ca/government/news-and-media?page="
    page = 1
    rows = []

    while True:
        url = url_base + str(page)
        if verbose:
            print("Searching page {}".format(page))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        article_list = soup.find('ul', class_="results")
        list_items = article_list.find_all('li') if article_list is not None else []

        # Missing or empty results list: we ran past the last page. Without
        # this check the loop would keep requesting pages forever.
        if not list_items:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for item in list_items:
            # The <time> element carries the date in its `datetime` attribute.
            date_str = item.time['datetime']
            ar_date = datetime.strptime(date_str, "%Y-%m-%d")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            title = item.a.text
            link = item.a['href']

            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body_section = body_soup.find('section', class_="general-content")
            if body_section is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_section.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        page += 1

# Nunavut

In [11]:
def _load_nunavut(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Nunavut news listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Nunavut.
    """
    region = 'Nunavut'
    sub_region = ''

    url_base = "https://gov.nu.ca"
    page = 0  # the listing's page parameter is zero-based
    rows = []

    while True:
        url = url_base + "/news?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page + 1))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        main_section = soup.find('section', {"id" : "block-system-main"})
        if main_section is not None:
            divs = main_section.find_all('div', class_=re.compile('views-row(.*)'))
        else:
            divs = []

        # No entries: we ran past the last page. Without this check the loop
        # would keep requesting pages forever.
        if not divs:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for div in divs:
            date_str = div.find('span', class_="date-display-single").text
            ar_date = datetime.strptime(date_str, "%d %B %Y")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            a = div.find('a')
            title = a.text
            link = url_base + a['href']

            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body_div = body_soup.find('div', class_="region region-content")
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        page += 1

# Yukon

In [12]:
def _load_yukon(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Yukon news listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of the Yukon.
    """
    region = 'Yukon'
    sub_region = ''

    url_base = "https://yukon.ca"
    page = 0  # the listing's page parameter is zero-based
    rows = []

    while True:
        url = url_base + "/news?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page + 1))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        main_div = soup.find('div', class_="view-content")
        if main_div is not None:
            divs = main_div.find_all('div', class_=re.compile('views-row(.*)'))
        else:
            divs = []

        # No entries: we ran past the last page. Without this check the loop
        # would keep requesting pages forever.
        if not divs:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for div in divs:
            # The entry's <small> element holds its date, e.g. "July 10, 2020".
            date_str = div.find('small').text
            ar_date = datetime.strptime(date_str, "%B %d, %Y")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            a = div.find('a')
            title = a.text
            link = url_base + a['href']

            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body_div = body_soup.find('div', class_="region region-content")
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        page += 1

# Prince Edward Island

In [13]:
def _load_pei(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Prince Edward Island news listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Prince Edward Island.
    """
    region = 'Prince Edward Island'
    sub_region = ''

    url_base = "https://www.princeedwardisland.ca"
    page = 0  # the listing's page parameter is zero-based
    rows = []

    while True:
        url = url_base + "/news?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page + 1))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        divs = soup.find_all('div', class_="right content views-fieldset")

        # No entries: we ran past the last page. Without this check the loop
        # would keep requesting pages forever.
        if not divs:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        for div in divs:
            date_str = div.find('div', class_="date").text
            ar_date = datetime.strptime(date_str, "%A, %B %d, %Y")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            a = div.find('a')
            title = a.text
            link = url_base + a['href']

            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body_div = body_soup.find('div', class_="maincontentmain")
            if body_div is None:
                # Keep the row rather than aborting the whole scrape.
                if verbose:
                    print("Couldn't retrieve full text for link: ", link)
                body = ""
            else:
                body = body_div.text

            rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        page += 1

# Alberta

In [14]:
def _load_alberta(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape news releases from the Government of Alberta newsroom XML feed.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Alberta.
    """
    # Local import keeps this block self-contained.
    from email.utils import parsedate_to_datetime

    region = 'Alberta'
    sub_region = ''

    # The feed is windowed by a number of days; ask for one extra day so the
    # partial day at the `since` boundary is covered.
    days_back = (datetime.today() - since).days
    url = "https://www.alberta.ca/NewsRoom/newsroom.cfm?numDaysBack=" + str(days_back + 1)

    response = requests.get(url)
    feed = BeautifulSoup(response.content, 'xml')

    links = [node.text for node in feed.find_all('link')[2:]]  # First two links are not articles
    titles = [node.text for node in feed.find_all('title')[2:]]  # First two titles are not articles
    pub_dates = [node.text for node in feed.find_all('pubDate')]

    rows = []
    for link, title, pub_date in zip(links, titles, pub_dates):
        # Accept any UTC offset (a literal "-0600" in the format rejects every
        # other offset) and keep the feed's wall-clock time as a naive datetime.
        ar_date = parsedate_to_datetime(pub_date).replace(tzinfo=None)

        # The day window over-fetches slightly; skip releases older than
        # `since` before paying for the article request.
        if ar_date < since:
            continue

        ar_page_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
        ar_main = ar_page_soup.find('main')
        body_div = None
        if ar_main is not None:
            body_div = ar_main.find('div', class_="goa-grid-100-100-100")

        if body_div is None:
            # Keep the row rather than aborting the whole scrape.
            if verbose:
                print("Couldn't retrieve full text for link: ", link)
            body = ""
        else:
            body = body_div.text

        rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

    return pd.DataFrame(rows, columns=columns)

# Quebec

In [15]:
def _load_quebec(since=datetime(2020, 1, 1), verbose=True):
    """
    Scrape English news releases from the Government of Quebec "fil d'information" listing.

    Parameters: 
        - `since` 
            datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Quebec.
    """
    region = 'Quebec'
    sub_region = ''

    url_base = "http://www.fil-information.gouv.qc.ca/Pages/Articles.aspx?lang=en&Page="
    page = 1
    rows = []

    while True:
        url = url_base + str(page)
        if verbose:
            print("Searching page {}".format(page))

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        sections = soup.find_all('section', {"id" : "articles"})

        found_link = False
        for section in sections:
            # NOTE(review): every link in a section is stamped with the date of
            # the section's first <time> element — confirm a section never
            # spans several days.
            date_str = section.time['datetime']
            ar_date = datetime.strptime(date_str, "%Y-%m-%d")

            if ar_date < since:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=columns)

            for a in section.find_all('a'):
                found_link = True

                link = a['href']
                # Titles contain embedded line breaks; strip them.
                title = a.text.replace('\r', '').replace('\n', '')

                body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
                body_div = body_soup.find('div', class_="article")
                if body_div is None:
                    # Keep the row rather than aborting the whole scrape.
                    if verbose:
                        print("Couldn't retrieve full text for link: ", link)
                    body = ""
                else:
                    body = body_div.text

                rows.append([ar_date, country, region, sub_region, link, src_cat, title, body])

        # A page without any article link means we ran past the last page.
        # Without this check the loop would keep requesting pages forever.
        if not found_link:
            if verbose:
                print('No articles found.')
            return pd.DataFrame(rows, columns=columns)

        page += 1

# Update CSVs

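These functions read each province's stored CSV, scrape only the releases published since the newest stored `start_date`, merge the new rows with the stored ones (dropping duplicates), and write the CSV back. This avoids re-scraping everything back to January 1st, which takes a long time.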

In [56]:
def _update_csv(csv_path, scraper, verbose):
    """
    Refresh one stored CSV with the releases published since its newest row.

    Parameters: 
        - `csv_path`
            string, path of the CSV to read and overwrite
        - `scraper`
            callable accepting `since` and `verbose` keyword arguments and returning a DataFrame of releases
        - `verbose`
            boolean, forwarded to `scraper`

    Returns: the merged DataFrame (new rows first), which is also written back to `csv_path`.
    """
    stored = pd.read_csv(csv_path)
    # to_csv() below writes the index, which read_csv() returns as
    # 'Unnamed: 0'; tolerate files that were saved without it.
    stored = stored.drop('Unnamed: 0', axis=1, errors='ignore')
    stored["start_date"] = pd.to_datetime(stored["start_date"])

    # Only scrape back to the newest release already on disk.
    largest_date = stored["start_date"].max()
    new_additions = scraper(since=largest_date, verbose=verbose)

    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # "new rows first" ordering. Skip the concat when nothing new was found so
    # the stored dtypes are left untouched.
    if new_additions.empty:
        merged = stored
    else:
        merged = pd.concat([new_additions, stored])

    # Releases published at exactly `largest_date` are scraped again; drop them.
    df = merged.drop_duplicates(['source_full_text', 'source_url'])
    df.to_csv(csv_path)
    return df


def load_ontario(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Ontario.
    """
    return _update_csv('sources/ontario.csv', _load_ontario, verbose)


def load_quebec(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Quebec.
    """
    return _update_csv('sources/quebec.csv', _load_quebec, verbose)


def load_northwest_territories(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of the Northwest Territories.
    """
    return _update_csv('sources/northwestterritories.csv', _load_northwest_territories, verbose)


def load_yukon(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of the Yukon.
    """
    return _update_csv('sources/yukon.csv', _load_yukon, verbose)


def load_new_brunswick(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of New Brunswick.
    """
    return _update_csv('sources/newbrunswick.csv', _load_new_brunswick, verbose)


def load_nova_scotia(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Nova Scotia.
    """
    return _update_csv('sources/novascotia.csv', _load_nova_scotia, verbose)


def load_alberta(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Alberta.
    """
    return _update_csv('sources/alberta.csv', _load_alberta, verbose)


def load_manitoba(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Manitoba.
    """
    return _update_csv('sources/manitoba.csv', _load_manitoba, verbose)


def load_pei(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Prince Edward Island.
    """
    return _update_csv('sources/pei.csv', _load_pei, verbose)


def load_british_columbia(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of British Columbia.
    """
    return _update_csv('sources/britishcolumbia.csv', _load_british_columbia, verbose)


def load_nunavut(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Nunavut.
    """
    return _update_csv('sources/nunavut.csv', _load_nunavut, verbose)


def load_saskatchewan(verbose=True):
    """
    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of Saskatchewan.
    """
    return _update_csv('sources/saskatchewan.csv', _load_saskatchewan, verbose)

def load_provinces(verbose=False):
    """
    Refresh the stored CSV of every supported province and territory.

    Parameters: 
        - `verbose`
            boolean, whether or not the function should print updates (False by default)

    Returns: a dictionary mapping the names of provinces and territories to DataFrames containing information about their new releases.
    """
    # NOTE(review): the key 'new brunsiwck' is misspelled; it is kept as-is
    # because callers may already index the result with it.
    loaders = [
        ('alberta', load_alberta),
        ('british columbia', load_british_columbia),
        ('manitoba', load_manitoba),
        ('new brunsiwck', load_new_brunswick),
        ('northwest territories', load_northwest_territories),
        ('nova scotia', load_nova_scotia),
        ('nunavut', load_nunavut),
        ('ontario', load_ontario),
        ('pei', load_pei),
        ('quebec', load_quebec),
        ('saskatchewan', load_saskatchewan),
        ('yukon', load_yukon),
    ]

    # Loaders run one after another, in the order listed above.
    results = {}
    for name, loader in loaders:
        results[name] = loader(verbose)
    return results

In [59]:
%%time
# Refresh every province/territory CSV and keep the resulting DataFrames, keyed by name.
l = load_provinces()

NameError: name 'pei' is not defined