# Automatic Source Retrieval

Done:
- Ontario
- Manitoba
- British Columbia
- New Brunswick
- Nova Scotia
- Northwest Territories
- Saskatchewan

TODO:
- Nunavut
- Yukon
- Alberta
- Prince Edward Island
- Quebec
- Newfoundland and Labrador

In [None]:
# Install the third-party packages imported in the next cell (bs4, feedparser).
!python3 -m pip install bs4
!python3 -m pip install feedparser

In [None]:
import requests
import urllib.request
import datetime
from bs4 import BeautifulSoup
import pandas as pd
import feedparser
from datetime import date
from datetime import datetime
import re

In [None]:
# Values that are identical for every row produced by the loaders in this notebook.
country = 'Canada'
src_cat = 'Government Website'

# Column order of the DataFrames the load_* functions build.
columns = [
    'start_date',
    'country',
    'region',
    'subregion',
    'source_url',
    'source_category',
    'source_title',
    'source_full_text',
]

# Ontario

Ontario lists its most recent news on the first page of the search results, so the loader
keeps requesting further pages until one comes back with no articles. Generally ~4 pages capture ~2 weeks.

In [None]:
def load_ontario():
    """
    Returns: a DataFrame containing news releases from the government of Ontario.

    The news.ontario.ca search is queried for everything published between
    2020/01/01 and today. Result pages are requested one after another until a
    page contains no <article> elements; every release found on the way is
    fetched to capture its full text.
    """

    # The search URL expects the end date as YYYY%2FMM%2FDD (URL-encoded slashes).
    today_str = str(date.today()).replace('-', '%2F')

    base_url = 'https://news.ontario.ca/en/search?content_type=all&utf8=%E2%9C%93&date_range_end=' + today_str + '&date_range_start=2020%2F01%2F01&date_select=desc&page='

    region = 'Ontario'
    subregion = ''

    # Specific structure for the news.ontario.ca search results
    links = []
    page = 1
    while True:
        print('Searching page ', page)

        response = requests.get(base_url + str(page))
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all('article')

        # An empty result page marks the end of the search results.
        if not articles:
            print('No articles found.')
            return pd.DataFrame(links, columns=columns)

        for article in articles:
            # The first anchor of each entry carries both the URL and the headline.
            anchor = article.find_all('a')[0]
            link = anchor['href']
            title = anchor.string
            # Dots are stripped from the timestamp (e.g. "a.m." -> "am") so %p can parse it.
            pub_date = datetime.strptime(article.time.string.replace('.', ''), "%B %d, %Y %I:%M %p")

            article_response = requests.get(link)
            linksoup = BeautifulSoup(article_response.text, "html.parser")
            full_text = linksoup.article.text

            row = [pub_date, country, region, subregion, link, src_cat, title, full_text]
            links.append(row)

        page += 1
            

In [None]:
%%time

# Scrape the Ontario releases; %%time reports how long the cell took.
df = load_ontario()

In [None]:
# Save first: a notebook cell only displays the value of its last expression,
# so df.shape has to come last to be shown.
df.to_csv('sources/ontario.csv')
df.shape

# Manitoba

Retrieve all news releases in 2020 for the Province of Manitoba.

In [None]:
def load_manitoba(year=2020):
    """
    Returns: a DataFrame containing news releases from the government of Manitoba.

    Parameters: int, the calendar year whose monthly news index pages are scraped. By default, 2020.

    One index page is requested per month (January through December). Every
    release linked from an index page is fetched to read its publication date
    and full text. Dates are stored as 'MM/DD/YYYY' strings.
    """

    url_base = 'https://news.gov.mb.ca'
    # The end of range() is exclusive, so 13 is required to include December.
    targets = [
        url_base + '/news/index.html?month=' + str(month) + '&year=' + str(year) + '&day=01&bgnG=GO&d='
        for month in range(1, 13)
    ]

    region = 'Manitoba'
    subregion = ''

    links = []
    for target in targets:
        print(target)
        response = requests.get(target)
        soup = BeautifulSoup(response.text, "html.parser")

        # Release headlines are the <h2> elements inside the "maincontent" div(s).
        for container in soup.find_all("div", {"class": "maincontent"}):
            for heading in container.find_all('h2'):
                a = heading.a
                relative_link = a['href']
                # hrefs are written relative to the index page ("../..."); keep the part after the last "..".
                link = url_base + relative_link.split('..')[-1]
                title = a.string

                # Get this link and copy the date and full text
                article_response = requests.get(link)
                linksoup = BeautifulSoup(article_response.text, "html.parser")

                date_text = linksoup.find_all("span", {"class": "article_date"})[0].string
                published = pd.to_datetime(date_text, format='%B %d, %Y')
                pub_date = published.strftime('%m/%d/%Y')

                full_text = linksoup.find_all("div", {"class": ""})[0].text

                row = [pub_date, country, region, subregion, link, src_cat, title, full_text]
                links.append(row)

    return pd.DataFrame(links, columns=columns)

In [None]:
%%time

# Scrape the Manitoba releases, save them, then display the DataFrame's shape.
df = load_manitoba()
df.to_csv('sources/manitoba.csv')
df.shape

# British Columbia

In [None]:
def load_british_columbia():
    """
    Returns: a DataFrame containing news releases from the government of British Columbia.

    The news.gov.bc.ca search (FromDate=01/01/2020) is paged through until a
    page contains no article entries. Each release is fetched for its full
    text; when the release page has no <article> element the text is left
    empty and a message is printed. Dates are stored as 'MM/DD/YYYY' strings.
    """

    region = 'British Columbia'
    subregion = ''

    query_url = 'https://news.gov.bc.ca/Search?FromDate=01/01/2020&Page='
    links = []
    page = 1

    while True:
        print("Page ", page)
        response = requests.get(query_url + str(page))
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all("div", {"class": "article"})

        # An empty page marks the end of the search results.
        if not items:
            return pd.DataFrame(links, columns=columns)

        for article in items:
            title = article.a.string

            date_text = article.find_all("div", {"class": "item-date"})[0].string
            published = pd.to_datetime(date_text)
            pub_date = published.strftime('%m/%d/%Y')

            link = article.a['href']

            article_response = requests.get(link)
            linksoup = BeautifulSoup(article_response.text, "html.parser")
            get_article = linksoup.find_all("article")
            if get_article:
                full_text = get_article[0].text
            else:
                # Best effort: keep the row even when the body cannot be extracted.
                print("Couldn't retrieve full text for link: ", link)
                full_text = ""

            row = [pub_date, country, region, subregion, link, src_cat, title, full_text]
            links.append(row)

        page += 1

In [None]:
%%time

# Scrape the British Columbia releases and display the DataFrame's shape.
df = load_british_columbia()
df.shape

In [None]:
# Write the British Columbia releases to the sources/ directory.
df.to_csv('sources/britishcolumbia.csv')

# New Brunswick

In [None]:
def load_new_brunswick(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of New Brunswick.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    The listing is requested 25 entries at a time. The search stops at the first
    release dated before `since`, or when a page contains no dated releases
    (the listing has been exhausted).
    """
    from urllib.parse import urljoin  # function-scope import: not among the notebook's top-level imports

    region = 'New Brunswick'
    sub_region = ''
    
    url_base = "https://www2.gnb.ca/"
    url = url_base + "content/gnb/en/news/recent_news.html?mainContent_par_newslist_start="
    page_size = 25  # articles per page
    start = 0
    links = []
    
    while True:
        print("Page {}".format(str(start // page_size + 1)))
        request = requests.get(url + str(start))
        soup = BeautifulSoup(request.content, "html.parser")

        article_div = soup.find('div', class_="none padded")
        articles = article_div.find_all('li') if article_div is not None else []

        found_dated = False
        for article in articles:
            date_span = article.find('span', class_="post_date")
            if date_span is None:  # list entry does not correspond to a dated article
                continue
            found_dated = True

            # Date
            ar_date = datetime.strptime(date_span.text, "%d %B %Y")

            if ar_date < since:  # only collect data after specified date
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)

            a = article.a
            # Title
            title = a.text
            # Body
            # urljoin avoids a doubled slash when the href already starts with "/".
            link = urljoin(url_base, a['href'])
            article_page = requests.get(link)
            body_soup = BeautifulSoup(article_page.content, 'html.parser')
            body = body_soup.find('div', class_="articleBody").text

            row = [ar_date, country, region, sub_region, link, src_cat, title, body]
            links.append(row)

        # Without this check an exhausted listing would be requested forever.
        if not found_dated:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)

        start += page_size

In [None]:
%%time

df = pd.DataFrame(load_new_brunswick(), columns=columns)
df.shape

In [None]:
# Write the New Brunswick releases to the sources/ directory.
df.to_csv('sources/newbrunswick.csv')

# Nova Scotia

In [None]:
def load_nova_scotia(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of Nova Scotia.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    Entries whose title is marked lang="fr" are skipped. The search stops at the
    first release dated before `since`, or when a result page lists no releases.
    """
    region = 'Nova Scotia'
    sub_region = ''
    
    url_base = "https://novascotia.ca/news"
    page = 1
    
    links = []
    
    while True:
        url = url_base + "/search/?page=" + str(page)
        print("Searching page {}".format(page))
        
        request = requests.get(url)
        soup = BeautifulSoup(request.content, 'html.parser')
        
        # Each release is a <dt class="RelTitle"> paired with a <dd class="RelSummary">.
        titles = soup.find_all('dt', class_="RelTitle")
        summaries = soup.find_all('dd', class_="RelSummary")

        # Without this check an exhausted listing would be requested forever.
        if not titles:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)
        
        for title, summary in zip(titles, summaries):
            
            # .get() tolerates titles that carry no lang attribute at all.
            if title.get('lang') == "fr":
                continue
                        
            ar_date = datetime.strptime(summary.time.text, "%B %d, %Y - %I:%M %p")
            
            if ar_date < since:
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)
            
            # hrefs are written relative to the search page ("../..."); keep what follows the first "..".
            relative_link = title.a['href'].split('..', 1)[1]
            link = url_base + relative_link
            
            ar_request = requests.get(link)
            ar_soup = BeautifulSoup(ar_request.content, 'html.parser')
            body = ar_soup.find('div', {'id' : 'releaseBody'}).text
            
            row = [ar_date, country, region, sub_region, link, src_cat, title.text, body]
            links.append(row)

        page += 1

In [None]:
%%time

# Scrape the Nova Scotia releases and display the DataFrame's shape.
df = load_nova_scotia()
df.shape

In [None]:
# Write the Nova Scotia releases to the sources/ directory.
df.to_csv('sources/novascotia.csv')

In [None]:
def load_northwest_territories(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of the Northwest Territories.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    Newsroom pages are numbered from 0. The search stops at the first release
    dated before `since`, or when a page contains no release entries.
    """
    from urllib.parse import urljoin  # function-scope import: not among the notebook's top-level imports

    region = 'Northwest Territories'
    sub_region = ''
    
    url_base = "https://www.gov.nt.ca/"
    page = 0
    row_class = re.compile('views-row')  # regex accounts for inconsistent `div` class names
    
    links = []
    
    while True:
        url = url_base + "en/newsroom?page=" + str(page)
        print("Searching page {}".format(page + 1))
        
        request = requests.get(url)
        soup = BeautifulSoup(request.content, 'html.parser')
        
        ar_boxes = soup.find_all('div', class_=row_class)

        # Without this check an exhausted listing would be requested forever.
        if not ar_boxes:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)
        
        for box in ar_boxes:
            # The first <span> of an entry holds its date, the first <a> its title and link.
            date_str = box.find('span').text
            ar_date = datetime.strptime(date_str, "%B %d, %Y")
            
            if ar_date < since: 
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)
            
            title_a = box.find('a')
            title = title_a.text
            
            # urljoin avoids a doubled slash when the href already starts with "/".
            link = urljoin(url_base, title_a['href'])
            ar_req = requests.get(link)
            ar_soup = BeautifulSoup(ar_req.content, 'html.parser')
            body = ar_soup.find('div', class_ = "field-item even").text
            
            row = [ar_date, country, region, sub_region, link, src_cat, title, body]
            links.append(row)
            
        page += 1

In [None]:
%%time

# Scrape the Northwest Territories releases and write them to the sources/ directory.
df = load_northwest_territories()
df.to_csv('sources/northwestterritories.csv')

# Saskatchewan

In [None]:
def load_saskatchewan(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of Saskatchewan.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    The search stops at the first release dated before `since`, or when a page
    has no results list or an empty one.
    """
    
    region = 'Saskatchewan'
    sub_region = ''
    
    url_base = "https://www.saskatchewan.ca/government/news-and-media?page="
    page = 1
    
    links = []
    
    while True:
        url = url_base + str(page)
        print("Searching page {}".format(page))
        
        request = requests.get(url)
        soup = BeautifulSoup(request.content, 'html.parser')
        
        article_list = soup.find('ul', class_="results")
        list_items = article_list.find_all('li') if article_list is not None else []

        # Without this check an exhausted listing would be requested forever.
        if not list_items:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)
        
        for item in list_items:
            
            # The date is read from the <time> element's datetime attribute.
            date_str = item.time['datetime']
            ar_date = datetime.strptime(date_str, "%Y-%m-%d")
            
            if ar_date < since: 
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)
            
            title = item.a.text
            link = item.a['href']
            
            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body = body_soup.find('section', class_="general-content").text
            
            row = [ar_date, country, region, sub_region, link, src_cat, title, body]
            links.append(row)
            
        page += 1

In [None]:
%%time

# Scrape the Saskatchewan releases and write them to the sources/ directory.
df = load_saskatchewan()
df.to_csv('sources/saskatchewan.csv')

# Nunavut

In [None]:
def load_nunavut(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of Nunavut.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    News pages are numbered from 0. The search stops at the first release dated
    before `since`, or when a page contains no release entries.
    """
    
    region = 'Nunavut'
    sub_region = ''
    
    url_base = "https://gov.nu.ca"
    page = 0
    row_class = re.compile('views-row(.*)')  # compiled once; matches the class of each entry's div
    
    links = []
    
    while True:
        url = url_base + "/news?page=" + str(page)
        print("Searching page {}".format(page + 1))
        
        request = requests.get(url)
        soup = BeautifulSoup(request.content, 'html.parser')
        
        main_section = soup.find('section', {"id" : "block-system-main"})
        divs = main_section.find_all('div', class_=row_class) if main_section is not None else []

        # Without this check an exhausted listing would be requested forever.
        if not divs:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)
        
        for div in divs:
            
            date_str = div.find('span', class_="date-display-single").text
            ar_date = datetime.strptime(date_str, "%d %B %Y")
            
            if ar_date < since: 
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)
            
            # The first anchor of an entry carries the title and the site-relative link.
            a = div.find('a')
            title = a.text
            link = url_base + a['href']
                        
            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body = body_soup.find('div', class_="region region-content").text
            
            row = [ar_date, country, region, sub_region, link, src_cat, title, body]
            links.append(row)
            
        page += 1

In [None]:
%%time

# Scrape the Nunavut releases and write them to the sources/ directory.
df = load_nunavut()
df.to_csv('sources/nunavut.csv')

# Yukon

In [84]:
def load_yukon(since=datetime(2020, 1, 1)):
    """
    Returns: a DataFrame containing news releases from the government of the Yukon.
    
    Parameters: datetime object, the date of the earliest news release to be retrieved. By default, only the releases published since Jan 1 2020 are retrieved.

    News pages are numbered from 0. The search stops at the first release dated
    before `since`, or when a page contains no release entries.
    """
    
    region = 'Yukon'
    sub_region = ''
    
    url_base = "https://yukon.ca"
    page = 0
    row_class = re.compile('views-row(.*)')  # compiled once; matches the class of each entry's div
    
    links = []
    
    while True:
        url = url_base + "/news?page=" + str(page)
        print("Searching page {}".format(page + 1))
        
        request = requests.get(url)
        soup = BeautifulSoup(request.content, 'html.parser')
        
        main_div = soup.find('div', class_ = "view-content")
        divs = main_div.find_all('div', class_=row_class) if main_div is not None else []

        # Without this check an exhausted listing would be requested forever.
        if not divs:
            print("No more articles found; stopping search.")
            return pd.DataFrame(links, columns=columns)
        
        for div in divs:
            
            # The date is read from the entry's <small> element.
            date_str = div.find('small').text
            ar_date = datetime.strptime(date_str, "%B %d, %Y")
            
            if ar_date < since: 
                print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(links, columns=columns)
            
            # The first anchor of an entry carries the title and the site-relative link.
            a = div.find('a')
            title = a.text
            link = url_base + a['href']
                        
            body_soup = BeautifulSoup(requests.get(link).content, 'html.parser')
            body = body_soup.find('div', class_="region region-content").text
            row = [ar_date, country, region, sub_region, link, src_cat, title, body]
            links.append(row)
            
        page += 1

In [85]:
%%time

# Scrape the Yukon releases and write them to the sources/ directory.
df = load_yukon()
df.to_csv('sources/yukon.csv')

Searching page 1
Searching page 2
Searching page 3
Searching page 4
Searching page 5
Searching page 6
Searching page 7
Searching page 8
Searching page 9
Searching page 10
Searching page 11
Searching page 12
Searching page 13
Searching page 14
Searching page 15
Searching page 16
Searching page 17
Searching page 18
Searching page 19
Searching page 20
Stopping search at date 2019-12-31 00:00:00
CPU times: user 1min 6s, sys: 1.13 s, total: 1min 8s
Wall time: 4min 27s
