In [1]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from markitdown import MarkItDown
from pymongo import MongoClient

# Initialize the Markdown converter.
# A single shared instance: BuildReport calls md.convert() on each report's
# PDF URL and stores the result's text_content on the report.
md = MarkItDown()

In [40]:
class MonetaryPolicyReport:
    """Plain container for the data scraped about one monetary policy report.

    Only the title is supplied at construction time; every other field starts
    as an empty string and is assigned later by whoever populates the report.
    """

    def __init__(self, title):
        self.title = title
        # All remaining fields default to empty strings (assigned left to
        # right, so the attribute order matches the key order of to_dict()).
        self.date = self.url = ""
        self.lead = self.decision = self.body = ""
        self.pdf_url = self.pdf_text = ""

    def to_dict(self):
        """Return the report's fields as a new dictionary keyed by field name."""
        return dict(
            title=self.title,
            date=self.date,
            url=self.url,
            lead=self.lead,
            decision=self.decision,
            body=self.body,
            pdf_url=self.pdf_url,
            pdf_text=self.pdf_text,
        )

In [3]:
def GetReportType(soup):
    """Classify a report page by which marker elements its HTML contains.

    Args:
        soup (BeautifulSoup): Parsed HTML content of the report page.

    Returns:
        int: -1 when neither PDF container div ("lead no-print" or
        "post-formats lead") is present; otherwise 1 when an
        h1 "bocss-hero__title" exists, 2 when a p "lead" exists,
        and 3 in every remaining case.
    """
    # Look up both candidate PDF containers; a page with neither is unsupported.
    pdf_containers = [
        soup.find('div', class_=css_class)
        for css_class in ("lead no-print", "post-formats lead")
    ]
    if all(container is None for container in pdf_containers):
        return -1

    # The hero title takes precedence over the lead paragraph.
    if soup.find('h1', class_="bocss-hero__title"):
        return 1

    return 2 if soup.find('p', class_='lead') else 3

In [12]:
def BuildReport(media_body, timeout=30):
    """Builds a monetary policy report object from the given media body.

    The listing entry supplies the title, date and report URL. The report page
    is then downloaded and parsed for the lead text, the decision heading, the
    "Overview" body (type 1 pages only) and the PDF link, and the PDF is
    converted to text with the module-level ``md`` converter.

    Elements missing from the page leave the corresponding field as the empty
    string instead of raising, so one oddly formatted page does not abort a
    whole crawl.

    Args:
        media_body (Tag): HTML tag containing the media body information.
        timeout (float): Seconds to wait for the report page request before
            giving up. Defaults to 30.

    Returns:
        MonetaryPolicyReport | None: The populated report, or None when the
        listing entry has no link or the report type cannot be determined.

    Raises:
        requests.RequestException: If the report page cannot be fetched
            (including when the timeout expires).
    """
    # Extract the report title and URL from the media body
    media_body_a = media_body.find('a')
    if media_body_a is None or not media_body_a.get('href'):
        print("Skipping listing entry without a report link")
        return None

    report = MonetaryPolicyReport(media_body_a.text)
    report.url = media_body_a.get('href')

    date_span = media_body.find('span', class_="bocss-margin-left-medium media-date pull-right")
    if date_span is not None:
        report.date = date_span.text

    # Fetch and parse the report page. Without a timeout, requests waits
    # indefinitely on a stalled connection.
    page = requests.get(report.url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    report_type = GetReportType(soup)

    if report_type == -1:
        print(f"Unable to determine report type for: {report.title}")
        return None

    # Extract lead paragraph. Type 1 is decided by the hero title alone, so a
    # <p class="lead"> is not guaranteed to exist there.
    if report_type <= 2:
        lead_tag = soup.find('p', class_='lead')
    else:
        lead_tag = soup.find('div', class_='post-content')
    if lead_tag is not None:
        report.lead = lead_tag.text

    # The decision is the last call-out heading on the page, when there is one.
    decision_headings = soup.find_all('h2', class_='cfct-mod-title post-callout-title')
    if decision_headings:
        report.decision = decision_headings[-1].text.strip()

    # Extract body for report type 1
    if report_type == 1:
        for cfct_model in soup.find_all('div', class_="cfct-module cfct-widget-module-bochtml"):
            h2 = cfct_model.find('h2')
            if h2 is not None and h2.text == 'Overview':
                report.body = '\n'.join(p.text for p in cfct_model.find_all('p'))
                break

    # Locate the PDF container. GetReportType only guarantees that one of the
    # two containers exists, so fall back to the other when the one preferred
    # for this report type is absent.
    if report_type == 1:
        preferred_class, fallback_class = "lead no-print", "post-formats lead"
    else:
        preferred_class, fallback_class = "post-formats lead", "lead no-print"
    pdf_div = soup.find('div', class_=preferred_class)
    if pdf_div is None:
        pdf_div = soup.find('div', class_=fallback_class)

    pdf_link = pdf_div.find('a') if pdf_div is not None else None
    pdf_href = pdf_link.get('href') if pdf_link is not None else None
    if not pdf_href:
        print(f"No PDF link found for: {report.title}")
        return report

    # Resolve relative links against the report page; absolute links are
    # returned unchanged by urljoin.
    report.pdf_url = urljoin(report.url, pdf_href)

    # Convert the PDF content using MarkItDown (the URL is passed directly)
    result = md.convert(report.pdf_url)
    report.pdf_text = result.text_content

    return report

In [42]:
# Fetch and process reports from multiple pages.
# The listing is paginated through the mt_page query parameter; pages 1-12 are
# requested (the page count is hard-coded).
reports = []
for i in range(1, 13):
    url = f"https://www.bankofcanada.ca/publications/mpr/?mt_page={i}"
    # Bound the wait so a stalled connection cannot hang the whole crawl.
    page = requests.get(url, timeout=30)
    # Name the parser explicitly: the generic 'html' feature lets Beautiful
    # Soup pick whichever parser is installed, so results could differ between
    # environments and from BuildReport, which uses 'html.parser'.
    soup = BeautifulSoup(page.text, 'html.parser')

    media_bodies = soup.find_all('div', class_='media-body')

    for media_body in media_bodies:
        # BuildReport returns None for entries it cannot process; skip those.
        report = BuildReport(media_body)
        if report:
            reports.append(report)

Unable to determine report type for: Qualitative Research on the Monetary Policy Report – Key Findings, 2009


In [45]:
# Read the connection string from the environment rather than embedding a
# username and password in the notebook. Set MONGODB_URI before running this
# cell; a missing variable raises KeyError.
# NOTE(review): credentials that were previously stored in this cell should be
# treated as exposed and rotated.
client = MongoClient(os.environ["MONGODB_URI"])
db = client["bank_of_canada"]
collection = db["monetary_policy_reports"]

# Send all documents in one batch instead of one round-trip per report.
# insert_many rejects an empty list, so guard for the no-results case.
documents = [report.to_dict() for report in reports]
if documents:
    collection.insert_many(documents)
    print("Data inserted successfully!")
else:
    print("No reports to insert.")

  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\James has a PC\OneDrive\Documents\Projects\LLM\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\James has a PC\OneDrive\Documents\Projects\LLM\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\James has a PC\OneDrive\Documents\Projects\LLM\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\James has a PC\OneDrive\Documents\Projects\LLM\.venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\James has a PC\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 607, in run_forever
    self._run_once()
  File "C:\Users\James has a PC\AppData\Local\Programs\Python\Python311\Lib\asyncio\bas

Data inserted successfully!
