In [None]:
from pymongo import MongoClient
import pandas as pd
import json
import re

import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt_tab' tokenizer data up front; presumably this is what
# word_tokenize (imported above) needs at call time — TODO confirm against the
# installed NLTK version.
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to C:\Users\James has a
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

C:\Users\James has a PC\AppData\Roaming\nltk_data

In [13]:
# Read the database password from a local JSON config file so the secret is
# never written into the notebook itself. JSON is UTF-8 by specification, so
# do not rely on the platform's default encoding.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

password = config["MONGO_PASSWORD"]

# MongoDB connection
# Credentials are passed as keyword arguments instead of being interpolated
# into the URI: inside a URI they would have to be percent-escaped, and a
# password containing characters such as '@', ':', '/' or '%' would otherwise
# produce a malformed connection string.
client = MongoClient(
    "mongodb+srv://jamesbcluster.wdq3i.mongodb.net/",
    username="bootsmajames",
    password=password,
)
db = client["bank_of_canada"]
collection = db["monetary_policy_reports"]

# Load data into a pandas DataFrame (one row per document in the collection)
data = pd.DataFrame(list(collection.find()))

In [25]:
def clean_text(text):
    """Normalize a raw text field.

    Non-string values are returned unchanged so the function can be applied
    to columns that contain missing values.

    For strings, the steps are, in order:
      1. Remove HTML tags, including tags that are broken across lines.
      2. Remove every character that is not a word character, whitespace,
         a period or a comma.
      3. Strip leading/trailing whitespace and lowercase the result.

    Args:
        text: The value to clean; any type is accepted.

    Returns:
        The cleaned, lowercased string, or ``text`` itself if it is not a str.
    """
    if not isinstance(text, str):
        return text
    # `[^>]*` rather than `.*?`: `.` does not match a newline, so a tag split
    # over several lines would survive this step and the next substitution
    # would only delete its angle brackets, leaving the tag name and
    # attributes behind in the text.
    text = re.sub(r'<[^>]*>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s.,]', '', text)  # Keep only word chars, whitespace, '.' and ','
    return text.strip().lower()  # Normalize case and trim

# Run the same cleaner over every free-text column, overwriting each column
# with its cleaned version.
for text_column in ("lead", "pdf_text", "pr_title", "pr_body"):
    data[text_column] = data[text_column].apply(clean_text)

In [31]:
def tokenize_text(text):
    """Split a string into tokens with NLTK's ``word_tokenize``.

    Anything that is not a str is handed back untouched, so the function can
    be applied to columns that contain non-string values.
    """
    if isinstance(text, str):
        return word_tokenize(text)
    return text

# Each cleaned text column gets a sibling column holding its token list.
token_columns = {
    "lead": "lead_tokens",
    "pdf_text": "pdf_text_tokens",
    "pr_title": "pr_title_tokens",
    "pr_body": "pr_body_tokens",
}
for source_column, target_column in token_columns.items():
    data[target_column] = data[source_column].apply(tokenize_text)

In [43]:
def chunk_tokens(tokens, chunk_size=500):
    """Split a token sequence into consecutive chunks of at most ``chunk_size``.

    Args:
        tokens: A sliceable sequence with a length (typically a list of
            tokens). Values that have no length, such as ``None`` or a float
            NaN standing in for a missing document, produce an empty list
            instead of raising ``TypeError``.
        chunk_size: Maximum number of items per chunk. Defaults to 500.

    Returns:
        A list of slices of ``tokens``; the last one may be shorter than
        ``chunk_size``. Empty or missing input yields ``[]``.
    """
    # tokenize_text passes non-string values through unchanged, so a missing
    # document arrives here as None/NaN; len() on those would raise.
    if not hasattr(tokens, "__len__"):
        return []
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# Break every document's token list into 500-token chunks; the keyword
# argument is forwarded to chunk_tokens by Series.apply.
data["pdf_chunks"] = data["pdf_text_tokens"].apply(chunk_tokens, chunk_size=500)

In [45]:
def _row_to_llm_input(row):
    """Bundle one row's token columns into the dict stored in ``llm_input``."""
    return dict(
        summary=row["lead_tokens"],
        interest_rate_decision=row["pr_title_tokens"],
        decision_reasoning=row["pr_body_tokens"],
        report_chunks=row["pdf_chunks"],
    )


# axis=1 calls the helper once per row.
data["llm_input"] = data.apply(_row_to_llm_input, axis=1)

In [46]:
# Export the llm_input column as line-delimited JSON (orient="records" with
# lines=True writes one record per line).
data["llm_input"].to_json(
    "llm_input_data.json",
    orient="records",
    lines=True,
)