In [1]:
# importing
from bs4 import BeautifulSoup, SoupStrainer, NavigableString
import requests
import os
from dotenv import load_dotenv

# langchain
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

In [45]:
# creating scrapable objects from 2 landing pages (acts and regulations)

landing_pages = ["https://laws-lois.justice.gc.ca/eng/acts/", "https://laws-lois.justice.gc.ca/eng/regulations/"]

# creating a list of doc titles
doc_titles_set = set()

doc_titles = []
doc_links = []

for landing_page in landing_pages:
  # creating a soup for each page
  page = requests.get(landing_page)
  soup = BeautifulSoup(page.text, features="html.parser")


  alpha_link = ""

  # entering all links for alphabetical act/regulation pages into list
  for item in soup.find_all("a"):
    try:
      if item["class"] == ["btn", "btn-default"]:
        alpha_page = item["href"]
        alpha_link = f"{landing_page}{alpha_page}"

        # soup alphabetical link
        alpha_page = requests.get(alpha_link)
        alpha_soup = BeautifulSoup(alpha_page.text, features="html.parser")

        # soup for each individual act/regulation from alphabetical link
        for item in alpha_soup.find_all("a"):
          if "class" in item.attrs.keys():
            if item.attrs["class"] == ["TocTitle"]:
              if item.string != None and (str(item.string[2:-2]) not in doc_titles_set):
                doc_titles_set.add(str(item.string[2:-2]))
                doc_titles.append(str(item.string[2:-2]))
                doc_links.append(f"{landing_page}{item.get('href')}")
    except: 
      continue


In [46]:
doc_titles.append("The Constitution Acts 1867 to 1982")
doc_links.append("https://laws-lois.justice.gc.ca/eng/Const/index.html")

In [47]:
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [48]:
descriptions = []
metadatas = []

for i in range(25):
  name = doc_titles[i]
  link = doc_links[i]


  # take doc_title and generate description 
  model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
  parser = StrOutputParser()
  template = """
            You are a legal summarization assistant specialized in Canadian law. 
            Your task is to write a short, plain-language description of the following legal document. 
            The goal is to help ordinary Canadians understand what the document is about.

            Document Name: {doc_name}

            Instructions:
            - Write in clear, everyday English (no legal jargon).
            - Keep it concise: 2–4 sentences.
            - Focus on what the Act or Regulation does, its purpose, and who it applies to.
            - Do not include section numbers or citations.
            - Output only the summary.

            Summary:
            """
  
  prompt = ChatPromptTemplate.from_template(template)
  

  chain = prompt | model | parser
  description = chain.invoke({
      "doc_name": f"{doc_titles[i]}"
  })


  descriptions.append(description)
  metadata = {}

  metadata["doc_title"] = doc_titles[i]
  metadata["doc_link"] = doc_links[i]
  metadata["description"] = True

  metadatas.append(metadata)


print(descriptions)
print(metadatas)

['The Access to Information Act is a Canadian law that allows individuals to request access to government records. This Act aims to promote transparency and accountability within the government by giving citizens the right to access information held by federal institutions. It applies to all Canadian citizens, permanent residents, and anyone present in Canada.', 'The Accessible Canada Act is a law in Canada that aims to make sure that everyone, including people with disabilities, can fully participate in society. It sets out rules for organizations to follow to make their services and facilities more accessible. This Act applies to federal government organizations and industries under federal jurisdiction.', 'The Addition of Lands to Reserves and Reserve Creation Act is a law that allows Indigenous communities in Canada to expand their existing reserves or create new ones. This Act helps Indigenous groups secure more land for their communities, better meet their needs, and support thei

In [49]:
# creating pinecone index

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Delete old index
pc.delete_index("bettercallpaul-index")

# Create new index with correct dim
pc.create_index(
    name="bettercallpaul-index",
    dimension=3072,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [50]:
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# creating list of document objects
documents = []

for i in range(len(descriptions)):
  documents.append(Document(page_content=descriptions[i], metadata=metadatas[i]))


# creating embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# storing into vectorestore
index_name = "bettercallpaul-index"

vectorDB = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

In [95]:
# test if prompt is retreiving correct document name from vector store
doc, score = vectorDB.similarity_search_with_relevance_scores(
    "What is the Access to Information Act?"
)[1]

print(doc.page_content)   # NOT ["page_content"]
print(score)

Full Document:  Act current to 2025-10-28 and last amended on 2025-06-02. Previous VersionsR.S.C., 1985, c. A-1An Act to extend the present laws of Canada that provide access to information under the control of the Government of Canada and to provide for the proactive publication of certain informationShort TitleShort title1 This Act may be cited as the Access to Information Act.Purpose of ActPurpose of Act2 (1) The purpose of this Act is to enhance the accountability and transparency of federal institutions in order to promote an open and democratic society and to enable public debate on the conduct of those institutions.Specific purposes of Parts 1 and 2(2) In furtherance of that purpose,(a) Part 1 extends the present laws of Canada to provide a right of access to information in records under the control of a government institution in accordance with the principles that government information should be available to the public, that necessary exceptions to the right of access should b

In [53]:
print(doc_links[-1])

https://laws-lois.justice.gc.ca/eng/Const/index.html


In [54]:
for i in range(25):
  #   put each document into temp file
  name = doc_titles[i]  # metadata usage
  link = doc_links[i]  # metadata usage
  try:
    # if the page can be found, create soup
    scrapable_link = requests.get(f"{link[:-11]}/FullText.html")  # getting rid of /.index.html from the end and replacing with /FullText.html
    scrapable_link.encoding = scrapable_link.apparent_encoding  # utf-8 encoding
    scrapable_soup = BeautifulSoup(scrapable_link.text, "html.parser")


    # write scraped data into temporary file
    with open(f"data/scraped_content/temp_file.txt", "w", encoding="utf-8") as file:
        for item in scrapable_soup.find_all(["p", "h2", "h3", "h4"]):
          if (item.name == "p"):
            for descendant in item.descendants:
              if isinstance(descendant, NavigableString):
                if str(descendant) != "Marginal note:":
                  file.write(descendant)
          
          elif (item.name == "h2"):
            try:
              if "Part" in item["class"]:
                for descendant in item.descendants:
                  if isinstance(descendant, NavigableString):
                    if str(descendant) != "Marginal note:":
                      file.write(descendant)
            except:
              continue
          
          elif (item.name ==  "h3" or item.name == "h4"):
            try:
              if "Subheading" in item["class"]:
                for descendant in item.descendants:
                  if isinstance(descendant, NavigableString):
                    if str(descendant) != "Marginal note:":
                      file.write(descendant)
            except:
              continue
    

    #   chunk from the temp file

    from langchain_community.document_loaders.text import TextLoader

    loader = TextLoader("data/scraped_content/temp_file.txt",
                        encoding="utf-8",
                        autodetect_encoding=True)

    text_documents = loader.load()

    
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1400, chunk_overlap=525)
    chunks = text_splitter.split_documents(text_documents)

  
    # modify chunks list to insert proper metadata
    for j in range(len(chunks)):
      chunks[j].metadata = {
        "doc_title":name,
        "doc_link":link,
        "description": False
      }


    #   insert chunks into vector database along with metadata
    vectorDB = PineconeVectorStore.from_documents(
    chunks, embeddings, index_name=index_name
    )
  
  except Exception as e:
    print(f"Error on {name}: {e}")
    continue  # move onto next document

In [61]:
from pinecone import Pinecone
import os

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index(index_name)

ids = []

for page in index.list(prefix="", limit=100):
    for vec in page:    # vec is a string like "uuid"
        ids.append(vec)


print(ids[:20])

['002c9d59-aeee-4f3e-b067-4f096d8a6e1e', '003b9974-d582-4028-a152-6b51e10fd176', '005d7b3f-3466-47db-ab7b-911c1c7fa85b', '00bd8674-fd3a-4967-af16-3378dad9f0dd', '00c2d1ed-2fea-46a9-8885-f5cf1698655c', '00cd8eb4-0e1b-4601-a419-966b93ec5d2b', '0100769f-bc99-466f-a26b-284d521a063a', '012ca363-7f40-414d-80ec-467c0de824f9', '014df6ce-b688-44f4-a596-562424e55d41', '017b8124-1402-421b-ad1f-04fd570257d8', '01b0f9e8-5ba0-4ab1-bcac-f6fade4a6ef1', '01b623c2-21c8-4770-8774-de18fed56574', '01bdfdaf-54c5-4c68-b48c-40eece6c8f56', '01e3fd35-d7e8-45d2-8a40-efc13b7c3a8d', '02069a0c-64f8-48e8-ad9e-e137ae239783', '02765490-1d3c-411e-82ca-acd93ab210a2', '02c940c3-5a31-4a22-b9bd-d9b8b889c8ba', '0326aa5e-c2a9-4b3a-b560-3c3ba5265b68', '03788adb-3ef3-44e0-83b5-aed483536c08', '03cec05d-b088-41c2-8bf1-fcfbe2ef1b71']


In [96]:
def answer(query):
  # retrieving act name
  doc, score = vectorDB.similarity_search_with_relevance_scores(
      f"{query}"
  )[0]

  print(score)

  if score < 0.70:
      return "I’m not confident which law your question relates to."


  filter_data = doc.metadata["doc_title"]  # metadata based filter
  print(filter_data)

  # search for relevant chunks from documents
  relevant_chunks = vectorDB.similarity_search(
     query,
     k=10, 
     filter={"doc_title": f"{filter_data}"}
  )

  # creating langchain components

  model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
  parser = StrOutputParser()
  template = """
            You are a legal assistant specialized in Canadian federal legislation.

            You will be given:
            1. A user question about Canadian Acts or Regulations.
            2. A set of retrieved document excerpts (“context”).

            Your job:
            - Use ONLY the information found in the provided context to answer the question.
            - If the context does not contain enough information to answer confidently, say:
            “I cannot answer based on the retrieved documents.”
            - Do NOT invent facts, interpretations, penalties, definitions, or legal rules that are not explicitly stated in the context.
            - Keep the answer concise, neutral, and legally accurate.
            - If the user asks for advice, warnings, or interpretations beyond the text, respond with:
            “I can only provide information found in the retrieved documents.”

            Formatting rules:
            - Answer in 3–6 sentences unless the question requires more.
            - Cite relevant context sections using short quotes if helpful.

            Now answer the user question based on the context below.

            ---------------------
            CONTEXT:
            {context}
            ---------------------

            QUESTION:
            {question}

            ANSWER:
            """
  
  prompt = ChatPromptTemplate.from_template(template)

  chain = prompt | model | parser
  answer = chain.invoke({
      "question": f"{query}", 
      "context": "\n\n".join([chunk.page_content for chunk in relevant_chunks])
  })

#   print("\n\n".join([chunk.page_content for chunk in relevant_chunks]))
  return answer

In [97]:
query = "Under what circumstances can a federal institution refuse access to records on the grounds that they contain information obtained in confidence from another government??"

print(answer(query))

0.825123787
Access to Information Act
A federal institution can refuse access to records that contain information obtained in confidence from another government if the government, organization, or institution from which the information was obtained does not consent to the disclosure, or if the information has not been made public by them. This refusal is in accordance with the exemptions outlined in subsection 13(1) of the legislation.


In [68]:
print(doc_titles[0:25])

['Access to Information Act', 'Accessible Canada Act', 'Addition of Lands to Reserves and Reserve Creation Act', 'Administrative Tribunals Support Service of Canada Act', 'Aeronautics Act', 'Agreements and Conventions', 'Agricultural and Rural Development Act (ARDA)', 'Agricultural Marketing Programs Act', 'Agricultural Products Marketing Act', 'Agriculture and Agri-Food Administrative Monetary Penalties Act', 'Air Canada Public Participation Act', 'Air Travellers Security Charge Act', 'Airport Transfer (Miscellaneous Matters) Act', 'Alberta Natural Resources Act', 'Alternative Fuels Act', 'Animal Pedigree Act', 'Anishinabek Nation Education Agreement Act', 'Anishinabek Nation Governance Agreement Act', 'Annulment of Marriages Act (Ontario)', 'Antarctic Environmental Protection Act', 'Anti-Personnel Mines Convention Implementation Act', 'Anti-terrorism Act', 'Apprentice Loans Act', 'Appropriation Acts', 'Arctic Waters Pollution Prevention Act']


In [None]:
# check if query is returning proper answers

In [None]:
# use parallelization to repeat above process with more documents

In [149]:
with open("data/names.txt", "a") as file, open("data/links.txt", "a") as file1:
  for i in range(3):  # how many acts/regulations are being looped through
    name = doc_titles[i]
    link = doc_links[i]

    scrapable_link = ""
    try:
      # if the page can be found, create soup
      scrapable_link = requests.get(f"{link[:-11]}/FullText.html")  # getting rid of /.index.html from the end and replacing with /FullText.html
      scrapable_link.encoding = scrapable_link.apparent_encoding  # utf-8 encoding
      scrapable_soup = BeautifulSoup(scrapable_link.text, "html.parser")


      # take doc_title and generate description 
      model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
      parser = StrOutputParser()
      template = """
                You are a legal summarization assistant specialized in Canadian law. 
                Your task is to write a short, plain-language description of the following legal document. 
                The goal is to help ordinary Canadians understand what the document is about.

                Document Name: {doc_name}

                Instructions:
                - Write in clear, everyday English (no legal jargon).
                - Keep it concise: 2–4 sentences.
                - Focus on what the Act or Regulation does, its purpose, and who it applies to.
                - Do not include section numbers or citations.
                - Output only the summary.

                Summary:
                """
      
      prompt = ChatPromptTemplate.from_template(template)
      

      chain = prompt | model | parser
      description = chain.invoke({
          "doc_name": f"{doc_titles[i]}"
      })

      print(description)
      


      # combine with metadata and put it into vector store

      file.write(f"{doc_titles[i]}\n")  # append title to names.txt
      file1.write(f"{doc_links[i]}\n")  # append link to links.txt
    
      # write scraped data into associated file
      with open(f"data/scraped_content/{doc_titles[i].replace(' ', '_')}.txt", "a", encoding="utf-8") as file2:
        for item in scrapable_soup.find_all(["p", "h2", "h3", "h4"]):
          if (item.name == "p"):
            for descendant in item.descendants:
              if isinstance(descendant, NavigableString):
                if str(descendant) != "Marginal note:":
                  file2.write(descendant)
            file2.write("\n")
          
          elif (item.name == "h2"):
            try:
              if "Part" in item["class"]:
                for descendant in item.descendants:
                  if isinstance(descendant, NavigableString):
                    if str(descendant) != "Marginal note:":
                      file2.write(descendant)
                file2.write("\n")
            except:
              continue
          
          elif (item.name ==  "h3" or item.name == "h4"):
            try:
              if "Subheading" in item["class"]:
                for descendant in item.descendants:
                  if isinstance(descendant, NavigableString):
                    if str(descendant) != "Marginal note:":
                      file2.write(descendant)
                file2.write("\n")
            except:
              continue

    except:
      continue

The Access to Information Act is a Canadian law that allows individuals to request information from federal government institutions. This Act aims to promote transparency and accountability by giving Canadians the right to access government records and documents. It applies to all federal government departments, agencies, and Crown corporations.
The Accessible Canada Act is a law in Canada that aims to make the country more accessible to people with disabilities. It sets out requirements for organizations to identify, remove, and prevent barriers that could prevent people with disabilities from fully participating in society. The Act applies to federal government entities, federally regulated industries, and organizations under federal jurisdiction.
The Addition of Lands to Reserves and Reserve Creation Act is a law that allows Indigenous communities in Canada to expand their reserves by adding more land to them or creating new reserves altogether. This Act aims to support Indigenous s

In [109]:
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

In [111]:
model.invoke("What MLB team won the world series this year?")

AIMessage(content='The Atlanta Braves won the World Series in 2021.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 17, 'total_tokens': 29, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-c17a57e6-9e68-4023-ad74-8acfcc5bf28c-0', usage_metadata={'input_tokens': 17, 'output_tokens': 12, 'total_tokens': 29})