In [1]:
# importing
from bs4 import BeautifulSoup, SoupStrainer, NavigableString
import requests
import os
from dotenv import load_dotenv

# langchain
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

In [2]:
# Collect the title and index-page link of every act and regulation reachable
# from the two Justice Laws landing pages (acts and regulations).

landing_pages = ["https://laws-lois.justice.gc.ca/eng/acts/", "https://laws-lois.justice.gc.ca/eng/regulations/"]

# requests has no default timeout; without one a stalled connection hangs the cell
REQUEST_TIMEOUT = 30  # seconds

# titles seen so far, used for fast duplicate checks
doc_titles_set = set()

# parallel lists: doc_links[i] is the link that belongs to doc_titles[i]
doc_titles = []
doc_links = []

for landing_page in landing_pages:
  # creating a soup for each landing page
  page = requests.get(landing_page, timeout=REQUEST_TIMEOUT)
  page.raise_for_status()  # fail loudly instead of silently parsing an error page
  soup = BeautifulSoup(page.text, features="html.parser")

  # the alphabetical sub-pages are the anchors whose class list is exactly
  # ["btn", "btn-default"]
  for alpha_anchor in soup.find_all("a"):
    if alpha_anchor.get("class") != ["btn", "btn-default"]:
      continue
    alpha_href = alpha_anchor.get("href")
    if alpha_href is None:
      continue
    alpha_link = f"{landing_page}{alpha_href}"

    # soup alphabetical link; a failing sub-page is reported and skipped so
    # the remaining pages are still collected
    try:
      alpha_page = requests.get(alpha_link, timeout=REQUEST_TIMEOUT)
      alpha_page.raise_for_status()
    except requests.RequestException as exc:
      print(f"Skipping {alpha_link}: {exc}")
      continue
    alpha_soup = BeautifulSoup(alpha_page.text, features="html.parser")

    # each individual act/regulation is an anchor with class ["TocTitle"]
    for doc_anchor in alpha_soup.find_all("a"):
      if doc_anchor.get("class") != ["TocTitle"]:
        continue
      # .string is None when the anchor holds more than one child node
      if doc_anchor.string is None:
        continue
      doc_href = doc_anchor.get("href")
      if doc_href is None:
        continue

      # drop the first two and last two characters of the anchor text
      # (presumably padding around the title -- confirm against the page markup)
      title = str(doc_anchor.string[2:-2])
      if title in doc_titles_set:
        continue

      doc_titles_set.add(title)
      doc_titles.append(title)
      doc_links.append(f"{landing_page}{doc_href}")


In [3]:
# Add the Constitution Acts entry by hand.
CONSTITUTION_TITLE = "The Constitution Acts 1867 to 1982"
CONSTITUTION_LINK = "https://laws-lois.justice.gc.ca/eng/Const/index.html"

# Guard on the link so re-running this cell does not append a duplicate entry.
if CONSTITUTION_LINK not in doc_links:
  doc_titles.append(CONSTITUTION_TITLE)
  doc_links.append(CONSTITUTION_LINK)

In [4]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it (None when the variable is not set).
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
# Generate a short plain-language description for the first N_DESCRIPTIONS
# documents and build one metadata record per description.

N_DESCRIPTIONS = 25  # how many documents to describe in this run

template = """
            You are a legal summarization assistant specialized in Canadian law. 
            Your task is to write a short, plain-language description of the following legal document. 
            The goal is to help ordinary Canadians understand what the document is about.

            Document Name: {doc_name}

            Instructions:
            - Write in clear, everyday English (no legal jargon).
            - Keep it concise: 2–4 sentences.
            - Focus on what the Act or Regulation does, its purpose, and who it applies to.
            - Do not include section numbers or citations.
            - Output only the summary.

            Summary:
            """

# The model, parser and prompt are the same for every document, so the chain
# is built once here rather than on every loop iteration.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
parser = StrOutputParser()
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model | parser

# parallel lists: metadatas[i] describes descriptions[i]
descriptions = []
metadatas = []

# Slicing (instead of indexing with range(25)) avoids an IndexError when fewer
# than N_DESCRIPTIONS documents were scraped.
for doc_title, doc_link in zip(doc_titles[:N_DESCRIPTIONS], doc_links[:N_DESCRIPTIONS]):
  # the description is generated from the document title only
  description = chain.invoke({"doc_name": f"{doc_title}"})
  descriptions.append(description)

  metadatas.append({
      "doc_title": doc_title,
      "doc_link": doc_link,
      # presumably marks the vector as a description rather than a text chunk -- confirm
      "description": True,
  })


print(descriptions)
print(metadatas)

['The Access to Information Act is a Canadian law that allows individuals to request information from federal government institutions. This Act aims to promote transparency and accountability by giving Canadians the right to access government records. It applies to everyone who wants to obtain information held by federal government departments and agencies.', 'The Accessible Canada Act is a law in Canada that aims to make sure that everyone, including people with disabilities, can fully participate in society. It requires organizations under federal jurisdiction to identify, remove, and prevent barriers that might prevent people with disabilities from accessing goods, services, employment, and facilities. This Act applies to federal government departments, agencies, and organizations, as well as federally regulated industries like banking, telecommunications, and transportation.', 'The Addition of Lands to Reserves and Reserve Creation Act is a law that allows Indigenous communities in

In [None]:
# (Re)creating the pinecone index that stores the description vectors

from pinecone import Pinecone, ServerlessSpec

INDEX_NAME = "bettercallpaul-index"

# os.environ[...] raises KeyError right away when the key is missing
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Delete the old index only when it exists: delete_index fails on a missing
# index, which would break this cell on a fresh project or a first run.
if INDEX_NAME in pc.list_indexes().names():
  pc.delete_index(INDEX_NAME)

# Create new index with correct dim: the dimension must equal the size of the
# vectors written to it (3072 for the text-embedding-3-large embeddings).
pc.create_index(
    name=INDEX_NAME,
    dimension=3072,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [22]:
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Pair each generated description with the metadata record at the same
# position and wrap the pair in a Document.
documents = [
  Document(page_content=text, metadata=metadatas[position])
  for position, text in enumerate(descriptions)
]

# embedding model used to vectorise the documents
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# embed the documents and store them in the vector store
index_name = "bettercallpaul-index"

vectorDB = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)

In [26]:
# Sanity check: does a plain-language question retrieve the matching document
# from the vector store? Only the first three hits are shown.
query = "What should i do if i want to ask the federal government questions regarding their secret documents?"
vectorDB.similarity_search(query)[:3]

[Document(metadata={'description': True, 'doc_link': 'https://laws-lois.justice.gc.ca/eng/acts/A-1/index.html', 'doc_title': 'Access to Information Act'}, page_content='The Access to Information Act is a Canadian law that allows individuals to request information from federal government institutions. This Act aims to promote transparency and accountability by giving Canadians the right to access government records. It applies to everyone who wants to obtain information held by federal government departments and agencies.'),
 Document(metadata={'description': True, 'doc_link': 'https://laws-lois.justice.gc.ca/eng/acts/A-1.5/index.html', 'doc_title': 'Administrative Tribunals Support Service of Canada Act'}, page_content='The Administrative Tribunals Support Service of Canada Act establishes a service that provides support to various federal administrative tribunals. This Act aims to improve the efficiency and effectiveness of these tribunals by offering centralized administrative servic

In [None]:
# for i in range(25):
#   put each document into temp file
#   chunk from the temp file
#   insert chunks into vector database along with metadata


In [None]:
# use parallelization to repeat above process with more documents

In [149]:
# Download the full text of the first N_DOCS_TO_SCRAPE documents, generate a
# description for each, and write the scraped text to one file per document.

N_DOCS_TO_SCRAPE = 3  # how many acts/regulations are being looped through
REQUEST_TIMEOUT = 30  # seconds; requests has no default timeout
SCRAPED_DIR = "data/scraped_content"
INDEX_SUFFIX = "/index.html"  # ending of every scraped link, swapped for /FullText.html

template = """
                You are a legal summarization assistant specialized in Canadian law. 
                Your task is to write a short, plain-language description of the following legal document. 
                The goal is to help ordinary Canadians understand what the document is about.

                Document Name: {doc_name}

                Instructions:
                - Write in clear, everyday English (no legal jargon).
                - Keep it concise: 2–4 sentences.
                - Focus on what the Act or Regulation does, its purpose, and who it applies to.
                - Do not include section numbers or citations.
                - Output only the summary.

                Summary:
                """

# The chain does not depend on the document, so it is built once up front.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
parser = StrOutputParser()
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model | parser


def _write_text(element, out_file):
  """Write every text node under `element` to `out_file`, followed by a newline.

  Text nodes that are exactly "Marginal note:" are left out.
  """
  for descendant in element.descendants:
    if isinstance(descendant, NavigableString) and str(descendant) != "Marginal note:":
      out_file.write(descendant)
  out_file.write("\n")


def _is_wanted(element):
  """Return True for the elements whose text is kept.

  Kept: every <p>, <h2> with class "Part", and <h3>/<h4> with class
  "Subheading". Elements without a class attribute count as having no classes.
  """
  if element.name == "p":
    return True
  classes = element.get("class") or []
  if element.name == "h2":
    return "Part" in classes
  if element.name in ("h3", "h4"):
    return "Subheading" in classes
  return False


# make sure the output directory exists before any file is opened
os.makedirs(SCRAPED_DIR, exist_ok=True)

with open("data/names.txt", "a", encoding="utf-8") as names_file, open("data/links.txt", "a", encoding="utf-8") as links_file:
  for title, link in zip(doc_titles[:N_DOCS_TO_SCRAPE], doc_links[:N_DOCS_TO_SCRAPE]):
    if not link.endswith(INDEX_SUFFIX):
      print(f"Skipping {title!r}: unexpected link format {link}")
      continue

    try:
      # getting rid of /index.html from the end and replacing with /FullText.html
      full_text_url = f"{link[:-len(INDEX_SUFFIX)]}/FullText.html"
      response = requests.get(full_text_url, timeout=REQUEST_TIMEOUT)
      # requests does not raise on 404/500 by itself; without this an error
      # page would be scraped as if it were the document
      response.raise_for_status()
      # decode with the encoding detected from the body
      response.encoding = response.apparent_encoding
      full_text_soup = BeautifulSoup(response.text, "html.parser")

      # take doc_title and generate description
      description = chain.invoke({"doc_name": f"{title}"})
      print(description)

      # TODO: combine with metadata and put it into vector store

      # write scraped data into associated file; path separators in a title
      # would otherwise be read as directories
      file_stem = title.replace(" ", "_").replace("/", "-").replace("\\", "-")
      # "w" so re-running the cell rewrites the file instead of appending a
      # second copy of the same text
      with open(f"{SCRAPED_DIR}/{file_stem}.txt", "w", encoding="utf-8") as content_file:
        for element in full_text_soup.find_all(["p", "h2", "h3", "h4"]):
          if _is_wanted(element):
            _write_text(element, content_file)

      # record the document only after its content was written, so names.txt
      # and links.txt never list a document that has no content file
      names_file.write(f"{title}\n")  # append title to names.txt
      links_file.write(f"{link}\n")  # append link to links.txt

    except Exception as exc:
      # best effort per document: report the failure and move on
      print(f"Skipping {title!r}: {exc}")
      continue

The Access to Information Act is a Canadian law that allows individuals to request information from federal government institutions. This Act aims to promote transparency and accountability by giving Canadians the right to access government records and documents. It applies to all federal government departments, agencies, and Crown corporations.
The Accessible Canada Act is a law in Canada that aims to make the country more accessible to people with disabilities. It sets out requirements for organizations to identify, remove, and prevent barriers that could prevent people with disabilities from fully participating in society. The Act applies to federal government entities, federally regulated industries, and organizations under federal jurisdiction.
The Addition of Lands to Reserves and Reserve Creation Act is a law that allows Indigenous communities in Canada to expand their reserves by adding more land to them or creating new reserves altogether. This Act aims to support Indigenous s

In [109]:
from langchain_openai.chat_models import ChatOpenAI

# Stand-alone chat model instance for quick manual checks
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [111]:
# Ad-hoc smoke test: send one question straight to the chat model
question = "What MLB team won the world series this year?"
model.invoke(question)

AIMessage(content='The Atlanta Braves won the World Series in 2021.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 17, 'total_tokens': 29, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-c17a57e6-9e68-4023-ad74-8acfcc5bf28c-0', usage_metadata={'input_tokens': 17, 'output_tokens': 12, 'total_tokens': 29})