In [None]:
# RUN
# importing
from bs4 import BeautifulSoup, SoupStrainer, NavigableString
import requests
import os
from dotenv import load_dotenv

# langchain
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

In [None]:
# RUN
# Pull the variables declared in a local .env file into the process
# environment, then read the OpenAI key from there.
# OPENAI_API_KEY ends up as None when the variable is not set.

load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# DO NOT RUN
# (re)creating the pinecone index
# WARNING: destructive -- an existing index with this name is deleted,
# together with every vector stored in it, before the new one is created.

from pinecone import Pinecone, ServerlessSpec

# the Pinecone key is mandatory here: os.environ[...] raises KeyError when it is missing
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

pinecone_index_name = "bettercallpaul-index"

# Delete the old index only when it actually exists. Deleting an unknown index
# raises, which would abort this cell on a first run before the index is created.
if pinecone_index_name in pc.list_indexes().names():
    pc.delete_index(pinecone_index_name)

# Create new index with correct dim.
# The dimension has to equal the vector size produced by the embedding model
# used for insertion later in this notebook (text-embedding-3-large).
pc.create_index(
    name=pinecone_index_name,
    dimension=3072,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [None]:
# RUN
# creating scrapable objects from 2 landing pages (acts and regulations)
# retrieving all document pages -->  titles and links

landing_pages = ["https://laws-lois.justice.gc.ca/eng/acts/", "https://laws-lois.justice.gc.ca/eng/regulations/"]

# seconds to wait for a single HTTP response before giving up on that page
REQUEST_TIMEOUT = 30

# titles collected so far; only used for fast duplicate detection
doc_titles_set = set()

# parallel lists: doc_links[i] is the page that belongs to doc_titles[i]
doc_titles = []
doc_links = []


def _fetch_soup(url):
  """Download `url` and return its HTML parsed with BeautifulSoup.

  Raises:
    requests.RequestException: on connection problems, timeouts or an
      HTTP error status (so error pages are never parsed as content).
  """
  response = requests.get(url, timeout=REQUEST_TIMEOUT)
  response.raise_for_status()
  return BeautifulSoup(response.text, features="html.parser")


for landing_page in landing_pages:
  # creating a soup for each landing page
  try:
    landing_soup = _fetch_soup(landing_page)
  except requests.RequestException as e:
    print(f"Could not load landing page {landing_page}: {e}")
    continue

  # walking over all links of the alphabetical act/regulation pages
  for alpha_button in landing_soup.find_all("a"):
    # only anchors whose class list is exactly ["btn", "btn-default"] are followed;
    # .get() returns None for anchors without a class instead of raising
    if alpha_button.get("class") != ["btn", "btn-default"]:
      continue

    alpha_href = alpha_button.get("href")
    if not alpha_href:
      continue  # nothing to follow

    # hrefs are appended to the landing page URL as-is
    alpha_link = f"{landing_page}{alpha_href}"

    # soup alphabetical link; a failing page is reported and skipped
    # without affecting the remaining alphabetical pages
    try:
      alpha_soup = _fetch_soup(alpha_link)
    except requests.RequestException as e:
      print(f"Could not load alphabetical page {alpha_link}: {e}")
      continue

    # each individual act/regulation on the alphabetical page
    for doc_anchor in alpha_soup.find_all("a"):
      if doc_anchor.get("class") != ["TocTitle"]:
        continue

      doc_href = doc_anchor.get("href")
      # .string is None when the anchor does not hold exactly one text node
      if doc_anchor.string is None or not doc_href:
        continue

      # drops the first and last two characters of the anchor text
      # (presumably padding around the title -- TODO confirm against the live markup)
      title = str(doc_anchor.string[2:-2])
      if title in doc_titles_set:
        continue  # keep only the first occurrence of a title

      doc_titles_set.add(title)
      doc_titles.append(title)
      doc_links.append(f"{landing_page}{doc_href}")


In [None]:
# RUN
# adding constitution document to titles and links
# (guarded on the link so that re-running this cell does not append
#  a duplicate entry to the two parallel lists)
constitution_link = "https://laws-lois.justice.gc.ca/eng/Const/index.html"

if constitution_link not in doc_links:
  doc_titles.append("The Constitution Acts 1867 to 1982")
  doc_links.append(constitution_link)

In [None]:
# NO EFFECT ON THE VDB IF RUN, BUT NO POINT IN RUNNING
# ONLY RUN TO GENERATE DESCRIPTIONS FOR NEW DOCUMENTS
# creating descriptions for each law document and pairing it with metadata to be inserted in vector database

# upper bound on how many documents get a description per run
# (every document costs one request to the chat model)
MAX_DESCRIBED_DOCS = 25

# parallel lists: metadatas[i] belongs to descriptions[i]
descriptions = []
metadatas = []

# The model, parser and prompt do not depend on the document,
# so the chain is built once instead of once per loop iteration.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
parser = StrOutputParser()
template = """
            You are a legal summarization assistant specialized in Canadian law. 
            Your task is to write a short, plain-language description of the following legal document. 
            The goal is to help ordinary Canadians understand what the document is about.

            Document Name: {doc_name}

            Instructions:
            - Write in clear, everyday English (no legal jargon).
            - Keep it concise: 2–4 sentences.
            - Focus on what the Act or Regulation does, its purpose, and who it applies to.
            - Do not include section numbers or citations.
            - Output only the summary.

            Summary:
            """

prompt = ChatPromptTemplate.from_template(template)

# prompt -> chat model -> plain string
chain = prompt | model | parser

# Slicing (instead of indexing with a fixed range) keeps the cell working
# when fewer than MAX_DESCRIBED_DOCS documents were scraped.
for name, link in zip(doc_titles[:MAX_DESCRIBED_DOCS], doc_links[:MAX_DESCRIBED_DOCS]):
  # the description is generated from the document title alone
  description = chain.invoke({
      "doc_name": name
  })

  descriptions.append(description)

  # "description": True marks this vector as a generated summary,
  # as opposed to a chunk of the document text itself
  metadatas.append({
      "doc_title": name,
      "doc_link": link,
      "description": True,
  })

['The Access to Information Act is a Canadian law that allows individuals to request access to government records. This Act aims to promote transparency and accountability within the government by giving citizens the right to access information held by federal institutions. It applies to all Canadian citizens, permanent residents, and anyone present in Canada.', 'The Accessible Canada Act is a law in Canada that aims to make sure that everyone, including people with disabilities, can fully participate in society. It sets out rules for organizations to follow to make their services and facilities more accessible. This Act applies to federal government organizations and industries under federal jurisdiction.', 'The Addition of Lands to Reserves and Reserve Creation Act is a law that allows Indigenous communities in Canada to expand their existing reserves or create new ones. This Act helps Indigenous groups secure more land for their communities, better meet their needs, and support thei

In [None]:
# DO NOT RUN

# creating documents objects for descriptions by combining with metadata
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# One Document per generated description; the metadata dict stored at the
# same position in `metadatas` is attached to it. These Document objects
# are what gets inserted into the vector database.
documents = [
  Document(page_content=summary_text, metadata=metadatas[position])
  for position, summary_text in enumerate(descriptions)
]


# insertion into vdb

# embedding model that turns each document into a vector
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# name of the index the vectors are stored in
index_name = "bettercallpaul-index"

# embed all documents and store them in the index
vectorDB = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)

In [None]:
# DO NOT RUN
# chunk and insert files into vdb alongside metadata
# NOTE: uses `embeddings`, `index_name` and `PineconeVectorStore`, which are
# not defined in this cell -- they must already exist in the kernel.

from langchain_community.document_loaders.text import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# upper bound on how many documents are scraped and inserted per run
MAX_CHUNKED_DOCS = 25

# scratch file: scraped text is written here, then read back by TextLoader
TEMP_FILE_PATH = "data/scraped_content/temp_file.txt"

# seconds to wait for a single HTTP response before giving up on that document
SCRAPE_TIMEOUT = 30

# heading tags are only kept when their class list contains this value
HEADING_CLASS_BY_TAG = {"h2": "Part", "h3": "Subheading", "h4": "Subheading"}

# the splitter settings are the same for every document, so build it once
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1400, chunk_overlap=525)


def _is_wanted(tag):
  """Return True for every <p>, and for headings carrying the class configured for their tag."""
  if tag.name == "p":
    return True
  wanted_class = HEADING_CLASS_BY_TAG.get(tag.name)
  # tag.get("class") is None when the tag has no class attribute
  return wanted_class is not None and wanted_class in (tag.get("class") or [])


def _write_text_nodes(tag, out_file):
  """Write every text node below `tag` to `out_file`, skipping "Marginal note:" labels."""
  for descendant in tag.descendants:
    if isinstance(descendant, NavigableString) and str(descendant) != "Marginal note:":
      out_file.write(descendant)


# make sure the scratch directory exists, otherwise open() fails for every document
os.makedirs(os.path.dirname(TEMP_FILE_PATH), exist_ok=True)

# Slicing (instead of indexing with a fixed range) keeps the cell working
# when fewer than MAX_CHUNKED_DOCS documents were scraped.
for name, link in zip(doc_titles[:MAX_CHUNKED_DOCS], doc_links[:MAX_CHUNKED_DOCS]):
  # name and link are stored as metadata on every chunk of this document
  try:
    # replace the last path segment (".../index.html") with "FullText.html"
    document_base = link.rsplit("/", 1)[0]
    scrapable_link = requests.get(f"{document_base}/FullText.html", timeout=SCRAPE_TIMEOUT)
    # an HTTP error status raises here, so error pages are never embedded
    scrapable_link.raise_for_status()
    # decode with the encoding detected from the body itself
    scrapable_link.encoding = scrapable_link.apparent_encoding
    scrapable_soup = BeautifulSoup(scrapable_link.text, "html.parser")

    # write scraped data into temporary file (overwritten for every document)
    with open(TEMP_FILE_PATH, "w", encoding="utf-8") as file:
      for item in scrapable_soup.find_all(["p", "h2", "h3", "h4"]):
        if _is_wanted(item):
          _write_text_nodes(item, file)

    # chunk from the temp file
    loader = TextLoader(TEMP_FILE_PATH,
                        encoding="utf-8",
                        autodetect_encoding=True)
    text_documents = loader.load()
    chunks = text_splitter.split_documents(text_documents)

    # replace the loader's metadata with the document's own metadata;
    # "description": False marks these vectors as document text, not a summary
    for chunk in chunks:
      chunk.metadata = {
        "doc_title":name,
        "doc_link":link,
        "description": False
      }

    # insert chunks into vector database along with metadata
    vectorDB = PineconeVectorStore.from_documents(
      chunks, embeddings, index_name=index_name
    )

  except Exception as e:
    # best effort: report the failing document and move on to the next one
    print(f"Error on {name}: {e}")
    continue