In [1]:
#import langchain_aws as aws
#model = aws.ChatBedrock(model_id="anthropic.claude-3-haiku-20240307-v1:0")
#answer = model.invoke("How are you ? ")
#print(answer.content)

In [None]:
import boto3
import numpy as np
from langchain_community.vectorstores import Chroma
import shutil
import sys
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.bedrock import BedrockEmbeddings

import warnings
warnings.filterwarnings("ignore")

In [3]:
DATABASE_PATH = os.environ.get("DATABASE_PATH", "data/database")
DATA_PATH = os.environ.get("DATABASE_PATH", "data/docs")

In [14]:
# Embedding function using Bedrock embeddings
def get_embedding_function():
    embeddings = BedrockEmbeddings()
    return embeddings

# Function to split documents into chunks
def split_documents(documents: list[Document]):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=80,
        length_function=len
    )
    return splitter.split_documents(documents)

# Function to calculate unique chunk IDs for documents
def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the chunk metadata
        chunk.metadata["id"] = chunk_id

    return chunks

# Function to generate the database from the documents
def generate_database():
    # Load the documents
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    
    # Split the documents into chunks
    documents = split_documents(documents)
    
    # Initialize the Chroma database
    database = Chroma(
        persist_directory=DATABASE_PATH,
        embedding_function=get_embedding_function()
    )
    
    # Calculate IDs for chunks
    chunks_with_ids = calculate_chunk_ids(documents)
    existing_items = database.get(include=[])  # Retrieve existing document IDs
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Add only new documents to the database
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if new_chunks:
        print(f"Adding new documents to the database: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        database.add_documents(new_chunks, ids=new_chunk_ids)
        database.persist()
    else:
        print("All the documents are added")
    
    return documents

# Run the function
generate_database()

Number of existing documents in DB: 353
All the documents are added


[Document(metadata={'source': 'data/docs/2018-rapport-annuel-bce.pdf', 'page': 0, 'id': 'data/docs/2018-rapport-annuel-bce.pdf:0:0'}, page_content='Aujourd’hui avec /  \n  la vie est Bell.\n la ﬁbre jusqu’au domicile / Crave / \nl’Internet rural / le service à la clientèle / Alt Télé / \nles réseaux sans ﬁl / les solutions de collaboration /  \nles villes intelligentes / la vidéo en continu / le hockey / \nle service Wi-Fi Partout chez vous / Bell Télé Fibe / \nle basketball / la diversité et l’inclusion / les réseaux \nvirtuels / la portée publicitaire / les services d’itinérance /  \nla plus vaste conversation sur la santé mentale / l’IdO / \nles voitures connectées / les services gérés / le football / \nla R et D / la programmation locale / le libre-service / \nl’amélioration du réseau au Manitoba / la radio en continu /  \nles services sans ﬁl prépayés / la sécurité d’entreprise / \nl’efﬁcacité en affaires / la sécurité publique / le soccer / \nles actualités, les sports et le dive

In [13]:
from botocore.exceptions import ClientError
from langchain.prompts import ChatPromptTemplate
# Embedding function using Bedrock embeddings
def get_embedding_function():
    return BedrockEmbeddings()

PROMPT_TEMPLATE = """
                    <meta>
                    current year: 2024
                    role: financial analyst
                    language: english
                    expertise: finance, annual reports, financial performance
                    tone: professional, concise, objective
                    response style: factual, summary-focused
                    </meta>

                    You are a financial analyst. Answer the following question in English, using only the information from the provided context.

                    Question: {question}

                    Context:  {context}

                    Answer in a clear and concise manner, focusing on key financial insights.
                    """

# Initialize the Chroma database with the embedding function instance
embedding_function_instance = get_embedding_function()
db = Chroma(persist_directory=DATABASE_PATH, embedding_function=embedding_function_instance)

# Query and generate context for the prompt
query_text = "Combien Bell Canada a tiré un produit brut totalisant à la suite de l'émission de débentures à moyen terme (MTN) de 7 ans et de 10 ans ?"


results = db.similarity_search_with_score(query_text, k=5)

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)

# Prepare the conversation payload for Bedrock
conversation = [
    {
        "role": "user",
        "content": [{"text": prompt}]  # Wrap the prompt in a list of dictionaries
    }
]

# Set up the model client
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
client = boto3.client("bedrock-runtime", region_name="us-west-2")

try:
    # Send the message to the model, using a basic inference configuration
    response = client.converse(
                modelId=model_id,
                messages=conversation,
                inferenceConfig={"maxTokens": 4000, "temperature": 1},
                additionalModelRequestFields={"top_k": 250, "top_p": 1},
    )

    # Extract and print the response text
    response_text = response["output"]["message"]["content"][0]["text"]
    #print(response_text)

    # Format the sources and print
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

except (ClientError, Exception) as e:
    print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
    exit(1)

Response: Selon les informations fournies dans le rapport annuel 2019 de BCE Inc., Bell Canada a émis avec succès des débentures à moyen terme (MTN) d'une durée de 7 ans et de 10 ans, totalisant un produit brut de 1,15 milliard de dollars canadiens.

Spécifiquement:

- Le 10 septembre 2019, Bell Canada a émis des MTN, série M-50, à 2,90%, d'un capital de 550 millions de dollars, échéant le 10 septembre 2029. 

- Le 13 mai 2019, Bell Canada a émis des MTN, série M-49, à 2,75%, d'un capital de 600 millions de dollars, échéant le 29 janvier 2025.

Donc, le produit brut total tiré de l'émission de ces deux séries de débentures MTN de 7 ans et 10 ans s'élève à 1,15 milliard de dollars canadiens.
Sources: ['data/docs/2019-rapport-annuel-bce.pdf:146:0', 'data/docs/2018-rapport-annuel-bce.pdf:157:0', 'data/docs/2019-rapport-annuel-bce.pdf:36:0', 'data/docs/2018-rapport-annuel-bce.pdf:155:0', 'data/docs/2018-rapport-annuel-bce.pdf:38:0']
