In [1]:
import boto3
import numpy as np
from langchain_community.vectorstores import Chroma
import shutil
import sys
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import Chroma

from botocore.exceptions import ClientError
from langchain.prompts import ChatPromptTemplate

import json

import psycopg2
import warnings
import pickle

from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

In [2]:
DATABASE_PATH = os.environ.get("DATABASE_PATH", "../data/database")
DATA_PATH = os.environ.get("DATABASE_PATH", "../data/docs")

In [3]:
import glob
import os

# Path to the directory
directory_path = '../data/docs'

# Find all PDF files recursively
pdf_files = glob.glob(os.path.join(directory_path, '**', '*.pdf'), recursive=True)

In [21]:
def cosine_similarity(a, b):
    """
    Compute the cosine similarity between two vectors.
    
    Parameters:
    a (numpy.ndarray): First vector.
    b (numpy.ndarray): Second vector.
    
    Returns:
    float: Cosine similarity between the two vectors.
    """
    # Compute the dot product of the two vectors
    dot_product = np.dot(a, b)
    
    # Compute the norms (magnitudes) of the vectors
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    # Avoid division by zero
    if norm_a == 0 or norm_b == 0:
        return 0.0
    
    # Compute cosine similarity
    return dot_product / (norm_a * norm_b)



In [77]:
def query_llm(conversation, client, model_id, max_tokens):
    try:
        # Send the message to the model, using a basic inference configuration
        response = client.converse(
                    modelId=model_id,
                    messages=conversation,
                    inferenceConfig={"maxTokens": max_tokens, "temperature": 1},
                    additionalModelRequestFields={"top_k": 250, "top_p": 1},
        )

        # Extract and print the response text
        return response["output"]["message"]["content"][0]["text"]
        #print(response_text)

    except (ClientError, Exception) as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        exit(1)

def extract_name_year(splits, client, model_id, n_page = 3):

    doc_identifier = [*splits[:n_page], *splits[-n_page:]]
    doc_identifier = "\n\n---\n\n".join([doc.page_content for doc in doc_identifier])

    prompt_text = f"""

    Question : What is the name of the company, what financial year this report is about?

    Output : only json string format with two keys (company, year)

    Error : if you could not find anything the values in json should be empty strings

    Context : {doc_identifier}

    """

    conversation = [
            {
                "role": "user",
                "content": [{"text": prompt_text}]  # Wrap the prompt in a list of dictionaries
            }
        ]

    response = query_llm(conversation, client, model_id)
    return response

def add_name_year_tags(split_documents, info_dict):
    embedder = BedrockEmbeddings()
    modified_docs = []
    for doc in tqdm(split_documents):
        if info_dict['company'] and info_dict['year']:
            modified_content = f"""
                <company> {info_dict['company']} <company>
                <year> {info_dict['company']} <year>

                {doc.page_content}

                <company> {info_dict['company']} <company>
                <year> {info_dict['company']} <year>
                """
        else:
            modified_content = doc.page_content

        modified_docs.append(dict(company = info_dict['company'], year = info_dict['year'], embedding = embedder.embed_query(modified_content), content = modified_content))

    return modified_docs

In [45]:
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
client = boto3.client("bedrock-runtime", region_name="us-west-2")
embedder = BedrockEmbeddings()

In [17]:
pdf_path = pdf_files[5]

loader = PyPDFLoader(pdf_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=5000)
split_documents = text_splitter.split_documents(documents)

info_dict = extract_name_year(split_documents, client, model_id, n_page = 3)

info_dict = json.loads(info_dict)

docs = add_name_year_tags(split_documents, info_dict)

  0%|          | 0/218 [00:00<?, ?it/s]

In [86]:
with open("../data/database/db.pkl", "rb") as file:
    docs = pickle.load(file)

set([(d['company'], d['year']) for d in docs])

{('ALTAGAS', '2022'),
 ('Alimentation Couche-Tard', '2022'),
 ('Alimentation Couche-Tard Inc.', '2018'),
 ('Alimentation Couche-Tard Inc.', '2019'),
 ('Alimentation Couche-Tard Inc.', '2024'),
 ('Alimentation Couche-Tard inc.', '2020'),
 ('Alimentation Couche-Tard inc.', '2021'),
 ('Alimentation Couche-Tard inc.', '2023'),
 ('AltaGas Ltd.', '2020'),
 ('AltaGas Ltd.', '2021'),
 ('AltaGas Ltd.', '2023'),
 ('BCE Inc.', '2018'),
 ('BCE Inc.', '2019'),
 ('BCE Inc.', '2020'),
 ('BCE Inc.', '2021'),
 ('BCE Inc.', '2022'),
 ('BCE Inc.', '2023'),
 ('COGECO COMMUNICATIONS INC.', '2020'),
 ('CP', '2020'),
 ('Canadian Pacific', '2018'),
 ('Canadian Pacific', '2021'),
 ('Canadian Pacific', '2022'),
 ('Canadian Pacific Kansas City Limited', '2023'),
 ('Canadian Pacific Railway Limited', '2019'),
 ('CogeCo CommuniCations inC', '2018'),
 ('Cogeco Communications Inc.', '2019'),
 ('Cogeco Communications Inc.', '2021'),
 ('Cogeco Communications Inc.', '2022'),
 ('Cogeco Communications inc.', '2023'),
 ('

In [91]:
documents = [d for d in docs if d['company'] == 'Fortis Inc.' and d['year'] == '2021']
len(documents)

89

In [96]:
query_chunk = """
basic financial indicators

Turnover, gross margin, free cash flow, net debt, profit (before interest, taxes, depreciation and amortization), earnings per share

"""

embedding_query_chunk = embedder.embed_query(query_chunk)

sim = [cosine_similarity(embedding_query_chunk, d['embedding']) for d in documents]

relevant_docs = "\n\n ---- \n\n".join([documents[i]['content'] for i in np.argsort(sim)[-50:][::-1]])


prompt_text = f"""

    RESPONSE LANGUAGE : please response in english

    TASK : retrieve basic financial indicators

    VALUES : Turnover, gross margin, free cash flow, net debt, profit (before interest, taxes, depreciation and amortization), earnings per share

    OUTPUT FORMAT : put everything in a table. how the values has changed comparing to previous years
    
    indicate the unit of the values and put NA for values that are not available

    Context : {relevant_docs}
    """

conversation = [{
            "role": "user",
            "content": [{"text": prompt_text}]  # Wrap the prompt in a list of dictionaries
    }]

response_llm = query_llm(conversation, client, model_id, max_tokens = 4096)

In [97]:
print(response_llm)

Here is the table with the financial indicators you requested, along with how the values have changed compared to previous years and the unit of the values. I have indicated NA for values that are not available.

Financial Indicator	2021	Variance compared to previous year
Turnover	$9,448 million	Increased by $513 million compared to 2020
Gross margin	NA	NA
Free cash flow	$2,907 million	Increased by $206 million compared to 2020
Net debt	$25,784 million	Increased by $1,203 million compared to 2020
Profit (before interest, taxes, depreciation and amortization)	$2,469 million	Decreased by $39 million compared to 2020
Earnings per share	$2.61	Increased by $0.01 compared to 2020


In [94]:


# Paths and settings
# pdf_path = "../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf"
# database_path = "../data/database/db.sqlite3"

# Load and split the document
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_documents = text_splitter.split_documents(documents)

# Initialize Bedrock embeddings
embedding_model = BedrockEmbeddings()

# Initialize Chroma database
database = Chroma(
    persist_directory=database_path,
    embedding_function=BedrockEmbeddings()
)

# Process each document chunk
for doc in split_documents:
    # Generate the embedding for the chunk's content
    doc_embedding = embedding_model.embed_query(doc.page_content)
    
    # Update the document metadata with the embedding if needed
    doc.metadata["embedding"] = doc_embedding

    # Add document with embedding to the database
    database.add_documents([doc])

# Persist the database
database.persist()

print("Document chunks and embeddings stored successfully.")


ValueError: Expected metadata value to be a str, int, float or bool, got [0.86328125, 0.310546875, -0.0947265625, -0.00872802734375, 0.7890625, 0.390625, -0.03076171875, -0.0004730224609375, -0.001983642578125, -1.0703125, -0.2294921875, 0.0306396484375, -0.40625, 0.45703125, -0.796875, -0.107421875, 0.283203125, 0.439453125, -0.36328125, -0.396484375, -0.7578125, -0.03369140625, 0.671875, -0.58984375, 0.228515625, 0.60546875, 0.6875, -0.60546875, -0.201171875, -0.52734375, -0.0693359375, -0.423828125, 0.21875, -0.52734375, 0.049560546875, 0.058837890625, -0.08154296875, 0.337890625, -0.0869140625, -0.71875, -0.9296875, 0.0245361328125, 0.400390625, -0.201171875, -0.28125, 0.0036773681640625, -0.2265625, -0.37890625, 0.8125, -0.232421875, -0.23046875, -0.126953125, -0.203125, 0.279296875, 0.2158203125, -0.84765625, 0.39453125, 0.02001953125, 0.40234375, -0.27734375, 0.3671875, 0.43359375, 0.349609375, 0.466796875, 0.31640625, 0.0206298828125, -0.22265625, -0.15625, 0.026123046875, 0.4296875, -0.41796875, -0.78125, 0.283203125, -0.41796875, -0.031494140625, -0.1826171875, -0.1220703125, -0.287109375, 0.5625, -0.01177978515625, 0.62109375, 0.30078125, 0.65234375, 0.68359375, -0.369140625, -0.7265625, -0.490234375, -0.035400390625, 0.00011968612670898438, 0.080078125, 0.5234375, -0.765625, -0.578125, 0.034912109375, 0.4609375, -0.55078125, -0.498046875, -0.37109375, -0.1767578125, 0.486328125, -0.2236328125, -0.7109375, 0.1162109375, 0.50390625, 0.431640625, -0.359375, 0.08935546875, -0.12890625, 0.48046875, 0.427734375, -0.59375, -0.45703125, -0.609375, -0.76953125, -0.166015625, -0.10693359375, -0.08984375, -0.21875, -0.5, 0.404296875, -0.796875, -0.5, -0.0228271484375, -0.140625, 0.09716796875, 0.2412109375, -0.16015625, -0.671875, -0.37109375, -0.16796875, -0.0849609375, 0.384765625, 0.0167236328125, -0.09130859375, -0.447265625, -0.9921875, 0.029296875, -0.27734375, -0.5625, -0.055908203125, 0.10302734375, -0.1806640625, 0.46484375, -0.37109375, -0.07177734375, 1.078125, 0.259765625, -0.75390625, -0.30078125, -0.50390625, -0.09765625, -0.7109375, -0.11083984375, 0.40234375, -0.023681640625, -0.28515625, -0.228515625, -0.212890625, -1.171875, 0.369140625, -0.15625, 0.248046875, 0.6875, 0.1396484375, -0.2353515625, 0.0223388671875, 0.2734375, 0.01177978515625, -0.953125, -0.828125, 0.349609375, 0.024169921875, 0.97265625, 0.6484375, 0.02685546875, -0.091796875, 1.1015625, -0.458984375, -0.150390625, -0.63671875, 0.02099609375, 0.2216796875, -0.1357421875, -0.609375, -0.369140625, 0.1650390625, 0.85546875, -0.0595703125, 0.12890625, 0.27734375, -0.09228515625, -0.279296875, 0.79296875, -0.48828125, 0.2294921875, -0.205078125, 0.251953125, 0.1533203125, 0.248046875, 0.044921875, 0.134765625, -0.1181640625, 1.0859375, -0.7890625, 0.31640625, -0.419921875, 0.640625, 0.130859375, -0.53515625, -0.16015625, -0.48828125, -0.3515625, -0.09130859375, -0.357421875, 0.1533203125, -0.054443359375, 0.1953125, 0.4921875, -0.2216796875, 0.5625, -0.419921875, 0.73828125, -0.2216796875, -1.0546875, -0.1337890625, -0.2314453125, 0.1962890625, -0.2001953125, 0.546875, 0.71484375, 0.55078125, -0.765625, -0.69140625, 0.5078125, 0.11962890625, 0.490234375, 0.044921875, -0.48828125, 0.515625, -0.197265625, -0.30859375, -0.57421875, 0.255859375, 0.54296875, 0.060791015625, 0.59765625, 0.69140625, 0.10546875, -0.2177734375, 0.52734375, 0.37890625, 0.1630859375, -0.91015625, 0.265625, 0.294921875, -0.361328125, -0.80859375, -0.84375, -0.326171875, -0.08740234375, -0.8203125, 0.61328125, 0.3359375, 0.31640625, 0.0908203125, 0.5390625, -0.376953125, 0.74609375, 0.10498046875, 0.84765625, 9.632110595703125e-05, 0.0703125, 0.263671875, 0.00799560546875, 0.2421875, -0.412109375, 0.171875, 0.404296875, 0.34765625, -0.0186767578125, 0.27734375, -0.9375, -0.765625, 0.8515625, 0.431640625, -0.5546875, 0.006500244140625, 0.015869140625, 0.71484375, -0.458984375, 0.5, 0.482421875, 0.18359375, 0.322265625, 0.10205078125, 0.158203125, -0.45703125, 0.4375, 0.095703125, 0.2041015625, -0.1982421875, -0.65234375, -0.8359375, -0.341796875, -0.0179443359375, -0.177734375, 0.134765625, 0.71484375, -0.330078125, 0.2890625, -0.26171875, 0.2890625, 0.287109375, 0.8046875, 0.1875, -0.484375, 0.1875, -0.1826171875, -0.2060546875, 0.55859375, -0.2412109375, 0.0162353515625, 0.00811767578125, -0.296875, -0.234375, 0.053955078125, 1.3125, 0.416015625, -0.291015625, -0.00102996826171875, -0.1474609375, -0.1494140625, 0.022705078125, 0.419921875, 0.05322265625, -0.002777099609375, -0.28515625, 0.53515625, 0.515625, -1.3984375, -0.39453125, 0.4296875, 0.041748046875, 0.296875, 0.25, -0.1572265625, 0.0255126953125, 0.109375, -0.0194091796875, 0.263671875, -0.515625, 0.02685546875, 0.04248046875, -0.287109375, -0.3828125, 0.8359375, 0.56640625, 0.50390625, 0.1484375, -0.4453125, 0.8203125, 0.54296875, 0.455078125, 0.28125, -0.031982421875, -0.00860595703125, -0.6328125, 0.2177734375, -0.11767578125, 0.376953125, -0.138671875, -0.39453125, 0.8828125, 0.8359375, -0.1884765625, 0.318359375, -0.55078125, 0.265625, 0.1796875, -0.138671875, 0.0947265625, -0.064453125, -0.89453125, -0.431640625, 0.1103515625, -0.7421875, 0.1533203125, -1.015625, -0.28125, -0.12353515625, -0.482421875, -0.302734375, 0.0234375, 0.56640625, -1.1875, 0.07666015625, -0.59765625, 0.83984375, 0.44921875, 0.310546875, 0.1787109375, 1.171875, 1.3359375, 0.408203125, -0.08984375, -0.333984375, -0.828125, -0.158203125, 0.0830078125, 0.341796875, 0.69140625, -0.67578125, 0.134765625, 0.1435546875, 0.90234375, 0.365234375, -0.162109375, 0.109375, 0.09716796875, 0.2734375, -0.76171875, 0.8515625, -0.66796875, -0.138671875, 0.37890625, -0.07275390625, -0.01312255859375, -0.53125, 0.55859375, -0.93359375, 0.443359375, -0.023193359375, 0.1962890625, -0.427734375, 0.255859375, -0.51953125, -0.44921875, 0.7578125, -1.171875, -0.5078125, -0.35546875, 0.0751953125, -0.6640625, -0.9609375, -0.416015625, -0.294921875, 0.248046875, -0.302734375, 0.6015625, 0.169921875, 0.34765625, -0.0634765625, 0.166015625, 0.93359375, 0.482421875, -1.0625, 0.1865234375, 0.412109375, -0.1328125, 0.1572265625, -0.1123046875, -0.345703125, 1.28125, -1.2265625, 0.027099609375, -0.21875, 0.08349609375, -0.2578125, -0.8125, -0.05712890625, 0.388671875, 0.416015625, -0.279296875, -0.265625, 0.7109375, 0.3515625, 1.25, 0.443359375, 0.71875, -0.484375, -0.08837890625, -0.75, 0.08984375, -0.2578125, -0.419921875, 1.0546875, -0.4453125, 0.63671875, -0.482421875, 0.69140625, -0.37890625, -0.1298828125, -0.482421875, -0.05859375, 0.494140625, -0.2060546875, 0.00042724609375, -0.546875, -0.046875, -0.59375, 0.1240234375, 0.734375, -0.490234375, 0.50390625, 0.6015625, 0.6328125, 0.0771484375, -0.0030364990234375, -0.032470703125, 0.208984375, -0.2890625, 0.31640625, -0.63671875, 0.16796875, -0.9375, 0.447265625, 1.0234375, 0.2255859375, -0.80078125, 0.1748046875, 0.4296875, -0.416015625, 0.35546875, -0.703125, -0.376953125, -0.134765625, -0.36328125, -0.1357421875, 0.31640625, 0.07568359375, -0.427734375, 0.828125, -0.828125, -0.138671875, -0.9375, -0.62890625, 0.66015625, -0.06640625, 0.6875, -0.91796875, -0.5078125, 0.93359375, 0.361328125, 0.2177734375, -0.40234375, -0.275390625, 0.162109375, -0.3046875, -0.1220703125, -0.1298828125, -0.2216796875, 0.220703125, 0.2578125, -0.60546875, 0.50390625, -0.5234375, 0.3359375, 0.033447265625, -0.39453125, -0.3203125, 0.041259765625, -0.00799560546875, -0.1181640625, 0.1669921875, 0.125, -0.197265625, 0.458984375, -0.23828125, -0.2734375, -1.0078125, 0.58203125, 0.07763671875, -0.07421875, 0.306640625, 0.466796875, 0.045166015625, -0.314453125, 0.427734375, -0.328125, 0.484375, -0.79296875, 0.875, -0.58984375, 0.5625, 0.5703125, 0.380859375, -0.2109375, 0.1962890625, 0.08642578125, -0.0177001953125, -0.328125, -0.2412109375, -0.0849609375, -0.396484375, -0.2099609375, -0.97265625, -0.54296875, 0.06689453125, -0.54296875, -0.42578125, -0.474609375, -0.2373046875, 0.173828125, 0.1416015625, 0.2275390625, 0.32421875, -0.11083984375, -0.40625, 0.60546875, 0.07177734375, -1.0703125, 0.349609375, 0.134765625, -0.6171875, -0.37890625, -0.427734375, 0.7734375, 0.5546875, 0.365234375, -1.03125, 0.4921875, -0.828125, -1.140625, 0.1787109375, 0.197265625, -0.33984375, -0.1201171875, -0.69921875, -0.169921875, 0.4296875, 0.1650390625, -0.0888671875, 0.35546875, -0.1572265625, 0.02880859375, -1.046875, 0.51171875, -0.91796875, 0.2353515625, 0.1611328125, 0.388671875, -0.30859375, -0.212890625, -0.0235595703125, 0.890625, -0.244140625, 0.1611328125, 0.134765625, -0.48828125, 0.220703125, 0.1416015625, 0.38671875, 0.34375, 0.62890625, -0.1845703125, 0.388671875, -0.0166015625, -0.365234375, 0.14453125, -0.287109375, -0.310546875, 0.06591796875, 0.310546875, -0.2265625, -0.921875, 0.30859375, -0.26953125, 0.4296875, -0.1298828125, -0.0654296875, -0.453125, -0.193359375, 0.578125, 0.2099609375, -0.390625, 0.43359375, 0.0137939453125, -0.80078125, 0.8671875, -0.1220703125, 0.466796875, 0.1318359375, 0.09326171875, -0.99609375, 0.0869140625, -0.0037384033203125, -0.314453125, 0.3828125, -0.58203125, -0.173828125, 0.1552734375, -0.037353515625, 0.35546875, 0.0400390625, 0.248046875, -0.1748046875, 1.171875, 0.474609375, 0.76953125, 0.1513671875, -0.51171875, -0.03955078125, 0.1845703125, 1.2890625, 0.236328125, -0.56640625, -0.306640625, 0.06103515625, -0.5625, 0.7421875, -0.16796875, 0.061767578125, 0.1328125, 0.298828125, -0.26171875, -0.609375, -0.06494140625, 0.419921875, -0.185546875, 0.498046875, -0.349609375, 0.80859375, 0.1708984375, 0.1298828125, -0.09814453125, -0.1552734375, 0.1650390625, -0.2451171875, -0.921875, 0.322265625, -0.462890625, -0.263671875, -0.0390625, 0.43359375, 0.390625, -0.08154296875, 0.48828125, 0.7890625, 0.154296875, -0.75390625, 0.34765625, -0.5546875, -0.08349609375, -0.50390625, 0.0771484375, 0.498046875, 0.322265625, 0.267578125, 0.2060546875, -0.162109375, 0.2353515625, -0.2431640625, -0.039794921875, -0.154296875, 0.16796875, 0.83984375, 0.259765625, -0.5546875, -0.166015625, 0.84765625, -0.78515625, 0.1708984375, -0.83984375, -0.703125, 0.53515625, 0.1767578125, 0.0791015625, -0.05322265625, 0.443359375, 0.021484375, -0.416015625, 0.62109375, -0.49609375, 0.072265625, 0.8671875, 0.263671875, -0.06396484375, 0.1865234375, -0.353515625, 0.65625, -0.58984375, 0.181640625, -0.03125, -0.353515625, 0.796875, 0.50390625, 0.234375, -1.0234375, -0.271484375, 0.3046875, 0.294921875, 0.173828125, -0.0103759765625, 0.375, -0.478515625, -0.6953125, 0.451171875, 0.4453125, -0.169921875, -0.1513671875, -0.6796875, -0.3046875, -0.2734375, 0.64453125, 0.2216796875, -0.11572265625, 0.1767578125, 0.07568359375, -0.33203125, 0.2470703125, -0.125, -0.3828125, -0.0294189453125, 0.1591796875, 1.125, -0.458984375, 0.1708984375, -0.09619140625, 0.419921875, -0.10107421875, 0.470703125, -0.5234375, -0.1220703125, 0.55078125, -0.400390625, 0.000820159912109375, -0.1259765625, -0.244140625, 0.49609375, -0.158203125, 0.06591796875, -0.053955078125, -0.09716796875, 0.55859375, 0.244140625, -0.322265625, -0.39453125, 0.50390625, 1.578125, -0.671875, 0.109375, 0.0023040771484375, -0.050048828125, 0.0283203125, 0.236328125, -0.2578125, 0.57421875, -0.61328125, 0.12451171875, 0.173828125, 0.373046875, -0.6796875, 0.57421875, 0.06591796875, 0.267578125, -0.0218505859375, -0.3125, 0.1064453125, -0.474609375, -1.203125, 0.357421875, 0.337890625, -0.087890625, 0.294921875, 0.859375, 0.609375, 0.4921875, 0.5390625, 0.283203125, -0.0673828125, -0.9609375, 0.01483154296875, 1.265625, -0.4609375, 0.98828125, -0.60546875, 0.216796875, 0.279296875, -0.021728515625, -0.44140625, 0.158203125, -1.484375, 0.9765625, 0.546875, 0.76171875, 0.890625, 0.49609375, -0.515625, 0.5859375, -0.193359375, 0.279296875, 0.271484375, -0.76171875, 0.0625, 0.63671875, 0.07861328125, 0.1826171875, -0.181640625, 0.51953125, 0.5390625, -0.154296875, -0.310546875, -0.310546875, -0.64453125, 0.234375, 0.310546875, -0.4140625, 1.046875, 0.02392578125, -1.359375, 0.07470703125, 0.1015625, -0.034423828125, -0.375, -0.7109375, 0.236328125, 1.1015625, -0.61328125, -0.486328125, 0.0029296875, 0.00982666015625, -0.034423828125, 0.3671875, -0.64453125, 0.1630859375, -0.84765625, -0.0233154296875, 0.263671875, -0.65234375, -0.44140625, 0.62890625, -0.39453125, 0.1328125, 0.06103515625, -0.6484375, -0.00531005859375, -0.75390625, 0.72265625, -0.1689453125, 1.390625, 0.05029296875, -0.000873565673828125, -0.1376953125, -0.220703125, 0.6015625, -0.09716796875, 0.10400390625, 0.453125, -0.330078125, -0.169921875, -0.08544921875, 0.466796875, -0.3984375, -0.00193023681640625, -0.0751953125, -0.220703125, 0.2060546875, 0.28125, 0.138671875, -0.08447265625, -0.158203125, -0.365234375, 0.361328125, 0.09375, -0.06298828125, 0.1328125, 0.8984375, 0.008544921875, 1.1484375, -0.412109375, -0.4375, -0.30078125, 0.283203125, 0.3203125, 0.41796875, 0.203125, -0.2373046875, -0.65234375, -0.4609375, -0.314453125, 0.12060546875, -0.58984375, 0.208984375, 0.390625, -0.275390625, -0.12109375, -0.5625, -0.85546875, 0.003692626953125, -0.130859375, 0.88671875, 0.35546875, 0.07666015625, -0.546875, -0.5390625, 0.87109375, 0.208984375, 0.96484375, -0.69921875, -0.8671875, 0.58984375, 0.45703125, 0.69921875, -0.96875, 0.8515625, -0.0257568359375, 0.59765625, 0.0291748046875, 0.6640625, 0.06640625, 0.83984375, 0.6875, -0.31640625, -0.142578125, -0.04248046875, -0.78125, 0.0186767578125, -1.03125, 0.026123046875, -0.341796875, -0.349609375, 0.275390625, 0.412109375, 0.22265625, 0.419921875, -0.326171875, -0.765625, -0.3671875, 0.16796875, 0.8671875, -0.384765625, 0.1396484375, -0.07666015625, 0.890625, -0.2236328125, 0.1767578125, 0.328125, 0.384765625, 0.5078125, -0.1123046875, 0.1728515625, 0.1953125, -0.41796875, 0.796875, -0.035888671875, 0.408203125, 0.3046875, 0.12451171875, 6.389617919921875e-05, 0.1748046875, 0.22265625, -0.69921875, 0.373046875, -0.6015625, 0.1484375, 0.318359375, 0.330078125, -0.0289306640625, -0.2158203125, -0.69140625, 0.2177734375, 0.1796875, -0.023681640625, -0.31640625, 0.9140625, 0.306640625, 0.60546875, 0.2890625, 0.62890625, 0.1220703125, -0.75, -1.03125, 0.2197265625, 0.08251953125, 0.0191650390625, -0.345703125, -0.578125, 0.78125, -0.2021484375, 0.625, 0.00689697265625, 0.44140625, -0.053466796875, 0.4296875, 0.94140625, 0.07177734375, 0.189453125, 0.193359375, 0.76953125, 0.41796875, 0.11572265625, -0.6875, 0.75390625, 0.6796875, -1.140625, 0.3203125, 0.5546875, -0.0966796875, 0.265625, 0.306640625, 0.80859375, 0.421875, 0.34375, 0.043701171875, -0.72265625, -0.62890625, -0.03515625, -0.65625, -0.28125, -0.0179443359375, -0.423828125, -0.58203125, 0.390625, -0.71875, -0.1787109375, 0.050537109375, 0.03173828125, 0.01397705078125, -0.19921875, -0.546875, 0.482421875, -0.6171875, -0.61328125, -0.5234375, -0.00738525390625, -0.16796875, 0.17578125, -0.466796875, 0.21484375, -0.1728515625, -0.30859375, 0.64453125, 0.7109375, -1.1015625, -0.296875, -0.455078125, 0.0302734375, -0.234375, -0.76953125, 0.1630859375, -0.107421875, 0.263671875, 0.0277099609375, -0.3359375, 0.8203125, 0.33984375, -0.984375, -0.5859375, 0.427734375, -0.126953125, -0.07470703125, 0.69921875, 0.546875, -0.2890625, 0.271484375, 0.73828125, -0.1005859375, -0.212890625, -0.208984375, -0.451171875, -0.12060546875, 0.416015625, 0.1142578125, -0.1455078125, 0.703125, 0.71875, -0.404296875, -0.00118255615234375, -0.96875, 0.73828125, -0.3046875, 0.33984375, 0.07666015625, 0.3125, -0.0517578125, 0.045166015625, 0.396484375, 0.203125, -0.32421875, -0.29296875, -0.09765625, -0.38671875, 0.40625, 0.1328125, 0.298828125, -0.11328125, 0.09521484375, -0.07421875, -0.625, -0.099609375, 0.2216796875, -0.984375, -0.2734375, 0.095703125, -0.2255859375, -0.058349609375, 0.255859375, -0.298828125, -0.220703125, 0.29296875, 0.66796875, 0.341796875, -0.52734375, -0.166015625, -0.1806640625, 0.64453125, -0.52734375, 0.0908203125, -0.72265625, -0.107421875, 0.703125, 0.09375, -0.07958984375, -0.41015625, 0.37890625, -0.400390625, 0.076171875, -0.4296875, -1.5703125, -0.57421875, -0.06884765625, 0.0732421875, -0.201171875, 0.4375, -0.1025390625, 0.703125, 0.4296875, 0.78125, -0.0849609375, -0.8125, -0.86328125, -0.30078125, -0.6875, 0.0238037109375, 0.396484375, 0.400390625, 0.006500244140625, 0.56640625, 0.474609375, -1.140625, 0.609375, 0.029052734375, 0.0703125, 0.416015625, 0.142578125, 0.2412109375, 0.1572265625, 0.259765625, -0.158203125, -0.2109375, 0.1552734375, 0.49609375, -0.111328125, -0.1591796875, -0.71484375, 0.0771484375, 0.0673828125, 0.09130859375, 0.37890625, -0.498046875, -0.35546875, 0.26953125, -0.310546875, 0.1142578125, -0.203125, -0.0223388671875, -0.1728515625, 0.1064453125, 0.22265625, 0.150390625, 0.7578125, -0.10888671875, 0.6640625, -0.67578125, 0.310546875, 0.19140625, 0.330078125, 0.37890625, -0.1455078125, 1.28125, -0.00823974609375, 0.5390625, 0.119140625, -0.08544921875, -0.84375, 0.62109375, -0.96875, -1.359375, -0.4921875, -0.0257568359375, 0.486328125, 0.890625, -0.0042724609375, 0.1826171875, -0.54296875, 0.0303955078125, -0.484375, -0.2421875, 0.703125, -0.4609375, -0.166015625, 0.1591796875, -0.6875, -0.73046875, -0.1259765625, 0.451171875, -0.5078125, -0.271484375, -0.25, 0.419921875, 0.16796875, 0.484375, 0.8203125, -0.21484375, -0.81640625, -0.0174560546875, 0.78515625, -0.1953125, -0.158203125, 0.5703125, 0.390625, 0.1728515625, 0.0908203125, 0.32421875, -0.255859375, -0.1015625, 0.07568359375, -0.0181884765625, -1.03125, 1.421875, 0.1962890625, 0.15234375, -0.0096435546875, 1.265625, -0.59375, -0.546875, -0.1826171875, -0.66015625, 0.0311279296875, 0.376953125, 0.50390625, -0.0390625, -0.057861328125, -0.486328125, 0.19921875, 0.7890625, 0.9140625, -0.130859375, -0.82421875, -0.62109375, 0.30859375, -0.302734375, 0.8046875, 0.63671875, 0.470703125, 0.62109375, -0.09521484375, 0.4453125, 0.875, -0.298828125, -0.376953125, 0.2138671875, 0.130859375, 0.65234375, 0.29296875, 0.2080078125, 0.9296875, -0.408203125, -0.408203125, -0.318359375, -0.00994873046875, 0.1357421875, -1.046875, -0.400390625, -0.1455078125, 0.26171875, -0.61328125, -0.88671875, 0.50390625, 0.1640625, -0.2041015625, -0.58203125, -0.703125, 0.59375, 0.10498046875, 0.3125, 0.83984375, 0.056640625, 0.04931640625, -0.224609375, 0.1845703125, -0.302734375, -0.00946044921875, -0.1533203125, -0.3671875, -0.93359375, 0.115234375, 0.0262451171875, 0.515625, -0.169921875, 0.228515625, -0.333984375, 0.1142578125, -0.125, -0.369140625, 0.388671875, -0.94140625, -0.640625, -0.341796875, -0.8203125, 0.2197265625, -0.306640625, -0.1396484375, -0.09228515625, -0.23828125, 0.7265625, 0.76953125, -0.33984375, 0.20703125, -0.033447265625, -0.7265625, 0.625, -0.1337890625, 0.375, -0.29296875, 0.5546875, -0.6796875, -0.357421875, -0.08349609375, 0.369140625, 0.490234375, 0.40625, 0.2158203125, 0.5078125, 0.671875, 0.140625, 0.328125, -0.4375, -0.15234375, -0.59765625, -0.185546875, 0.306640625, -0.14453125, -0.318359375, 0.99609375, -0.5625, 0.6875, 0.26171875, -0.3828125, -0.01129150390625, 0.2158203125, -0.0419921875, 1.1796875, -0.447265625, 0.439453125, 0.8359375, 0.4765625, 0.2216796875, 0.1083984375, -0.1591796875, 0.58984375, 0.04248046875, -0.6640625, 0.5234375, 0.130859375, -0.7734375, 0.248046875, 0.7109375, -0.5546875, -0.271484375, 0.75390625, -0.92578125, 0.177734375, 0.051513671875, 0.4140625, -0.166015625, -0.041259765625, 0.4453125, -0.3671875, 0.7421875, -0.4609375, -0.498046875, -0.09033203125, 0.00958251953125, 0.61328125, -0.13671875, -0.8515625, 0.29296875, -0.099609375, -0.361328125, 0.33984375, -0.1611328125, 0.37890625, -0.51171875, 0.1875, -0.58203125, 1.1328125, 0.1357421875, 0.5078125, -0.302734375, -0.1669921875, 0.6953125, -0.55859375, -0.037353515625, -0.049072265625, -1.84375, 0.46484375, -0.46484375, 0.2216796875, -0.69140625, 0.052001953125, -0.034423828125, -0.3125, 0.30859375, -0.419921875, -0.11572265625, 0.1083984375, 0.2412109375, 0.7421875, 0.10986328125, -0.458984375, -1.09375, 0.328125, -0.3671875, 0.07421875, -0.61328125, -0.126953125, -1.0625, 0.38671875, -0.130859375, 0.5390625, 0.26171875, 0.24609375, 0.06689453125, 0.48828125, -0.498046875] which is a list in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.

In [19]:
# Embedding function using Bedrock embeddings
def get_embedding_function():
    embeddings = BedrockEmbeddings()
    return embeddings

# Function to split documents into chunks
def split_documents(documents: list[Document]):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=5000,
        length_function=len
    )
    return splitter.split_documents(documents)

# Function to calculate unique chunk IDs for documents
def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the chunk metadata
        chunk.metadata["id"] = chunk_id

    return chunks

# Function to generate the database from the documents
def generate_database(path):
    # Load the documents
    # loader = PyPDFDirectoryLoader(path)
    loader = PyPDFLoader(path)
    documents = loader.load()
    
    # Split the documents into chunks
    documents = split_documents(documents)
    
    # Initialize the Chroma database
    database = Chroma(
        persist_directory=DATABASE_PATH,
        embedding_function=get_embedding_function()
    )
    
    # Calculate IDs for chunks
    chunks_with_ids = calculate_chunk_ids(documents)
    
    existing_items = database.get(include=[])  # Retrieve existing document IDs
    existing_ids = set(existing_items["ids"])
    
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # # Add only new documents to the database
    new_chunks = []
    for chunk in chunks_with_ids:
         if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
    return new_chunks
    if new_chunks:
        print(f"Adding new documents to the database: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]

        database.add_documents(new_chunks, ids=new_chunk_ids)

        database.persist()
    else:
        print("All the documents are added")
    
    return documents

# Run the function
# generate_database()

In [83]:
report_path = "../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf"
new_chunks = generate_database(report_path)

Number of existing documents in DB: 282


In [91]:
d = new_chunks[22]
type(d)

langchain_core.documents.base.Document

In [85]:
new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
new_chunk_ids

['../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:0:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:1:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:2:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:3:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:4:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:5:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:6:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:7:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:8:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:9:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:10:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:11:0',
 '../data/docs/Consommation de Base/Métro/2021_Rapport annuel_FR.pdf:12:0',
 '../data/docs/Consomm

In [78]:
loader = PyPDFLoader(report_path)
documents = loader.load()

# Split the documents into chunks
documents = split_documents(documents)
documents

[Document(metadata={'source': '../data/docs/Consommation de Base/Métro/2020-annuel-10Q-FR-FINAL.pdf', 'page': 0}, page_content='Rapport annuel\n2020'),
 Document(metadata={'source': '../data/docs/Consommation de Base/Métro/2020-annuel-10Q-FR-FINAL.pdf', 'page': 1}, page_content="PROFIL CORPORATIF\nMETRO INC. est un chef de file dans les domaines de l'alimentation et de la pharmacie au Québec et en Ontario. Par \nses activités de détaillant, franchiseur, distributeur et fabricant, elle exploite ou approvisionne un réseau de 953 \nmagasins d’alimentation sous plusieurs bannières dont Metro, Metro Plus, Super C, Food Basics, Adonis et Première \nMoisson, de même que 648 pharmacies principalement sous les bannières Jean Coutu, Brunet, Metro Pharmacy et \nFood Basics Pharmacy, qui procurent de l'emploi directement ou indirectement à près de 90 000 personnes.\nFAITS SAILLANTS 2020 \n• Chiffre d'affaires de 17 997,5 millions $, en hausse de 7,3 % et de 7,7 % en excluant l'impact de l'IFRS 16\

In [79]:
len(documents)

95

In [20]:
def get_embedding_function():
    return BedrockEmbeddings()

def get_relevant_context(query, prompt, k = 5):
    embedding_function_instance = get_embedding_function()
    db = Chroma(persist_directory=DATABASE_PATH, embedding_function=embedding_function_instance)

    results = db.similarity_search_with_score(query, k = 5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(prompt)

    prompt_text = prompt_template.format(context=context_text, question=query)

    conversation = [
        {
            "role": "user",
            "content": [{"text": prompt_text}]  # Wrap the prompt in a list of dictionaries
        }
    ]

    return conversation, results

def query_llm(conversation, client, model_id):
    try:
        # Send the message to the model, using a basic inference configuration
        response = client.converse(
                    modelId=model_id,
                    messages=conversation,
                    inferenceConfig={"maxTokens": 4000, "temperature": 1},
                    additionalModelRequestFields={"top_k": 250, "top_p": 1},
        )

        # Extract and print the response text
        return response["output"]["message"]["content"][0]["text"]
        #print(response_text)

    except (ClientError, Exception) as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        exit(1)

In [58]:
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
client = boto3.client("bedrock-runtime", region_name="us-west-2")

In [76]:
query_text = """
    Basic financial indicators: Turnover, gross margin, free cash flow, net debt, profit (before interest,
    taxes, depreciation and amortization), earnings per share, for Metro company in 2023 metro metro metro
"""

PROMPT_TEMPLATE = """
                    <meta>
                        current year: 2024
                        role: financial analyst
                        language: english
                        expertise: finance, annual reports, financial performance
                        tone: professional, concise, objective
                        response style: factual, summary-focused
                    <meta>

                    You are a financial analyst. Answer the following question in English, using only the information from the provided context.

                    Question: {question}

                    Context:  {context}

                    Output: Response should be a table with all the required variables as rows, different years as columns, variation as column, leave it empty if the value does not exist.

                    Answer in a clear and concise manner, focusing on key financial insights.
                    """

In [77]:
# Format the sources and print
conversation, relevant_results = get_relevant_context(query=query_text, prompt=PROMPT_TEMPLATE, k = 5)

response_text = query_llm(conversation, client, model_id)

sources = [doc.metadata.get("id", None) for doc, _score in relevant_results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: Here is a table with the key financial indicators for Metro company in 2023:

| Metric | 2023 |
|--------|------|
| Turnover | - |
| Gross margin | - |
| Free cash flow | - |
| Net debt | - |
| Profit (before interest, taxes, depreciation and amortization) | - |
| Earnings per share | - |

Based on the provided context, the specific financial results for Metro company in 2023 were not disclosed. The information given focused on Metro's share buyback program, dividend payments, and stock performance, but did not provide the detailed financial metrics requested in the question. Therefore, I am unable to populate the table with the specific 2023 values for Metro. The context indicates Metro had a successful year, with increases in revenue, net income, and earnings per share, but the exact figures were not provided.
Sources: ['../data/docs/Consommation de Base/Métro/2020-annuel-10Q-FR-FINAL.pdf:25:0', '../data/docs/Consommation de Base/Métro/2020-annuel-10Q-FR-FINAL.pdf:13:0', '.

In [62]:
# Format the sources and print
conversation, relevant_results = get_relevant_context(query=query_text, prompt=PROMPT_TEMPLATE, k = 5)

response_text = query_llm(conversation, client, model_id)

sources = [doc.metadata.get("id", None) for doc, _score in relevant_results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: Here is the requested financial information for ROGERS company in a table format:

| Metric                           | 2023 | 2022 | Variation |
|----------------------------------|------|------|------------|
| Turnover                         | 19,308 | 15,396 | 25% |
| Gross Margin                    | N/A | N/A | N/A |
| Free Cash Flow                   | 2,414 | 1,773 | 36% |
| Net Debt                         | N/A | N/A | N/A |
| Profit (EBITDA)                  | 8,581 | 6,393 | 34% |
| Earnings per Share               | 4.60 | 3.79 | 21% |

Key Insights:
- Turnover increased by 25% in 2023 compared to 2022, driven by growth in both service revenue (+27%) and equipment revenue (+17%).
- Free cash flow grew by 36%, reaching $2,414 million, reflecting the strong operating performance.
- EBITDA increased by 34% to $8,581 million, with the EBITDA margin expanding by 2.9 percentage points to 44.4%.
- Adjusted earnings per share rose by 21% to $4.60, showcasing the company'

In [13]:



# # Query and generate context for the prompt
# query_text = "Combien Bell Canada a tiré un produit brut totalisant à la suite de l'émission de débentures à moyen terme (MTN) de 7 ans et de 10 ans ?"


# results = db.similarity_search_with_score(query_text, k=5)

# context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
# prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# prompt = prompt_template.format(context=context_text, question=query_text)

# # Prepare the conversation payload for Bedrock
# conversation = [
#     {
#         "role": "user",
#         "content": [{"text": prompt}]  # Wrap the prompt in a list of dictionaries
#     }
# ]

# Set up the model client
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
client = boto3.client("bedrock-runtime", region_name="us-west-2")

try:
    # Send the message to the model, using a basic inference configuration
    response = client.converse(
                modelId=model_id,
                messages=conversation,
                inferenceConfig={"maxTokens": 4000, "temperature": 1},
                additionalModelRequestFields={"top_k": 250, "top_p": 1},
    )

    # Extract and print the response text
    response_text = response["output"]["message"]["content"][0]["text"]
    #print(response_text)

    # Format the sources and print
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

except (ClientError, Exception) as e:
    print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
    exit(1)

Response: Selon les informations fournies dans le rapport annuel 2019 de BCE Inc., Bell Canada a émis avec succès des débentures à moyen terme (MTN) d'une durée de 7 ans et de 10 ans, totalisant un produit brut de 1,15 milliard de dollars canadiens.

Spécifiquement:

- Le 10 septembre 2019, Bell Canada a émis des MTN, série M-50, à 2,90%, d'un capital de 550 millions de dollars, échéant le 10 septembre 2029. 

- Le 13 mai 2019, Bell Canada a émis des MTN, série M-49, à 2,75%, d'un capital de 600 millions de dollars, échéant le 29 janvier 2025.

Donc, le produit brut total tiré de l'émission de ces deux séries de débentures MTN de 7 ans et 10 ans s'élève à 1,15 milliard de dollars canadiens.
Sources: ['data/docs/2019-rapport-annuel-bce.pdf:146:0', 'data/docs/2018-rapport-annuel-bce.pdf:157:0', 'data/docs/2019-rapport-annuel-bce.pdf:36:0', 'data/docs/2018-rapport-annuel-bce.pdf:155:0', 'data/docs/2018-rapport-annuel-bce.pdf:38:0']


In [None]:
["give me turnover based on the report", "give earnings per share based on the report", ...]