# Packages

In [4]:
import boto3
import numpy as np
from langchain_community.vectorstores import Chroma
import shutil
import sys
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import Chroma

from botocore.exceptions import ClientError
from langchain.prompts import ChatPromptTemplate

import json

import psycopg2
import warnings
import pickle

import uuid
import time


from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

# Helpers

In [51]:
def query_llm(conversation, client, model_id):
    """Send a conversation to a model through the Converse API and return its text reply.

    Parameters
    ----------
    conversation : list[dict]
        Messages in Converse format, e.g.
        ``[{"role": "user", "content": [{"text": "..."}]}]``.
    client
        Object exposing ``converse(...)``; in this notebook a boto3
        ``bedrock-runtime`` client.
    model_id : str
        Identifier of the model to invoke.

    Returns
    -------
    str
        The ``text`` of the first content block in the model's reply.

    Raises
    ------
    Exception
        Any error raised while invoking the model or reading its reply is
        logged and then re-raised unchanged, so callers can catch it and
        retry (e.g. on throttling).
    """
    try:
        # Send the message to the model, using a basic inference configuration
        response = client.converse(
            modelId=model_id,
            messages=conversation,
            inferenceConfig={"maxTokens": 4096, "temperature": 0},
            additionalModelRequestFields={"top_k": 250, "top_p": 1},
        )

        # Extract the response text
        return response["output"]["message"]["content"][0]["text"]

    except Exception as e:
        # Re-raise instead of calling exit(): terminating the interpreter from
        # a helper makes retry logic in callers impossible.
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        raise


# Templates

In [7]:
import tomllib

# Load the .toml file
# (opened in binary mode, which is what tomllib.load expects)
with open("../templates/analysis.toml", "rb") as f:
    config = tomllib.load(f)

# Access the configuration data
# Printing the parsed dict shows the prompt sections defined in the template.
print(config)

{'prompts': {'instruction_prompt': 'Extract the turnover of the company in >>>>>CONTENT<<<<<< for the year of report and previous years if available.\nReturn the output as valid JSON.\n', 'role_prompt': "You are a throughly trained financial analyst that is expert in analyzing financial reports. \nYou diligently complete tasks as instructed.\nYou never make up any information that isn't there.\n", 'task_prompt': 'The information about turnover is usually reported in a table and it is compared with previous years turnover.\nYou have to extract the value of turnover, for all the years inside the report, from the table with its unit of measure.\n', 'example_prompt': '------START EXAMPLE 1------\n\nkey highlights\nyear, 2022, 2021, growth\nturnover, 1.2M$ CAD, 1.1M$ CAD, 9.1%\n\n------END EXAMPLE 1------\n\n------START EXAMPLE 2------\n\nkey financial highlights:\ninformation are in thousand dollars\nyear, 2023, 2022, 2021, 2020, 2019\nturnover, 82813, 84129, 785432, 79213, 65422\n\n------

# Analyzer

In [212]:
import glob
import os

# Path to the directory
directory_path = '../data/docs'

# Find all PDF files recursively
# NOTE(review): glob does not promise a stable ordering, so the positional
# indexing/slicing of pdf_files in later cells may pick different reports on
# another machine — consider wrapping this call in sorted(...).
pdf_files = glob.glob(os.path.join(directory_path, '**', '*.pdf'), recursive=True)

# Model identifier and Bedrock runtime client that later cells pass to query_llm.
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
client = boto3.client("bedrock-runtime", region_name="us-west-2")

In [234]:
def append_prompt(template_dir):
    with open(template_dir, "rb") as f:
        settings = tomllib.load(f)

    prompts = [settings['instruction_prompt'], 
               settings['task_prompt'], 
               settings['example_prompt'], 
               settings['reasoning_prompt']]
    return "\n".join(prompts)
    

def assemble_analysis_prompt(content, template_dir):
    with open(template_dir, "rb") as f:
        settings = tomllib.load(f)

    message = [
        {"role" : "user", "content" : [{"text" : settings['role_prompt']}, 
                                       {"text" : settings['task_prompt']},
                                       {"text" : settings['example_prompt']},
                                       {"text" : settings['reasoning_prompt']},
                                       {"text" : settings['output_prompt']},
                                       {"text" : f">>>>>\n{content}\n<<<<<"},
                                       {"text" : settings['instruction_prompt']}]}
    ]
    return message
    

In [222]:
# Show which report sits at index 60; the following cell loads this same entry.
pdf_files[60]

'../data/docs/Consommation de Base/Couche-Tard/AnnualReport2019_long_FR-Protege.pdf'

In [217]:
# Load the selected PDF and join the page_content of the first 10 loaded
# documents with a visible separator; this text is what gets sent to the model.
loader = PyPDFLoader(pdf_files[60])
documents = loader.load()
content = "\n\n -------------------- \n\n".join([d.page_content for d in documents[:10]])

In [216]:
# Build the table-of-contents prompt from its template and query the model.
toc_prompt = assemble_analysis_prompt(content, "../templates/analysis_table_of_content.toml")
toc_response = query_llm(toc_prompt, client, model_id)
# NOTE(review): this prints the assembled prompt; toc_response is not displayed in this cell.
print(toc_prompt)

[{'role': 'user', 'content': [{'text': ">>>>>ROLE<<<<<<\nYou are a throughly trained financial analyst that is expert in analyzing financial reports. \nYou diligently complete tasks as instructed.\nYou never make up any information that isn't there.\n>>>>>END OF ROLE<<<<<<\n"}, {'text': '>>>>>TASK<<<<<<\nBased on the table of contents provided in the >>>>>CONTENT<<<<<< extract the page number of the chapters that include the following information:\n1 - key financial highlights, elements, indicators\n2 - sector analysis, sector trends, competitions\n3 - executive comments and performance trends\nyour task is to find the "page start" and "page end" for each topic.\n>>>>>END OF TASK<<<<<<\n'}, {'text': 'No example is provided for the task.'}, {'text': '- Is the title of the chapter relevant to the topic and could include insightful content about the topic?'}, {'text': 'The output format should be JSON with the following structure.\n{\n    "description" : ...,\n    "topic1" : {\n        "s

In [220]:
# Ad-hoc query without a template: ask directly for the page range of the
# financial-highlights chapter, passing the same wrapped report text.
r = query_llm([{"role" : "user", 
            "content" : [{"text" : "Based on the table of contents in the following report, which chapter is about financial highlights? give the start page and end page of the chapter. The output should be just two number. Response should be in english"}, 
                         {"text" : f">>>>>\n{content}\n<<<<<"}]}], 
          client, model_id)

In [221]:
# Display the raw model answer returned by the ad-hoc query above.
r

'Based on the table of contents, the chapter about financial highlights is in the "Section sur les résultats financiers" which starts on page 22 and ends on page 127.'

In [139]:
# Run the three template-driven analyses on the same `content`, one model call each.
# The raw replies are kept in response / response2 / response3 for the next cell.

# 1) Basic financial indicators
analysis_prompt = assemble_analysis_prompt(content, "../templates/analysis_basic_indicators.toml")
response = query_llm(analysis_prompt, client, model_id)
print("[1] Key financial elementes retrieved")

# 2) Sector analysis
analysis_prompt2 = assemble_analysis_prompt(content, "../templates/analysis_sector.toml")
response2 = query_llm(analysis_prompt2, client, model_id)
print("[2] Sector analysis completed")

# 3) Report sentiment
analysis_prompt3 = assemble_analysis_prompt(content, "../templates/analysis_sentiment.toml")
response3 = query_llm(analysis_prompt3, client, model_id)
print("[3] Report sentiment analysis completed")

[1] Key financial elementes retrieved
[2] Sector analysis completed
[3] Report sentiment analysis completed


In [140]:
import json

# Parse the three raw model replies as JSON.
# json.loads raises if a reply is not valid JSON, so this cell fails loudly
# when the model does not follow the requested output format.
d1 = json.loads(response)
d2 = json.loads(response2)
d3 = json.loads(response3)


In [None]:
import json

import boto3
import numpy as np

# Uses `client`, the bedrock-runtime client created earlier in the notebook.

# Sample document sections
document_sections = [
    "Section 1 text about recent technology advancements...",
    "Section 2 text about market trends...",
    "Section 3 text on regulatory changes...",
    # Add more sections as needed
]

# Embed each section
# Titan embedding models are called through InvokeModel with an `inputText`
# body; the chat-oriented Converse API used before does not return an
# `embedding` field.
section_embeddings = []
for section in document_sections:
    response = client.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=json.dumps({"inputText": section}),
        contentType="application/json",
        accept="application/json",
    )
    # The response body is a stream containing a JSON document.
    response_body = json.loads(response["body"].read())
    embedding = np.array(response_body["embedding"])
    section_embeddings.append(embedding)

# Convert to numpy array for easier processing
section_embeddings = np.array(section_embeddings)


In [391]:
from haystack import Document as DocumentH
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_integrations.components.embedders.amazon_bedrock import (
    AmazonBedrockDocumentEmbedder,
    AmazonBedrockTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# SECURITY: AWS credentials must never be written into the notebook.
# The access key pair that used to be hardcoded here should be treated as
# compromised and rotated. Provide credentials from outside the notebook
# instead (environment variables set in the shell, a shared credentials
# profile, or an attached IAM role); boto3 picks them up automatically.
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

# Embedding model used to index and query report pages.
embedder_model_id = "amazon.titan-embed-text-v2:0"

# Chat model used for the analyses.
model_id = "anthropic.claude-3-haiku-20240307-v1:0"

client = boto3.client("bedrock-runtime", region_name="us-west-2")

def document_embedder_pipline(file_path, embedder_model_id):
    """Embed one PDF into a fresh in-memory store and build a query pipeline over it.

    Parameters
    ----------
    file_path : str
        Path of the PDF to load with ``PyPDFLoader``.
    embedder_model_id : str
        Bedrock embedding model used both for the documents and, inside the
        returned pipeline, for the query text.

    Returns
    -------
    tuple
        ``(query_pipeline, document_store)``: a pipeline wiring a text
        embedder into an embedding retriever, and the store it reads from.
    """
    store = InMemoryDocumentStore(embedding_similarity_function="cosine")

    # Convert every loaded LangChain document into a Haystack Document.
    loaded_docs = PyPDFLoader(file_path).load()
    haystack_docs = [
        DocumentH(content=doc.page_content, meta=doc.metadata)
        for doc in loaded_docs
    ]

    # Embed the documents (the "source" meta field is embedded with the text)
    # and persist them in the store.
    doc_embedder = AmazonBedrockDocumentEmbedder(
        model=embedder_model_id,
        meta_fields_to_embed=["source"],
    )
    embedded_docs = doc_embedder.run(haystack_docs)['documents']
    store.write_documents(embedded_docs)

    # Query side: embed the query text, then retrieve by embedding.
    pipeline = Pipeline()
    pipeline.add_component("text_embedder", AmazonBedrockTextEmbedder(model=embedder_model_id))
    pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=store))
    pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    return pipeline, store

def extract_relevent_and_prompt_llm(query_pipeline, template_dir, top_k = 10):
    """Retrieve the pages most relevant to a template and ask the LLM to analyse them.

    Parameters
    ----------
    query_pipeline
        Haystack pipeline with a ``text_embedder`` feeding a ``retriever``.
    template_dir : str
        Path of the TOML prompt template; it is used both to build the
        retrieval query and to assemble the analysis prompt.
    top_k : int, default 10
        Maximum number of retrieved documents passed to the model.

    Returns
    -------
    tuple[str, dict] or str
        ``(response, response_dict)`` when the model reply parses as a JSON
        object; ``response_dict`` additionally gets a ``pages`` entry listing
        the pages that were sent. When the reply is not a JSON object, the raw
        ``response`` alone is returned.
    """
    query = append_prompt(template_dir)

    # Ask the retriever itself for top_k hits: slicing its output afterwards
    # can only shrink the result, never return more than its own default.
    result = query_pipeline.run({
        "text_embedder": {"text": query},
        "retriever": {"top_k": top_k},
    })

    relevant_results = result['retriever']['documents'][:top_k]

    # Present pages in reading order rather than in relevance order.
    relevant_results = sorted(relevant_results, key = lambda x : x.meta['page'])

    content = "\n\n -------------------- \n\n".join([d.content for d in relevant_results])

    analysis_prompt = assemble_analysis_prompt(content, template_dir)
    response = query_llm(analysis_prompt, client, model_id)

    # Only parsing problems are treated as "invalid format"; anything else
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    try:
        response_dict = json.loads(response)
    except (TypeError, ValueError):
        response_dict = None

    if not isinstance(response_dict, dict):
        print("Invalid format returned by LLM")
        return response

    response_dict['pages'] = [r.meta['page'] for r in relevant_results]
    return response, response_dict
    
def llm_pipeline(query_pipeline, name, with_delays = 1):
    """Run the basic-indicator, sector and sentiment analyses for one report.

    Parameters
    ----------
    query_pipeline
        Retrieval pipeline for the report, passed through to
        ``extract_relevent_and_prompt_llm``.
    name : str
        Label stored under the ``name`` key of the result.
    with_delays : int or float, default 1
        Seconds to sleep after each analysis call.

    Returns
    -------
    dict
        ``{'name': name, 'basic': ..., 'sectore': ..., 'sentiment': ...}``.
        Each analysis entry is ``{'text': raw_reply, 'obj': parsed_reply}``
        when the reply could be parsed, otherwise ``{'text': raw_reply}``.
    """
    # NOTE: the 'sectore' spelling is kept on purpose; it is the key already
    # present in results produced by earlier runs of this function.
    analyses = (
        ('basic', "../templates/analysis_basic_indicators.toml"),
        ('sectore', "../templates/analysis_sector.toml"),
        ('sentiment', "../templates/analysis_sentiment.toml"),
    )

    analysis = dict(name = name)

    for key, template_dir in analyses:
        result = extract_relevent_and_prompt_llm(query_pipeline, template_dir, top_k = 10)

        # The helper returns a (text, parsed) tuple on success and a bare
        # string otherwise. Checking len() cannot tell these apart, because
        # a string also has a length and would be indexed character-wise.
        if isinstance(result, tuple):
            text, parsed = result
            analysis[key] = {'text' : text, 'obj' : parsed}
        else:
            analysis[key] = {'text' : result}

        time.sleep(with_delays)

    return analysis


def get_report_name(company_name, year, directory_path):
    """Find the stored report that best matches a company name and year.

    Parameters
    ----------
    company_name : str
        Company to look for; matched case-insensitively against file names.
    year : str or int
        Year that must appear in the file name.
    directory_path : str
        Directory to search (non-recursive); a trailing separator is optional.

    Returns
    -------
    str
        ``directory_path`` joined with the chosen file name. If exactly one
        file contains both the year and the company name, that file is
        returned; otherwise the candidate whose name is most similar to
        ``company_name`` wins.

    Raises
    ------
    FileNotFoundError
        If no file in ``directory_path`` contains ``year`` in its name.
    """
    year = str(year)
    company_key = company_name.lower()

    # Sorted so that ties in the fuzzy ranking resolve the same way on every run.
    all_files = sorted(
        f for f in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, f))
    )

    files = [f for f in all_files if year.lower() in f.lower()]

    files2 = [f for f in files if company_key in f.lower()]

    if len(files2) == 1:
        return os.path.join(directory_path, files2[0])

    # Several files mention the company: rank only those. None does: fall
    # back to every file of that year.
    candidates = files2 or files
    if not candidates:
        raise FileNotFoundError(
            f"No report for year {year!r} found in {directory_path!r}"
        )

    # Imported here because the notebook's top-level `from thefuzz import fuzz`
    # lives in a later cell than this definition.
    from thefuzz import fuzz

    report = max(candidates, key = lambda f : fuzz.ratio(f.lower(), company_key))
    return os.path.join(directory_path, report)

def ai_financial_assistant(company_name, year):
    """Run the full LLM analysis for a company's stored report of a given year.

    Parameters
    ----------
    company_name : str
        Company whose report should be analysed.
    year : str
        Report year, matched against stored file names.

    Returns
    -------
    dict
        The result of ``llm_pipeline`` for the selected report; its ``name``
        entry is the path of the document store that was loaded.
    """
    directory_path = '../data/doc_store/'

    report_path = get_report_name(company_name, year, directory_path)

    # Load the store that was resolved above (previously this read an
    # undefined name instead of the lookup result).
    document_store = InMemoryDocumentStore.load_from_disk(report_path)

    # Query side: embed the query text, then retrieve by embedding.
    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", AmazonBedrockTextEmbedder(model=embedder_model_id))
    query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    analysis = llm_pipeline(query_pipeline, report_path)

    return analysis

In [None]:
# Directory the serialized document stores are written to; it is the same
# location ai_financial_assistant reads from.
document_store_save_path = "../data/doc_store/"

report_names = []

# NOTE(review): the slice limits this run to the single entry at index 22 —
# confirm whether the full list should be processed.
for file_path in tqdm(pdf_files[22:23]):

    # Name the store "<parent folder>_<file name>".
    file_path_splits = file_path.split("/")
    report_name = file_path_splits[-2] + "_" + file_path_splits[-1]
    
    # Embed the PDF; the returned query pipeline is not used in this cell.
    query_pipeline, document_store = document_embedder_pipline(file_path, embedder_model_id)

    document_store.save_to_disk(f"{document_store_save_path}{report_name}.ds")
    report_names.append(report_name)

In [None]:
cache_file_path = "../data/cache.pkl"

document_store_save_path = "../data/doc_store/"

# Seconds to wait after the 1st, 2nd and 3rd failed attempt on a report.
retry_delays = (120, 600, 600)

cached_llm_results = []

for file_path in tqdm(pdf_files):

    # Name the report "<parent folder>_<file name>".
    file_path_splits = file_path.split("/")
    report_name = file_path_splits[-2] + "_" + file_path_splits[-1]

    query_pipeline, document_store = document_embedder_pipline(file_path, embedder_model_id)

    # Retry with increasing pauses. Only ordinary exceptions are retried, so
    # an interrupt from the user still stops the loop, and the actual error is
    # reported instead of assuming every failure is throttling.
    analysis = None
    for attempt, delay in enumerate(retry_delays, start=1):
        try:
            if attempt > 1:
                print(f"Attempt {attempt} started ...")
            analysis = llm_pipeline(query_pipeline, report_name)
            break
        except Exception as e:
            print(f"Attempt {attempt} failed ({type(e).__name__}: {e}) - sleeping for {delay} seconds")
            time.sleep(delay)

    if analysis is None:
        print(f"Ignoring report {report_name} for now")
        continue

    cached_llm_results.append(analysis)

    # Rewrite the cache after every report so progress survives a crash.
    with open(cache_file_path, "wb") as file:
        pickle.dump(cached_llm_results, file)

In [328]:
# Reload the cached analyses written by the caching loop.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# produced by this notebook.
with open(cache_file_path, "rb") as file:
    obj = pickle.load(file)

In [329]:
# Number of entries in the reloaded cache.
len(obj)

22

In [388]:
# pip install haystack-ai amazon-bedrock-haystack thefuzz
from thefuzz import fuzz

from haystack import Document as DocumentH
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_integrations.components.embedders.amazon_bedrock import (
    AmazonBedrockDocumentEmbedder,
    AmazonBedrockTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# SECURITY: AWS credentials must never be written into the notebook.
# The access key pair that used to be hardcoded here should be treated as
# compromised and rotated. Provide credentials from outside the notebook
# instead (environment variables set in the shell, a shared credentials
# profile, or an attached IAM role); boto3 picks them up automatically.
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

# Embedding model used to index and query report pages.
embedder_model_id = "amazon.titan-embed-text-v2:0"

In [392]:
def get_report_name(company_name, year, directory_path):
    """Find the stored report that best matches a company name and year.

    NOTE: a function with this name is also defined in an earlier cell; this
    later definition is the one in effect once the cell has run.

    Parameters
    ----------
    company_name : str
        Company to look for; matched case-insensitively against file names.
    year : str or int
        Year that must appear in the file name.
    directory_path : str
        Directory to search (non-recursive); a trailing separator is optional.

    Returns
    -------
    str
        ``directory_path`` joined with the chosen file name. If exactly one
        file contains both the year and the company name, that file is
        returned; otherwise the candidate whose name is most similar to
        ``company_name`` wins.

    Raises
    ------
    FileNotFoundError
        If no file in ``directory_path`` contains ``year`` in its name.
    """
    year = str(year)
    company_key = company_name.lower()

    # Sorted so that ties in the fuzzy ranking resolve the same way on every run.
    all_files = sorted(
        f for f in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, f))
    )

    files = [f for f in all_files if year.lower() in f.lower()]

    files2 = [f for f in files if company_key in f.lower()]

    if len(files2) == 1:
        return os.path.join(directory_path, files2[0])

    # Several files mention the company: rank only those. None does: fall
    # back to every file of that year.
    candidates = files2 or files
    if not candidates:
        raise FileNotFoundError(
            f"No report for year {year!r} found in {directory_path!r}"
        )

    # Imported locally so the function does not depend on cell execution order.
    from thefuzz import fuzz

    report = max(candidates, key = lambda f : fuzz.ratio(f.lower(), company_key))
    return os.path.join(directory_path, report)

def ai_financial_assistant(company_name, year):
    """Run the full LLM analysis for a company's stored report of a given year.

    NOTE: a function with this name is also defined in an earlier cell; this
    later definition is the one in effect once the cell has run.

    Parameters
    ----------
    company_name : str
        Company whose report should be analysed.
    year : str
        Report year, matched against stored file names.

    Returns
    -------
    dict
        The result of ``llm_pipeline`` for the selected report; its ``name``
        entry is the path of the document store that was loaded.
    """
    directory_path = '../data/doc_store/'

    report_path = get_report_name(company_name, year, directory_path)

    document_store = InMemoryDocumentStore.load_from_disk(report_path)

    # Query side: embed the query text, then retrieve by embedding.
    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", AmazonBedrockTextEmbedder(model=embedder_model_id))
    query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    # Label the analysis with the report resolved above, not with whatever a
    # previously run cell happened to leave in a global variable.
    analysis = llm_pipeline(query_pipeline, report_path)

    return analysis