# Packages

In [3]:
import boto3
import numpy as np
from langchain_community.vectorstores import Chroma
import shutil
import sys
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import Chroma

from botocore.exceptions import ClientError
from langchain.prompts import ChatPromptTemplate

import json

import psycopg2
import warnings
import pickle

import uuid
import time

import tomllib

from tqdm.notebook import tqdm

from haystack import Document as DocumentH
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_integrations.components.embedders.amazon_bedrock import (
    AmazonBedrockDocumentEmbedder,
    AmazonBedrockTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

warnings.filterwarnings("ignore")

# Helpers

In [4]:
def query_llm(conversation, client, model_id):
    try:
        # Send the message to the model, using a basic inference configuration
        response = client.converse(
                    modelId=model_id,
                    messages=conversation,
                    inferenceConfig={"maxTokens": 4096, "temperature": 0},
                    additionalModelRequestFields={"top_k": 250, "top_p": 1},
        )

        # Extract and print the response text
        return response["output"]["message"]["content"][0]["text"]

    except (ClientError, Exception) as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        exit(1)

def append_prompt(template_dir):
    with open(template_dir, "rb") as f:
        settings = tomllib.load(f)

    prompts = [settings['instruction_prompt'], 
               settings['task_prompt'], 
               settings['example_prompt'], 
               settings['reasoning_prompt']]
    return "\n".join(prompts)
    

def assemble_analysis_prompt(content, template_dir):
    with open(template_dir, "rb") as f:
        settings = tomllib.load(f)

    message = [
        {"role" : "user", "content" : [{"text" : settings['role_prompt']}, 
                                       {"text" : settings['task_prompt']},
                                       {"text" : settings['example_prompt']},
                                       {"text" : settings['reasoning_prompt']},
                                       {"text" : settings['output_prompt']},
                                       {"text" : f">>>>>\n{content}\n<<<<<"},
                                       {"text" : settings['instruction_prompt']}]}
    ]
    return message

# Analysor

In [5]:
import glob
import os

# Path to the directory
directory_path = '../data/docs'

# Find all PDF files recursively
pdf_files = glob.glob(os.path.join(directory_path, '**', '*.pdf'), recursive=True)


os.environ["AWS_ACCESS_KEY_ID"] = "AKIAZXNNZJEPQOQ6SCAT"
os.environ["AWS_SECRET_ACCESS_KEY"] = "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"
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

model_id = "anthropic.claude-3-haiku-20240307-v1:0"
embedder_model_id = "amazon.titan-embed-text-v2:0"


client = boto3.client("bedrock-runtime", region_name="us-west-2")

In [6]:
def document_embedder_pipline(file_path, embedder_model_id):

    document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

    loader = PyPDFLoader(file_path)
    documents = loader.load()

    documents = [DocumentH(content = d.page_content, meta = d.metadata) for d in documents]

    document_embedder = AmazonBedrockDocumentEmbedder(model=embedder_model_id, meta_fields_to_embed=["source"])
    documents_with_embeddings = document_embedder.run(documents)['documents']
    document_store.write_documents(documents_with_embeddings)

    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", AmazonBedrockTextEmbedder(model=embedder_model_id))
    query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    return query_pipeline, document_store

def extract_relevent_and_prompt_llm(query_pipeline, template_dir, top_k = 10):
    query = append_prompt(template_dir)

    result = query_pipeline.run({"text_embedder":{"text": query}})

    relevant_results = result['retriever']['documents'][:top_k]

    relevant_results = sorted(relevant_results, key = lambda x : x.meta['page'])

    content = "\n\n -------------------- \n\n".join([d.content for d in relevant_results])

    analysis_prompt = assemble_analysis_prompt(content, template_dir)
    response = query_llm(analysis_prompt, client, model_id)
    try:
        response_dict = json.loads(response) 
        response_dict['pages'] = [r.meta['page'] for r in relevant_results]

        return response, response_dict
    except:
        print("Invalid format returned by LLM")
        return response
    
def llm_pipeline(query_pipeline, name):
    analysis = dict(name = name)

    response1 = extract_relevent_and_prompt_llm(query_pipeline, "../templates/analysis_basic_indicators.toml", top_k = 10)
    analysis['basic'] = {'text' : response1} if len(response1) == 1 else {'text' : response1[0], 'obj' : response1[1]}
    time.sleep(20)

    response2 = extract_relevent_and_prompt_llm(query_pipeline, "../templates/analysis_sector.toml", top_k = 10)
    analysis['sectore'] = {'text' : response2} if len(response2) == 1 else {'text' : response2[0], 'obj' : response2[1]}
    time.sleep(20)

    response3 = extract_relevent_and_prompt_llm(query_pipeline, "../templates/analysis_sentiment.toml", top_k = 10)
    analysis['sentiment'] = {'text' : response3} if len(response3) == 1 else {'text' : response3[0], 'obj' : response3[1]}
    time.sleep(20)

    return analysis

# Save Document_stores

In [None]:
document_store_save_path = "../data/doc_store/"

report_names = []

for file_path in tqdm(pdf_files):

    file_path_splits = file_path.split("/")
    report_name = file_path_splits[-2] + "_" + file_path_splits[-1]
    
    query_pipeline, document_store = document_embedder_pipline(file_path, embedder_model_id)

    document_store.save_to_disk(f"{document_store_save_path}{report_name}.ds")
    report_names.append(report_name)

  0%|          | 0/84 [00:00<?, ?it/s]

Creating embeddings: 100%|██████████| 208/208 [00:45<00:00,  4.54it/s]
Creating embeddings: 100%|██████████| 208/208 [00:42<00:00,  4.86it/s]
Creating embeddings: 100%|██████████| 216/216 [00:43<00:00,  4.93it/s]
Creating embeddings: 100%|██████████| 196/196 [00:38<00:00,  5.06it/s]
Creating embeddings: 100%|██████████| 196/196 [00:36<00:00,  5.31it/s]


# Load Document_stores and extract relevant chunks and pass to LLM

In [12]:
cache_file_path = "../data/cache.pkl"

document_store_save_path = "../data/doc_store/"

with open(cache_file_path, "rb") as file:
    cached_llm_results = pickle.load(file)

for file_path in tqdm(pdf_files[23:]):

    file_path_splits = file_path.split("/")
    report_name = file_path_splits[-2] + "_" + file_path_splits[-1]
    
    document_store = InMemoryDocumentStore.load_from_disk(f"{document_store_save_path}{report_name}.ds")

    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", AmazonBedrockTextEmbedder(model=embedder_model_id))
    query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    try:
        analysis = llm_pipeline(query_pipeline, report_name)
    except Exception as e:
        print("First try lead to Throttling error - sleeping for 120 seconds")
        print(e)
        time.sleep(120)
        try:
            print("Second try started ...")
            analysis = llm_pipeline(query_pipeline, report_name)
        except Exception as e:
            print("Second try lead to Throttling error as well - sleeping for 600 seconds")
            time.sleep(600)
            try:
                print("Third try started ...")
                analysis = llm_pipeline(query_pipeline, report_name)
            except:
                print("Third try lead to Throttling error as well")
                print(f"Ignoring report {report_name} for now and sleeping for 600 seconds")
                time.sleep(600)
                continue

    cached_llm_results.append(analysis)

    with open(cache_file_path, "wb") as file:
        pickle.dump(cached_llm_results, file)

  0%|          | 0/61 [00:00<?, ?it/s]

Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
Invalid format returned by LLM
ERROR: Can't invoke 'anthropic.claude-3-haiku-20240307-v1:0'. Reason: An error occurred (ThrottlingException) when calling the Converse operation (reached max retries: 4): Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
Invalid format returned by LLM
First try lead to Throttling error - sleeping for 120 seconds
object of type 'NoneType' has no len()
Second try started ...
Invalid format returned by LLM
Invalid format returned by LLM
Inv

FileNotFoundError: File ../data/doc_store/CPKC_CP_AnnualReport_2022.pdf.ds not found.