In [102]:
import boto3
import numpy as np
from langchain_community.vectorstores import Chroma
import shutil
import sys
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import Chroma

from botocore.exceptions import ClientError
from langchain.prompts import ChatPromptTemplate

import json

import psycopg2
import warnings
import pickle

import uuid

import tomllib

from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

In [90]:
import glob
import os

# Root directory that holds the source PDF reports (relative to the notebook).
directory_path = '../data/docs'

# Collect every PDF below the root, at any depth. glob gives no ordering
# guarantee (it depends on the file system), so sort the paths to keep the
# processing order reproducible across runs and machines.
pdf_files = sorted(glob.glob(os.path.join(directory_path, '**', '*.pdf'), recursive=True))

In [98]:
def query_llm(conversation, client, model_id):
    """Send a conversation to a model via ``client.converse`` and return the reply text.

    Args:
        conversation: List of message dicts, forwarded unchanged as ``messages``.
        client: Object exposing a ``converse`` method (in this notebook, a
            boto3 ``bedrock-runtime`` client).
        model_id: Model identifier, forwarded as ``modelId``.

    Returns:
        The ``text`` field of the first content item of the response message.

    Raises:
        RuntimeError: If the call fails or the response does not have the
            ``output -> message -> content[0] -> text`` layout. The original
            exception is chained as ``__cause__``.
    """
    try:
        # Basic inference configuration: short answers (200 tokens max).
        response = client.converse(
            modelId=model_id,
            messages=conversation,
            inferenceConfig={"maxTokens": 200, "temperature": 1},
            additionalModelRequestFields={"top_k": 250, "top_p": 1},
        )
        return response["output"]["message"]["content"][0]["text"]
    except Exception as e:
        # botocore's ClientError is an Exception subclass, so one clause covers it.
        # Raise rather than call exit(1): ending the whole session from inside a
        # helper discards all in-memory work, whereas an exception lets the
        # caller decide how to react.
        raise RuntimeError(f"ERROR: Can't invoke '{model_id}'. Reason: {e}") from e

def extract_name_year(splits, client, model_id, n_page = 3):
    """Ask the model which company and financial year a report covers.

    The first and last ``n_page`` chunks of the document are joined into a
    context block and sent with a fixed question through ``query_llm``.

    Args:
        splits: Sequence of chunk objects exposing a ``page_content`` string.
        client: Client object forwarded to ``query_llm``.
        model_id: Model identifier forwarded to ``query_llm``.
        n_page: Number of chunks taken from each end of ``splits``. Values
            below 1 select no chunks.

    Returns:
        Whatever ``query_llm`` returns for the prompt. The prompt asks for a
        JSON string with the keys ``company`` and ``year``; the reply is not
        parsed or validated here.
    """
    # Clamp: with a count of 0 the tail slice splits[-0:] would select every chunk.
    n_page = max(n_page, 0)
    head = list(splits[:n_page])
    # Start the tail after the head so short documents (fewer than 2 * n_page
    # chunks) do not send the same chunk twice.
    tail_start = max(len(splits) - n_page, n_page)
    tail = list(splits[tail_start:])

    doc_identifier = "\n\n---\n\n".join([doc.page_content for doc in head + tail])

    prompt_text = f"""

    Question : What is the name of the company, what financial year this report is about?

    Output : only json string format with two keys (company, year)

    Error : if you could not find anything the values in json should be empty strings

    Context : {doc_identifier}

    """

    conversation = [
            {
                "role": "user",
                "content": [{"text": prompt_text}]  # Converse expects a list of content blocks
            }
        ]

    response = query_llm(conversation, client, model_id)
    return response

def add_name_year_tags(split_documents, info_dict):
    """Wrap each chunk in company/year tags and embed the result.

    Args:
        split_documents: Iterable of chunk objects exposing ``page_content``.
        info_dict: Dict with the keys ``company`` and ``year``. A missing key
            is treated as an empty string.

    Returns:
        A list with one dict per chunk, holding the keys ``company``, ``year``,
        ``embedding`` (result of ``embed_query`` on the stored content),
        ``content`` and ``record_id``. Chunks are tagged only when both company
        and year are truthy; otherwise ``content`` is the unmodified chunk text.
    """
    embedder = BedrockEmbeddings()
    modified_docs = []

    # Generated once per call, so every chunk returned by this call shares it.
    record_id = uuid.uuid4()

    # .get() keeps an incomplete model answer from raising KeyError.
    company = info_dict.get('company', '')
    year = info_dict.get('year', '')

    for doc in split_documents:
        if company and year:
            # NOTE(review): the tags are not closed with "</...>"; left as-is
            # because stored content or queries may rely on this exact format.
            modified_content = f"""
                <company> {company} <company>
                <year> {year} <year>

                {doc.page_content}

                <company> {company} <company>
                <year> {year} <year>
                """
        else:
            modified_content = doc.page_content

        modified_docs.append(dict(company = company, year = year, embedding = embedder.embed_query(modified_content), content = modified_content, record_id = record_id))

    return modified_docs

In [99]:
# Bedrock Runtime client bound to us-west-2; it is handed to the extraction helper below.
client = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")
# Identifier of the Claude 3 Haiku model used for the company/year extraction.
model_id = "anthropic.claude-3-haiku-20240307-v1:0"

In [100]:
# The splitter configuration never changes, so build it once rather than once per PDF.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=5000)

docs = []
for pdf_path in tqdm(pdf_files):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    split_documents = text_splitter.split_documents(documents)

    raw_info = extract_name_year(split_documents, client, model_id, n_page = 3)

    # The prompt asks for a bare JSON object, but the reply is free text and
    # may not parse. Fall back to empty company/year (chunks stay untagged)
    # instead of aborting the whole run on one malformed answer.
    try:
        info_dict = json.loads(raw_info)
    except (TypeError, ValueError) as err:
        print(f"WARNING: could not parse company/year for '{pdf_path}': {err}")
        info_dict = {}
    if not isinstance(info_dict, dict):
        info_dict = {}
    info_dict.setdefault("company", "")
    info_dict.setdefault("year", "")

    docs.extend(add_name_year_tags(split_documents, info_dict))

    # Rewritten after every PDF, so the file on disk always holds all results
    # collected so far even if a later document fails.
    with open("../data/database/db.pkl", "wb") as file:
        pickle.dump(docs, file = file)


  0%|          | 0/84 [00:00<?, ?it/s]

In [96]:
# Scratch check: display a freshly generated UUID4 rendered as a string.
str(uuid.uuid4())

'f666772f-f6cc-47eb-893b-c81375f50f0b'

In [101]:
import tomllib