In [18]:
# Install necessary libraries
!pip install pytesseract Pillow requests langchain_community langchain_core langchain_huggingface pytesseract langchain PIL 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [37]:
import os

import pytesseract
from PIL import Image
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder
)
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

In [38]:
# Run OCR on the income-statement image and persist the recognised text.
image_path = './data/test/is.jpg'

# Open the image in a context manager so the underlying file handle is
# released as soon as Tesseract has produced the text.
with Image.open(image_path) as image:
    extracted_text = pytesseract.image_to_string(image)

# Save the extracted text to a text file.
# The encoding is explicit because the loading cell reads *.txt files back with
# encoding='utf-8'; relying on the platform default could make that read fail
# on systems whose default encoding is not UTF-8.
output_text_file = './data/test/extracted_text.txt'
with open(output_text_file, 'w', encoding='utf-8') as file:
    file.write(extracted_text)

print(f"Extracted text has been saved to {output_text_file}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Extracted text has been saved to ./data/test/extracted_text.txt


In [39]:
# Name of the Ollama model used by the chat model below.
# Alternatives that were tried: 'gemma', 'llama3.1', 'mistral'.
local_llm = "llama3"

In [40]:
# Directories scanned for text files to index
isParserData = "./data/test"
directories = [isParserData]

# Collect every .txt file in the configured directories.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# document (and therefore chunk) order reproducible between runs.
txt_file_paths = [
    os.path.join(directory, filename)
    for directory in directories
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".txt")
]

# Load each text file into a Document; unreadable files are reported and skipped
docs_list = []
for txt_path in txt_file_paths:
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            txt_content = file.read()
    except (OSError, UnicodeError) as e:
        print(f"Error loading {txt_path}: {e}")
        continue
    docs_list.append(Document(page_content=txt_content, metadata={"source": txt_path}))

# Split documents into chunks of at most 250 tokens with no overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=0)
doc_splitter = text_splitter.split_documents(docs_list)

# Rebuild each chunk keeping only metadata values of primitive type
# (str, int, float, bool); other value types are dropped.
filtered_doc = [
    Document(
        page_content=doc.page_content,
        metadata={k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))},
    )
    for doc in doc_splitter
]

# Fail early with a clear message instead of handing an empty list to the
# vector store (e.g. no .txt files found, or OCR produced no text).
if not filtered_doc:
    raise ValueError(f"No text chunks to index; checked directories: {directories}")

# Embed the chunks and add them to the Chroma collection
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=filtered_doc,
    collection_name="rag-chroma",
    embedding=embedding,
)

retriever = vectorstore.as_retriever()

In [22]:
# Chat model wrapper for the model named in `local_llm`, with temperature set to 0.
llm = ChatOllama(temperature=0, model=local_llm)


In [24]:
# System instruction asking the model to rewrite the latest user question so
# that it is understandable without the preceding chat history.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, then the prior turns, then the new question.
contextualize_q_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_q_messages)

# Retriever built from the LLM, the base retriever and the rewrite prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [25]:
# System prompt for the answering step; {context} receives the retrieved
# documents and {input} the user's question.
# NOTE(review): the leading "system" and trailing "assistant" words look like
# leftovers from a raw chat template, and {input} is also sent again as the
# human message below - confirm whether this is intentional.
qa_system_prompt = """system You are an assistant for question-answering tasks. Use the following context to answer the question. Avoid phrases like "Based on the provided context". Explain the answer in the end. and make a heading with paragraph.
Question: {input}
Context: {context}
Answer: assistant"""

# Prompt layout: system instruction, prior turns, then the current question.
qa_messages = [
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Chain that passes the retrieved documents to the LLM via qa_prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full RAG pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [26]:
# Conversation so far; starts empty and is extended after each question.
chat_history = list()

In [27]:
# Ask the first question against the RAG chain with the current history.
question = "can you analyze this data and explain"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record both sides of the turn as typed messages. The answer must be wrapped
# in AIMessage: a bare string in the history is coerced by LangChain into a
# human message, which would mislabel the assistant's reply on later turns.
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


In [28]:
# Show the generated answer text; print() renders its line breaks as-is
# (a bare expression would display the escaped string repr instead).
print(ai_msg_1["answer"])


**Financial Analysis of a Company**

The provided financial data is for the year ended September 28, 2019. Here's an analysis of the key figures:

**Revenue and Gross Profit**
The company reported net sales of $4,358,100, with a gross profit of $1,619,386. This indicates that the company has a strong revenue stream and is able to maintain a decent margin on its products or services.

**Operating Expenses**
Total operating expenses stood at $854,159, which includes selling and operating expenses ($560,430) and general and administrative expenses ($293,729). This suggests that the company has a significant expense base, which may impact its profitability.

**Operating Income and Other Income**
The company reported an operating income of $765,227, indicating that it was able to generate profits from its core operations. Additionally, it had other income of $960, which contributed to its overall profitability.

**Non-Operating Items**
The company recorded a gain on financial instruments of