In [None]:
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract

# Point pytesseract at the Tesseract executable (Windows install location).
# The literal is a raw string, so backslashes are already literal and must not
# be doubled -- doubling them puts `\\` separators into the configured path.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# The PDF is looked up in the current working directory; `output_path` is the
# directory handed to partition_pdf via `image_output_dir_path`.
input_path = os.getcwd()
output_path = os.path.join(input_path, "output")

# Get elements
# Partition the PDF with image extraction and table-structure inference
# enabled, chunked with the "by_title" strategy and the size limits below.
raw_pdf_elements = partition_pdf(
    filename=os.path.join(input_path, "visual instruction tuning.pdf"),
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=output_path,
)

In [None]:
# Sort the partitioned elements into two buckets by matching on the textual
# name of each element's type; anything that is neither a CompositeElement
# nor a Table-like element is ignored.
composite_chunks = []
table_chunks = []

for chunk in raw_pdf_elements:
    type_repr = str(type(chunk))
    if 'CompositeElement' in type_repr:
        bucket = composite_chunks
    elif 'Table' in type_repr:
        bucket = table_chunks
    else:
        continue
    bucket.append(chunk)
    print(chunk)

# From here on only the plain text of each element is kept.
table_elements = [chunk.text for chunk in table_chunks]
text_elements = [chunk.text for chunk in composite_chunks]

# Tables
print(len(table_elements))

# Text
print(len(text_elements))

In [None]:
import base64
# Base64-encoded images collected by the loop further down in this cell.
image_elements = []
# Directory holding the images extracted from the PDF. Resolved relative to
# the working directory (like the paths in the first cell) instead of a
# user-specific absolute path, so the notebook runs on other machines.
# NOTE(review): assumes the notebook is started from the project root, where
# the extracted images were written to a "figures" sub-folder -- confirm.
output_path = os.path.join(os.getcwd(), "figures")

# Helper: file on disk -> base64 text
def encode_image(image_path):
    """Read the file at `image_path` in binary mode and return its contents
    base64-encoded as a UTF-8 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

# Encode every image found in `output_path`.
# os.listdir returns names in arbitrary order, so the listing is sorted to keep
# the order of `image_elements` (and of everything derived from it by index)
# reproducible between runs. The extension test is case-insensitive so files
# such as "figure.PNG" are not silently skipped.
image_file_names = sorted(
    name
    for name in os.listdir(output_path)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)
for image_file in image_file_names:
    image_path = os.path.join(output_path, image_file)
    image_elements.append(encode_image(image_path))
print(len(image_elements))

In [None]:
import os
from langchain_openai import ChatOpenAI
from langchain_community.llms import openai
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.schema.messages import HumanMessage
from dotenv import load_dotenv

# Load environment variables
# load_dotenv() makes the entries of a local .env file available through
# os.environ / os.getenv.
load_dotenv()
# The previous `openai.api_key = ...` assignment was dropped: the name `openai`
# imported in this cell is the `langchain_community.llms.openai` module, not the
# OpenAI SDK, so the assignment configured nothing. ChatOpenAI and
# OpenAIEmbeddings pick up OPENAI_API_KEY from the environment on their own.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Chat models, each capped at 1024 output tokens.
# chain_gpt is not referenced by the cells visible in this notebook;
# chain_gemini_pro backs the text/table summaries and chain_gemini_vision the
# image summaries.
chain_gpt = ChatOpenAI(model="gpt-3.5-turbo", max_tokens=1024)
chain_gemini_pro = ChatGoogleGenerativeAI(model="gemini-pro",max_output_tokens=1024)
chain_gemini_vision = ChatGoogleGenerativeAI(model="gemini-pro-vision",max_output_tokens=1024)

# Text chunk -> summary string
def summarize_text(text_element):
    """Ask `chain_gemini_pro` to summarize `text_element` and return the
    content of the model's reply."""
    request = HumanMessage(
        content=f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    )
    return chain_gemini_pro.invoke([request]).content

# Table text -> summary string
def summarize_table(table_element):
    """Ask `chain_gemini_pro` to summarize `table_element` and return the
    content of the model's reply."""
    request = HumanMessage(
        content=f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    )
    return chain_gemini_pro.invoke([request]).content

# Function for image summaries
def summarize_image(encoded_image, mime_type="image/jpeg"):
    """Describe a base64-encoded image with the Gemini vision model.

    Args:
        encoded_image: Base64 text of the image bytes (as produced by
            `encode_image`).
        mime_type: Media type written into the data URL. Defaults to
            "image/jpeg", which is what was previously hard-coded; pass
            "image/png" for PNG files so the payload is labelled correctly.

    Returns:
        The `content` of the model's response.
    """
    # The image travels inline as a data URL next to the text instruction.
    data_url = f"data:{mime_type};base64,{encoded_image}"
    prompt = HumanMessage(
        content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": data_url
                },
            },
        ]
    )
    response = chain_gemini_vision.invoke([prompt])
    return response.content

In [None]:
# Summarize every table, printing a progress line and the summary as we go
table_summaries = []
for position, table_text in enumerate(table_elements, start=1):
    table_summary = summarize_table(table_text)
    table_summaries.append(table_summary)
    print(f"{position}th element of tables processed.")
    print(table_summary)

In [None]:
# Summarize every text chunk, printing a progress line and the summary as we go
text_summaries = []
for position, chunk_text in enumerate(text_elements, start=1):
    text_summary = summarize_text(chunk_text)
    text_summaries.append(text_summary)
    print(f"{position}th element of texts processed.")
    print(text_summary)

In [None]:
# Describe every encoded image, printing a progress line and the summary as we go
image_summaries = []
for position, image_b64 in enumerate(image_elements, start=1):
    image_summary = summarize_image(image_b64)
    image_summaries.append(image_summary)
    print(f"{position}th element of images processed.")
    print(image_summary)

In [None]:
import uuid

from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma



# Storage layers for the multi-vector retriever:
#   - `vectorstore`: Chroma collection "summaries", embedded with OpenAIEmbeddings
#   - `store`: in-memory docstore
#   - `id_key`: metadata key name passed to the retriever and written onto each
#     summary document when it is indexed
id_key = "doc_id"
store = InMemoryStore()
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="summaries",
)

# Wire both layers together
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    """Index `summaries` in the vector store and file the matching originals
    in the docstore under the same generated ids.

    Args:
        summaries: Iterable of summary strings; each becomes a `Document`
            whose metadata carries a fresh UUID under `id_key`.
        original_contents: Iterable of the original items, positionally
            aligned with `summaries`.

    Raises:
        ValueError: If the two inputs differ in length. A plain `zip` would
            silently drop the unmatched tail and leave summaries without an
            original (or originals that are never stored).
    """
    # Materialize once so generators work and lengths can be compared.
    summaries = list(summaries)
    original_contents = list(original_contents)
    if len(summaries) != len(original_contents):
        raise ValueError(
            f"Got {len(summaries)} summaries but "
            f"{len(original_contents)} original contents; they must pair up one-to-one."
        )
    if not summaries:
        # Nothing to index (e.g. a PDF without tables); skip the store calls
        # rather than sending them an empty batch.
        return

    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

In [None]:
# Register each (summaries, originals) pair with the retriever, in order:
# texts, then tables, then images.
for summaries, originals in (
    (text_summaries, text_elements),
    (table_summaries, table_elements),
    # Images are stored as their own summaries for now -- hopefully real images soon
    (image_summaries, image_summaries),
):
    add_documents_to_retriever(summaries, originals)

In [None]:
# We can retrieve this table
# Query text fixed ("fast east" -> "far east", stray leading space removed) so
# the query that gets embedded matches the wording it is meant to look up.
# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`; it is the same interface the chain below relies on.
retriever.invoke(
    "what is the market share of company in far east in 2020?"
)

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

# Prompt: the answer must come only from the retrieved context.
template = (
    "Answer the question based only on the following context, "
    "which can include text, images and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# The incoming question is passed through unchanged and also sent to the
# retriever, whose results fill {context}; the model reply is parsed to a string.
retrieval_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = retrieval_inputs | prompt | model | StrOutputParser()

In [None]:
# Demo question: the table text is pasted into the question itself, in
# addition to whatever the retriever contributes as context.
market_share_question = """what is the market share of company total in 2016?
     use the following information to answer the question:
'Year 2015 2016 2017 2018 2019 2020 2021 2022 2023 Americas 32% 32% 34% 33% 31% 35% 35% 35% 34% EME Asia 36% 36% 38% 37% 35% 39% 39% 39% 38% 29% 29% 31% 30% 28% 32% 32% 32% 31% Far East Total 32% 32% 34% 33% 31% 35% 35% 35% 34% 32% 32% 34% 33% 31% 35% 35% 35% 34%',
 'Year 2015 2016 2017 2018 2019 2020 2021 2022 2023 Americas 32% 32% 34% 33% 31% 35% 35% 35% 34% EME Asia 36% 36% 38% 37% 35% 39% 39% 39% 38% 29% 29% 31% 30% 28% 32% 32% 32% 31% Far East Total 32% 32% 34% 33% 31% 35% 35% 35% 34% 32% 32% 34% 33% 31% 35% 35% 35% 34%',
 'Company Market Share',
 'Company Market Share' """
chain.invoke(market_share_question)