In [2]:
import base64
import binascii
import uuid
import warnings
warnings.filterwarnings("ignore")

## Preprocessing

In [8]:
# `partition_pdf` is provided by the `unstructured.partition.pdf` submodule; it
# is not exported from the top-level `unstructured` package.
from unstructured.partition.pdf import partition_pdf

# Single source of truth for the document path. The original cell passed an
# empty string to `partition_pdf` and only assigned the real path afterwards.
pdf_file_path = 'Ikram-DS resume.pdf'
pdf_file_name = pdf_file_path  # kept so later cells that use this name still work

# Partition the PDF into title-based chunks, keeping table structure.
# NOTE(review): `new_after_n_chars` (1500) is larger than `max_characters`
# (1000); confirm the intended soft/hard chunk limits against the unstructured docs.
raw_elements = partition_pdf(
    filename=pdf_file_name,
    strategy='hi_res',
    infer_table_structure=True,
    chunking_strategy='by_title',
    max_characters=1000,
    new_after_n_chars=1500,
    combine_text_under_n_chars=250,
)

# NOTE(review): `extract_images_from_pdf` is not defined in the visible cells;
# presumably it comes from an earlier cell - confirm before running.
extract_images_from_pdf(pdf_file_path)

## Organize text from document

In [9]:
# Split the partitioned elements into table strings and text-chunk strings.
tables = []
texts = []

for element in raw_elements:
    # Elements are classified by their fully qualified type name, so no extra
    # import of the element classes is required here.
    element_type = str(type(element))
    if 'unstructured.documents.elements.Table' in element_type:
        tables.append(str(element))  # list method is `append`; `appends` raised AttributeError
    elif 'unstructured.documents.elements.CompositeElement' in element_type:
        texts.append(str(element))


## Use the Gemini API

In [10]:
import google.generativeai as genai
from dotenv import load_dotenv, find_dotenv
import os

# Locate the nearest .env file and load its variables into the environment.
_dotenv_path = find_dotenv()
_ = load_dotenv(_dotenv_path)

# The API key is read from the environment rather than hard-coded.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

## Generate text and table summaries

In [11]:
# Gemini model used for the text and table summaries.
MODEL_NAME = 'models/gemini-1.5-pro-latest'
# The class is `GenerativeModel` (as used in `generate_image_summaries`); the
# previous spelling `GenertiveModel` raised AttributeError.
model = genai.GenerativeModel(model_name=MODEL_NAME)


def make_prompt(element):
    """Build the summarization prompt for one text or table element.

    Args:
        element: The text or table content; it is formatted into the end of
            the prompt.

    Returns:
        A single-line prompt string (no newline characters) ending with
        ``Table or text: <element>``.
    """
    # The pieces keep the exact spacing of the original continued string
    # literal, including the indentation that followed each line continuation.
    instruction_parts = (
        " You are an assistant tasked with summarizing tables and text for retrieval. ",
        "    These summaries will be embedded and used to retrieve the raw text or table elements. ",
        "    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: ",
    )
    return "".join(instruction_parts) + f"{element}"


def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Summarize text and table elements for retrieval.

    Args:
        texts: List of str, the text chunks (None is treated as empty).
        tables: List of str, the table elements (None is treated as empty).
        summarize_texts: Bool. When True every text chunk is summarized with
            the module-level `model`; when False the raw texts are returned
            unchanged as their own summaries.

    Returns:
        Tuple ``(text_summaries, table_summaries)`` of two lists of str.
        ``text_summaries`` has one entry per item in ``texts`` and
        ``table_summaries`` one entry per item in ``tables``.
    """
    texts = texts or []
    tables = tables or []

    def _summarize(element):
        # One model call per element, using the shared retrieval prompt.
        return model.generate_content(make_prompt(element)).text

    if summarize_texts:
        text_summaries = [_summarize(text) for text in texts]
    else:
        # Return a copy so callers cannot mutate `texts` through the result.
        text_summaries = list(texts)

    # Tables are always summarized, independently of whether any texts exist.
    table_summaries = [_summarize(table) for table in tables]

    return text_summaries, table_summaries


In [12]:
# Texts are passed through unsummarized (summarize_texts defaults to False);
# tables are summarized by the model.
text_summaries, table_summaries = generate_text_summaries(texts, tables)

## Generate image summaries

In [13]:
def encode_image(image_path):
    """Encodes an image to a base64 string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 str.
    """
    with open(image_path, "rb") as image_file:
        # Attribute access on the `base64` module; the original used a comma
        # here, which built a tuple and failed on the bare name `b64encode`.
        return base64.b64encode(image_file.read()).decode("utf-8")

def generate_image_summaries(image_directory):
    """Generates summaries for images in the specified directory.

    Args:
        image_directory: Directory to scan. Only files whose name ends with
            ".png" are processed, in sorted filename order.

    Returns:
        Tuple ``(image_summaries, img_base64_list)`` of two lists with one
        entry per processed image: the model's text summary and the
        base64-encoded file contents, in the same order.
    """
    img_base64_list = [] # Store base64 encoded images
    image_summaries = [] # Store image summaries
    model = genai.GenerativeModel('gemini-1.5-pro-latest')
    prompt = """You are an automotive assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Describe concisely the characteristics (shape, color), but do not infer what the image means. \
    Only describe the characteristics of the image you see."""

    for filename in sorted(os.listdir(image_directory)):
        if not filename.endswith(".png"):
            continue
        image_path = os.path.join(image_directory, filename)
        img_base64_list.append(encode_image(image_path))
        with PIL.Image.open(image_path) as img:
            response = model.generate_content([prompt, img])
        # Append to `image_summaries`; the original line referenced an
        # undefined name `summaries` and never stored the result.
        image_summaries.append(response.text)

    return image_summaries, img_base64_list

In [14]:
# Directory containing the extracted images.
# NOTE(review): this is an empty placeholder; point it at the real image
# directory before running, since it is passed straight to os.listdir().
image_directory = ''
(image_summaries, img_base64_list) = generate_image_summaries(
    image_directory=image_directory
)

## Setup Vector Database

In [15]:
# Create a Tree-AH Vector Search index that accepts streaming upserts.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="rag_langchain_streaming_index",
    description="Multimodal RAG LangChain Stream Index",
    dimensions=768,  # NOTE(review): must match the embedding model's output size - confirm
    approximate_neighbors_count=150,
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
    # The SDK documents the accepted values as "STREAM_UPDATE" / "BATCH_UPDATE"
    # (upper case); the lower-case spelling is not one of them, while the vector
    # store below is configured with stream_update=True.
    index_update_method="STREAM_UPDATE",
)

In [16]:
# Deploy the index onto the endpoint and show what is deployed on it.
# NOTE(review): `index_endpoint` is read here before this cell assigns it, so
# it has to be created in an earlier cell that is not visible here - confirm.
index_endpoint = index_endpoint.deploy_index(
    index=index,
    deployed_index_id="rag_langchain_deployed_streaming_index",
)
index_endpoint.deployed_indexes

## Define a vector store with Langchain

In [17]:
# Vector store that indexes the summaries.
# NOTE(review): PROJECT_ID, LOCATION, GCS_BUCKET, index_id and endpoint_id are
# not defined in the visible cells; presumably set earlier - confirm.
vectorstore = VectorSearchVectorStore.from_components(
    project_id=PROJECT_ID,
    region=LOCATION,
    gcs_bucket_name=GCS_BUCKET,
    index_id=index_id,
    endpoint_id=endpoint_id,
    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    stream_update=True,
)

In [18]:
# In-memory store for the raw documents; `id_key` is the metadata field that
# links each summary in the vector store to its raw document.
id_key = "doc_id"
docstore = InMemoryStore()

# Multi-vector retriever over the summary vector store and the raw docstore.
retriever_multi_vector_img = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key,
)

In [19]:
# Combine raw document contents and, in the same order, the summaries that
# represent them in the vector store (raw texts serve as their own summaries).
doc_contents = texts + tables + img_base64_list
doc_summaries = texts + table_summaries + image_summaries

# Summaries and raw contents are paired by position, so a length mismatch
# would attach summaries to the wrong raw documents.
if len(doc_summaries) != len(doc_contents):
    raise ValueError(
        f"Expected one summary per document, got {len(doc_summaries)} summaries "
        f"for {len(doc_contents)} documents"
    )

doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, doc_summaries)
]

# Store the raw contents under their ids.
retriever_multi_vector_img.docstore.mset(list(zip(doc_ids, doc_contents)))
# Generate embeddings for all summaries and stream them to the vector store
retriever_multi_vector_img.vectorstore.add_documents(summary_docs)

## Stage 2 Q&A Pipeline

In [20]:
from langchain_google_genai import ChatGoogleGenerativeAI
#Create RAG chain

# RAG chain: retrieve documents and split them into images/texts, pass the
# question through unchanged, build the prompt, call the model, parse to str.
chain_multimodal_rag = (
    {
        "context": retriever_multi_vector_img | RunnableLambda(split_image_text_types),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(img_prompt_func)
    # Multi-modal LLM
    | ChatGoogleGenerativeAI(
        temperature=0, model="gemini-1.5-pro-latest", max_output_tokens=1024
    )
    # The LangChain parser class is `StrOutputParser`; `StroutputParser` is undefined.
    | StrOutputParser()
)

In [21]:
#Multimodal search function
def multimodal_search(query: str) -> str:
    """Performs a multimodal search for a given query.

    Retrieves the documents matching the query, prints the retrieved text
    chunks, displays the retrieved images, and then invokes the RAG chain to
    generate the answer.

    Args:
        query: The search query string.
    Returns:
        The final result generated by the chain.
    """
    # retriever_multi_vector_img: the retriever object for fetching relevant documents (images and text).
    docs = retriever_multi_vector_img.invoke(query, limit=10)
    # split_image_text_types: splits the fetched documents into the "texts" and "images" entries read below.
    source_docs = split_image_text_types(docs)
    print("-" * 80)

    print("Retrieved Text Sources:")
    print("=" * 80)
    for i, source in enumerate(source_docs["texts"]):
        source_without_linebreaks = source.replace("\n", "") # Remove line breaks
        # Number chunks from 1 using the loop index (the original printed the constant 1+1).
        print(f"Retrieved chunk {i+1}: {source_without_linebreaks}")
    for img_data in source_docs["images"]:
        try:
            print("\n")
            print('_'*80)
            print("\nRetrieved Images Matching Source Documents:")
            print("="*80)
            display(Image(base64.b64decode(img_data)))
        except (TypeError, binascii.Error):
            # A payload that cannot be decoded is skipped so the remaining
            # images and the final answer are still produced.
            print("Error decoding or displaying an image. Skipping...")
    # chain_multimodal_rag: the chain object for processing and generating a result.
    result = chain_multimodal_rag.invoke(query)
    print("\n\n")
    print("="*80)
    print("RAG Pipeline Summarized Answer:")
    print("="*80)

    return result
