In [151]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdf_path = "./2307.06435v9.pdf"
pdf_loader = PyPDFLoader(pdf_path)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
text_docs = text_splitter.split_documents(pdf_loader.load())l

In [152]:
from langchain_community.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()
embeddings = OpenAIEmbeddings( model="text-embedding-3-small")
vector_store =  Chroma.from_documents(text_docs, embeddings)

In [153]:
# import torch
# from transformers import CLIPProcessor, CLIPModel


# model_name="openai/clip-vit-base-patch16"
# model = CLIPModel.from_pretrained(model_name)
# processor = CLIPProcessor.from_pretrained(model_name)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)





# import numpy as np
# def find_top_matches(query_embedding, embeddings, top_k=3):
#     # Calculate cosine similarities
#     similarities = np.array([np.dot(query_embedding, emb) / (np.linalg.norm(query_embedding) * np.linalg.norm(emb)) for emb in embeddings])
#     # Get indices of top matches
#     top_indices = np.argsort(similarities)[-3:][::-1]
#     # Get top_k similarities
#     top_similarities = similarities[top_indices]
#     return list(zip(top_indices, top_similarities))

In [154]:
import numpy as np
import fitz
import io
from PIL import Image
def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            images.append(image)
    return images

In [155]:
extracted_images = extract_images_from_pdf(pdf_path)

In [156]:
from langchain_experimental.open_clip import OpenCLIPEmbeddings
# import open_clip
# print(open_clip.list_pretrained())
openclip_embeddings = OpenCLIPEmbeddings(model_name="ViT-B-16",checkpoint="openai")
in_memory_images = extracted_images
img_features = [openclip_embeddings.preprocess(image).unsqueeze(0) for image in in_memory_images]
img_vector_store = np.array([openclip_embeddings.model.encode_image(feature).detach().numpy().squeeze(0) for feature in img_features])


In [157]:
from PIL import Image
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def find_top_matches(text_features_np, img_vector_store, top_k=3):
    similarity = cosine_similarity(text_features_np, img_vector_store)
    similarity = similarity.squeeze(0)
    top_indices = np.argsort(similarity)[-3:][::-1]
    top_scores = similarity[top_indices]
    top_images_scores = list(zip(top_indices, top_scores))
    return top_images_scores


In [166]:
# query = "This section reviews LLMs, briefly describing their architectures, training objectives, pipelines, datasets, and fine-tuning details?"
# query="microsoft logo"
query = "microsoft and the paper about LLms and all research paper about ?"
text_features = openclip_embeddings.embed_documents([query])
text_features_np = np.array(text_features)


top_3_images_indices_with_scores = find_top_matches(text_features_np, img_features_np, top_k=3)

print(top_3_images_indices_with_scores)

top_3_texts = vector_store.similarity_search_with_relevance_scores(query, k=5)

print(top_3_texts)
for txt in top_3_texts: print(txt[1])

[(0, 0.27710183224840196), (7, 0.26018049791447523), (35, 0.24978073631841577)]
[(Document(page_content='beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', metadata={'page': 0, 'source': './2307.06435v9.pdf'}), 0.3075905835324745), (Document(page_content='beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', metadata={'pa

In [162]:
import numpy
from langchain.chat_models.openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
import base64
from io import BytesIO

combined_results = top_3_images_indices_with_scores + top_3_texts
# print(combined_results)
combined_results.sort(key=lambda x: x[1], reverse=True)
print(combined_results)
top_3_results = combined_results[:3]
print(f'top 3 results after reranking : {top_3_results}')

vision_model = ChatOpenAI(temperature=0.5, model="gpt-4-vision-preview", max_tokens=1024)

final_results_that_will_be_feeded_to_gpt4 = []

for i,result in enumerate(top_3_results):
    # print(type(result[0]))
    if isinstance(result[0], numpy.int64):
        image = extracted_images[result[0]]
        # images[result[0]].show()
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        # send the image to gpt 4 vision
        out = vision_model.invoke([SystemMessage(content=f'You should extract any relevant information to this query : {query} from the image')
            ,HumanMessage(content=[
            {"type": "image", "image": img_str}
        ])])
        print(out.content)
        final_results_that_will_be_feeded_to_gpt4.append(out.content)
        
    else:
        print(result[0].page_content)   
        final_results_that_will_be_feeded_to_gpt4.append(result[0].page_content)

[(Document(page_content='beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', metadata={'page': 0, 'source': './2307.06435v9.pdf'}), 0.3075905835324745), (Document(page_content='beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', metadata={'page': 0, 'source': './2307.06435v9.pdf'}), 0.3075905835324745), (Document(page_co

In [168]:
print(final_results_that_will_be_feeded_to_gpt4)
results_str = "\n".join(final_results_that_will_be_feeded_to_gpt4)
# send the final results to gpt 4
gpt4_model = ChatOpenAI(temperature=0, model="gpt-4o", max_tokens=1024)
out = gpt4_model.invoke([SystemMessage(content=f'use these contexts to respond to the user query {results_str}'),HumanMessage(content=query)])
print(out.content)

['beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', 'beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\nrobotics, datasets, benchmarking, e fficiency, and more. With the rapid development of techniques and regular breakthroughs in', 'beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\ntopics such as architectural innovations, better training strategies, context lengt