### Importing the Libraries

In [1]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever,EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda,RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.retrievers.document_compressors import LLMChainExtractor
from azure.search.documents.indexes.models import (SearchableField,SearchField,SearchFieldDataType,SimpleField)
from dotenv import load_dotenv

In [148]:
# Load enviornment variables
load_dotenv()

True

In [146]:
# OpenAI settings
openai_api_key: str = os.environ.get("OPENAI_KEY")
openai_api_version: str = "2023-05-15"

In [147]:
# Azure AI Search Settings
vector_store_address: str = os.environ.get("SEARCH_SERVICE")
vector_store_password: str = os.environ.get("AZURE_SEARCH_KEY")
index_name:str = "langchain-vector-demo"

## Setup LangSmith

In [149]:

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Langchain Test"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

In [150]:
from langsmith import Client

client = Client()

## Document Loaders

In [153]:
# PyPDF loader for parsing PDF files
loader = PyPDFLoader(r'Books\yolov7.pdf')
pages = loader.load()

In [154]:
print(pages[0])

page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and many othe

## Semantic Chunking

In [155]:
# Embedding Model Details
model_name = "BAAI/bge-small-en"
model_kwargs = {"device":"cpu"}
encode_kwargs = {"normalize_embeddings":True}

hf = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs,encode_kwargs=encode_kwargs)

In [156]:
embedding_test = hf.embed_query("Hi this is krish")
len(embedding_test)

384

In [157]:
# Create Semantic Chunker
text_splitter = SemanticChunker(hf)
 

In [158]:
page_content = list(map(lambda x: x.page_content,pages))
metadata = list(map(lambda x: x.metadata,pages))

In [159]:

docs = text_splitter.create_documents(page_content)
print(docs[0].page_content)

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
kinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw
Abstract
YOLOv7 surpasses all known object detectors in both
speed and accuracy in the range from 5 FPS to 160 FPS
and has the highest accuracy 56.8% AP among all known
real-time object detectors with 30 FPS or higher on GPU
V100. YOLOv7-E6 object detector (56 FPS V100, 55.9%
AP) outperforms both transformer-based detector SWIN-
L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by
509% in speed and 2% in accuracy, and convolutional-
based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6
FPS A100, 55.2% AP) by 551% in speed and 0.7% AP
in accuracy, as well as YOLOv7 outperforms: YOLOR,
YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable
DETR, DINO-5scale-R50, ViT-Adapter-B and many other
object detectors in speed and 

## Build Vector Store

In [161]:
embedding_function = hf.embed_query

# Define fields for the Azure AI Search Index
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

In [162]:
# Instatiate the vector store client
vector_store:AzureSearch = AzureSearch(azure_search_endpoint=vector_store_address,azure_search_key=vector_store_password,index_name=index_name,embedding_function=hf.embed_query,fields=fields)

In [163]:
vector_store.add_documents(documents=docs)

['YzY4NmZhMWYtNDEyMy00Njg4LTkwNTUtNzI3YTg2NTE1OGZk',
 'OTkzNzk0Y2EtYmIyMC00ZDE5LWFjNjItYzRjNDQyM2QxM2M1',
 'MmRmZTVmNDUtZDVjOS00ZGI5LTk5ZGEtNTgyYWQ1ZTEwOGQz',
 'ZTFjMTQzOWQtZmUzYS00ZmZhLWI2OGItYjcyYTFmNWI3OTMz',
 'ZDVmN2JhYWQtNDg2OC00ZmM5LTk4M2UtNjZhZjA0NmExYjZm',
 'ZDlhNGUyOTktNTM3NS00YjViLTg5OGQtNWExY2NlMDA0ZTEw',
 'NzU0ODY0ZjctOTcwNS00YTQ0LWJmYzAtNjg0MGQ0ZTk4NzY3',
 'NWFiZGRhNTQtMTM3Yi00NWVkLThjNjItZGY1MTBhYmNiMjA5',
 'NTZjZmE5ZTAtNWU1NS00ZmY4LTgzMGMtYTQ4YmFjOGM3ZTQz',
 'N2ZlOTIzNzYtZjAxMy00ZmY5LWJlZDEtMGJjOThiZDAxMGFk',
 'OGIyN2IwMDgtOTlhYi00YzA2LTllOTktODFmMzUxZjUwYmIw',
 'NDZiMjIwNWItNTVhYy00MzMwLTkzYWItMjA2NjMzMWVkMGU1',
 'OGYyMGY5ZDctNGQ4ZC00MzQ3LWE0MWEtOTkxMDU0ZmQwYTc0',
 'Y2Y3YjQ3MjctZTBiMS00M2YzLWE3ZDctYTUwY2M5ZGE2YmQ3',
 'NTIzYzRiYWEtMmI3Ny00NTMzLThkNTAtZmU0Zjg0YmEyODRm',
 'OWM4NjM5MDctZGRjZi00NTI1LWI3NjUtNWY3MjgyMWJlMTFj',
 'ZTRjNGM2MjgtYmFlNS00MWY3LTk3NWMtNGM0MzVlZjdhMDUz',
 'YzM5OGRkNDEtYjJjOC00NGM0LTk2OTMtNzYxNTFmNDdiYWI2',
 'MzBiMjI0YWUtNjc0My00YTZjLTkwYWUtOWEwZWQ1YTI4

In [172]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" +
                d.page_content for i, d in enumerate(docs)]
        )
    )

In [174]:
# A test Query on the vector store
res = vector_store.similarity_search_with_relevance_scores(
 query="What is trainable bag of features?",
 k = 3,
 score_threshold=0.7
)
print(res)

[(Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan'), 0.88353026), (Document(page_content='We call the proposed modules and optimization methods\ntrainable bag-of-freebies.\n1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022'), 0.8826691), (Document(page_content='problem, we propose the trainable bag-of-freebies method\nto enhance the accuracy of object detection. Based on the\nabove, we have developed the YOLOv7 series of object de-\ntection systems, which receives the state-of-the-art results.\n7. Acknowledgements'), 0.87484545)]


## Retriever Model

#### MultiQuery Retriever

In [165]:
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
 retriever=vector_store.as_retriever(),llm=llm
)

In [166]:
unique_docs = retriever_from_llm.get_relevant_documents( query="What is trainable bag of features?")

In [176]:
pretty_print_docs(unique_docs)

Document 1:

We call the proposed modules and optimization methods
trainable bag-of-freebies.
1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022
----------------------------------------------------------------------------------------------------
Document 2:

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
----------------------------------------------------------------------------------------------------
Document 3:

can maintain the properties that the model had at the initial
design and maintains the optimal structure.
4. Trainable bag-of-freebies
4.1. Planned re-parameterized convolution
Although RepConv [13] has achieved excellent perfor-
----------------------------------------------------------------------------------------------------
Document 4:

problem, we propose the trainable bag-of-freebies method
to enhance the accuracy 

#### Contextual Compression Retriever

In [168]:
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,base_retriever=vector_store.as_retriever())


In [169]:
compressed_docs = compression_retriever.get_relevant_documents( query="What is trainable bag of features?")



In [175]:
pretty_print_docs(compressed_docs)

Document 1:

Trainable bag-of-freebies
----------------------------------------------------------------------------------------------------
Document 2:

trainable bag-of-freebies.
----------------------------------------------------------------------------------------------------
Document 3:

trainable bag-of-freebies method
----------------------------------------------------------------------------------------------------
Document 4:

trainable bag-of-freebies method


#### Ensemble Retriever 

In [177]:
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_from_llm,compression_retriever],weights=[0.5,0.5])

In [178]:
docs = ensemble_retriever.invoke("What is trainable bag of features?")



In [179]:
pretty_print_docs(docs)

Document 1:

Trainable bag-of-freebies
----------------------------------------------------------------------------------------------------
Document 2:

We call the proposed modules and optimization methods
trainable bag-of-freebies.
1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022
----------------------------------------------------------------------------------------------------
Document 3:

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
----------------------------------------------------------------------------------------------------
Document 4:

trainable bag-of-freebies.
----------------------------------------------------------------------------------------------------
Document 5:

self-supervised learning or knowledge distillation methods
that require additional data or large model. Instead, we will
design new trainable ba

## Model Generation

#### Setting up the Tree-of-Thoughts Approach

In [181]:

template= """
Imagine three different experts are answering this question.
All experts will write down 1 step of their thinking,
then share it with the group.
Then all experts will go on to the next step, etc.
If any expert realises they're wrong at any point then they leave.
The question is {question}. Answer it using only the context {context}
A:
"""
prompt = PromptTemplate(
 input_variables=["question","context"],
 template=template
)

In [182]:
template2 = """
Combine three different answers to provide the best solution out of all of them {answers}
A:
"""
prompt2 = PromptTemplate(
    input_variables=["answers"],
    template=template2
)

In [183]:
model= ChatOpenAI(model="gpt-4")
model_parser = model | StrOutputParser()

In [184]:
def retriever_content(docs):
 return list(map(lambda x: x.page_content,docs))

In [185]:
chain = (
 {"context":ensemble_retriever |RunnableLambda(retriever_content),"question":RunnablePassthrough()} |prompt| {"answers":model_parser} | prompt2 | {"answer":model_parser}
)

In [186]:
chain.invoke("Describe the model arch of Yolov7?")

