### Importing the Libraries

In [1]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever,EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda,RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.retrievers.document_compressors import LLMChainExtractor
from azure.search.documents.indexes.models import (SearchableField,SearchField,SearchFieldDataType,SimpleField)
from dotenv import load_dotenv

In [29]:
# Load enviornment variables
load_dotenv()

True

In [3]:
# OpenAI settings
openai_api_key: str = os.environ.get("OPENAI_KEY")
openai_api_version: str = "2023-05-15"

In [4]:
# Azure AI Search Settings
vector_store_address: str = os.environ.get("SEARCH_SERVICE")
vector_store_password: str = os.environ.get("AZURE_SEARCH_KEY")
index_name:str = "langchain-vector-demo"

## Setup LangSmith

In [5]:

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Langchain Test"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

In [6]:
from langsmith import Client

client = Client()

In [7]:
from langchain_benchmarks import clone_public_dataset, registry


langchain_docs = registry['LangChain Docs Q&A']
clone_public_dataset(langchain_docs.dataset_id,dataset_name=langchain_docs.name)

Dataset LangChain Docs Q&A already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/23802376-5d39-5c79-95a7-fc7c9d4cba86/datasets/96885c1f-33a5-4bda-ba87-abad69831234.


## Document Loaders

In [None]:
# PyPDF loader for parsing PDF files
loader = PyPDFLoader(r'Books\yolov7.pdf')
pages = loader.load()


In [154]:
print(pages[0])

page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and many othe

## Semantic Chunking

In [11]:
# Embedding Model Details
# asasa
model_name = "BAAI/bge-small-en"
model_kwargs = {"device":"cpu"}
encode_kwargs = {"normalize_embeddings":True}

hf = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs,encode_kwargs=encode_kwargs)

In [156]:
embedding_test = hf.embed_query("Hi this is krish")
len(embedding_test)

384

In [157]:
# Create Semantic Chunker
text_splitter = SemanticChunker(hf)
 

In [158]:
page_content = list(map(lambda x: x.page_content,pages))
metadata = list(map(lambda x: x.metadata,pages))

In [159]:

docs = text_splitter.create_documents(page_content)
print(docs[0].page_content)

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
kinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw
Abstract
YOLOv7 surpasses all known object detectors in both
speed and accuracy in the range from 5 FPS to 160 FPS
and has the highest accuracy 56.8% AP among all known
real-time object detectors with 30 FPS or higher on GPU
V100. YOLOv7-E6 object detector (56 FPS V100, 55.9%
AP) outperforms both transformer-based detector SWIN-
L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by
509% in speed and 2% in accuracy, and convolutional-
based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6
FPS A100, 55.2% AP) by 551% in speed and 0.7% AP
in accuracy, as well as YOLOv7 outperforms: YOLOR,
YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable
DETR, DINO-5scale-R50, ViT-Adapter-B and many other
object detectors in speed and 

In [9]:
# Optional, run for benchmarking
docs = list(langchain_docs.get_docs())

File c:\Users\krish\OneDrive\Desktop\Coding Stuff\BTP\env\Lib\site-packages\langchain_benchmarks\rag\tasks\langchain_docs\indexing\db_docs/docs.parquet does not exist. Downloading from GCS...
File https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/docs.parquet downloaded.


## Build Vector Store

In [12]:
embedding_function = hf.embed_query

# Define fields for the Azure AI Search Index
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

In [13]:
# Instatiate the vector store client
vector_store:AzureSearch = AzureSearch(azure_search_endpoint=vector_store_address,azure_search_key=vector_store_password,index_name=index_name,embedding_function=hf.embed_query,fields=fields)

In [14]:
vector_store.add_documents(documents=docs)

HttpResponseError: () The request is invalid. Details: The property 'source' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code: 
Message: The request is invalid. Details: The property 'source' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.

In [17]:
from langchain.vectorstores.chroma import Chroma
vectorstore = Chroma(
    collection_name="lcbm-b-huggingface-gte-base",
    embedding_function=hf,
    persist_directory="./chromadb",
)
vectorstore.add_documents(docs)

['9d5c967b-fcf7-11ee-94ad-c8e265ce3dc5',
 '9d5c967c-fcf7-11ee-b13f-c8e265ce3dc5',
 '9d5c967d-fcf7-11ee-ae65-c8e265ce3dc5',
 '9d5c967e-fcf7-11ee-ae63-c8e265ce3dc5',
 '9d5c967f-fcf7-11ee-a27a-c8e265ce3dc5',
 '9d5c9680-fcf7-11ee-a6ba-c8e265ce3dc5',
 '9d5c9681-fcf7-11ee-b056-c8e265ce3dc5',
 '9d5c9682-fcf7-11ee-9017-c8e265ce3dc5',
 '9d5c9683-fcf7-11ee-ab6c-c8e265ce3dc5',
 '9d5c9684-fcf7-11ee-8173-c8e265ce3dc5',
 '9d5c9685-fcf7-11ee-8211-c8e265ce3dc5',
 '9d5c9686-fcf7-11ee-81ad-c8e265ce3dc5',
 '9d5c9687-fcf7-11ee-a1dd-c8e265ce3dc5',
 '9d5c9688-fcf7-11ee-b600-c8e265ce3dc5',
 '9d5c9689-fcf7-11ee-bacf-c8e265ce3dc5',
 '9d5c968a-fcf7-11ee-9064-c8e265ce3dc5',
 '9d5c968b-fcf7-11ee-85d9-c8e265ce3dc5',
 '9d5c968c-fcf7-11ee-840b-c8e265ce3dc5',
 '9d5c968d-fcf7-11ee-8278-c8e265ce3dc5',
 '9d5c968e-fcf7-11ee-a4d6-c8e265ce3dc5',
 '9d5c968f-fcf7-11ee-ba9c-c8e265ce3dc5',
 '9d5c9690-fcf7-11ee-9393-c8e265ce3dc5',
 '9d5c9691-fcf7-11ee-ba64-c8e265ce3dc5',
 '9d5c9692-fcf7-11ee-9a95-c8e265ce3dc5',
 '9d5c9693-fcf7-

In [18]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" +
                d.page_content for i, d in enumerate(docs)]
        )
    )

In [174]:
# A test Query on the vector store
res = vector_store.similarity_search_with_relevance_scores(
 query="What is trainable bag of features?",
 k = 3,
 score_threshold=0.7
)
print(res)

[(Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan'), 0.88353026), (Document(page_content='We call the proposed modules and optimization methods\ntrainable bag-of-freebies.\n1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022'), 0.8826691), (Document(page_content='problem, we propose the trainable bag-of-freebies method\nto enhance the accuracy of object detection. Based on the\nabove, we have developed the YOLOv7 series of object de-\ntection systems, which receives the state-of-the-art results.\n7. Acknowledgements'), 0.87484545)]


## Retriever Model

#### MultiQuery Retriever

In [85]:
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
 retriever=vectorstore.as_retriever(),llm=llm
)

In [166]:
unique_docs = retriever_from_llm.get_relevant_documents( query="What is trainable bag of features?")

In [176]:
pretty_print_docs(unique_docs)

Document 1:

We call the proposed modules and optimization methods
trainable bag-of-freebies.
1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022
----------------------------------------------------------------------------------------------------
Document 2:

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
----------------------------------------------------------------------------------------------------
Document 3:

can maintain the properties that the model had at the initial
design and maintains the optimal structure.
4. Trainable bag-of-freebies
4.1. Planned re-parameterized convolution
Although RepConv [13] has achieved excellent perfor-
----------------------------------------------------------------------------------------------------
Document 4:

problem, we propose the trainable bag-of-freebies method
to enhance the accuracy 

#### Contextual Compression Retriever

In [86]:
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,base_retriever=vectorstore.as_retriever())


In [169]:
compressed_docs = compression_retriever.get_relevant_documents( query="What is trainable bag of features?")



In [175]:
pretty_print_docs(compressed_docs)

Document 1:

Trainable bag-of-freebies
----------------------------------------------------------------------------------------------------
Document 2:

trainable bag-of-freebies.
----------------------------------------------------------------------------------------------------
Document 3:

trainable bag-of-freebies method
----------------------------------------------------------------------------------------------------
Document 4:

trainable bag-of-freebies method


#### Ensemble Retriever 

In [87]:
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_from_llm,compression_retriever],weights=[0.5,0.5])

In [65]:
docs = ensemble_retriever.invoke("What is trainable bag of features?")



In [179]:
pretty_print_docs(docs)

Document 1:

Trainable bag-of-freebies
----------------------------------------------------------------------------------------------------
Document 2:

We call the proposed modules and optimization methods
trainable bag-of-freebies.
1arXiv:2207.02696v1  [cs.CV]  6 Jul 2022
----------------------------------------------------------------------------------------------------
Document 3:

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
----------------------------------------------------------------------------------------------------
Document 4:

trainable bag-of-freebies.
----------------------------------------------------------------------------------------------------
Document 5:

self-supervised learning or knowledge distillation methods
that require additional data or large model. Instead, we will
design new trainable ba

## Model Generation

#### Setting up the Tree-of-Thoughts Approach

In [100]:

template= """
The question is {question}. Answer it using only the context {context}
A:
"""
prompt = PromptTemplate(
 input_variables=["question","context"],
 template=template
)

In [89]:
template2 = """
Combine three different answers to provide the best solution out of all of them {answers}
A:
"""
prompt2 = PromptTemplate(
    input_variables=["answers"],
    template=template2
)

In [90]:
openai_api_key  = os.environ["OPENAI_API_KEY"]
openai_api_key

'sk-proj-CZsc7RwwDNtFtyQkTKarT3BlbkFJNd0nXlQ5n1MgR6hLGf2T'

In [104]:
model= ChatOpenAI(model="gpt-3.5-turbo")
model_parser = model | StrOutputParser()

In [95]:
def retriever_content(docs):
 return list(map(lambda x: x.page_content,docs))[:3]

In [101]:
from operator import itemgetter
chain = (

 {"context":(itemgetter("question")| ensemble_retriever) |RunnableLambda(retriever_content),"question":RunnablePassthrough()} |prompt| {"answers":model_parser} | prompt2 | {"answer":model_parser}

)

In [105]:
chain.invoke({"question": "how to run a runnable"})



{'answers': 'To run a `Runnable`, you can use the `invoke()` method on the instance of the `Runnable`. For example, if you have a `runnable` object, you can call `runnable.invoke(input_data)` to execute the `Runnable` with the provided input data. Additionally, you can also use `RunnableParallel` to execute multiple `Runnables` in parallel and return the output as a map. This allows you to parallelize steps and improve the efficiency of your workflow.'}

In [106]:
from langchain_benchmarks.rag import get_eval_config
RAG_EVALUATION = get_eval_config()

test_run = client.run_on_dataset(
    dataset_name=langchain_docs.name,
    llm_or_chain_factory=chain,
    evaluation=RAG_EVALUATION,
    project_name=f"btp results3",
    project_metadata={
        "index_method": "custom",
    },
    verbose=True,
)

HTTPError: [Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/sessions] {"detail":"Session already exists."}