# **RAG Fusion - The New Star of Search Technology**

In [None]:
# Installing the required dependencies
!pip install langchain openai lancedb pypdf tiktoken

In [48]:
import os
from getpass import getpass

# Never hardcode the key in the notebook. Reuse a key that is already exported
# in the environment; otherwise prompt for it without echoing it to the output.
# (The previous placeholder assignment overwrote any valid key that was set.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [49]:
# Importing Required Dependencies
import openai
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import LanceDB
import tiktoken

In [50]:
# Download the source PDF and save it as doc.pdf in the current working directory.
!wget https://ncert.nic.in/ncerts/l/leph202.pdf -O doc.pdf

--2024-01-14 17:19:12--  https://ncert.nic.in/ncerts/l/leph202.pdf
Resolving ncert.nic.in (ncert.nic.in)... 164.100.166.133
Connecting to ncert.nic.in (ncert.nic.in)|164.100.166.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3530023 (3.4M) [application/pdf]
Saving to: ‘doc.pdf’


2024-01-14 17:19:15 (1.93 MB/s) - ‘doc.pdf’ saved [3530023/3530023]



Splitting our documents into chunks.

In [51]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Load the PDF. The path is relative because the download cell saves the file
# as doc.pdf in the current working directory; an absolute "/content/doc.pdf"
# only resolves when the notebook happens to run from /content.
pdf_folder_path = "doc.pdf"

loader = PyPDFLoader(pdf_folder_path)
docs = loader.load_and_split()

# Split the loaded documents into small, overlapping chunks for embedding.
# NOTE(review): load_and_split() presumably already applies a default splitter,
# so the text is split twice here - confirm whether loader.load() is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(docs)

In [52]:
# Embedding model: used below to embed the seed row and handed to the LanceDB
# vector store when the document chunks are indexed.
embeddings = OpenAIEmbeddings()

### Using the **LanceDB** vector store to store and retrieve embeddings.

In [53]:
from langchain.vectorstores import LanceDB
import lancedb

# Use LanceDB as the vector store: open (or create) a local database.
db = lancedb.connect("/tmp/lancedb")

# Create the "documents" table from a single seed row, replacing any table of
# the same name. Presumably the seed row exists only to give the table its
# vector/text/id columns - note that it stays in the table alongside the
# real chunks.
seed_text = "Hello World"
seed_row = {
    "vector": embeddings.embed_query(seed_text),
    "text": seed_text,
    "id": "1",
}
table = db.create_table("documents", data=[seed_row], mode="overwrite")

# Embed every chunk and add it to the table through the LangChain wrapper.
vector_store = LanceDB.from_documents(documents, embeddings, connection=table)

In [54]:
# Inspect the rows stored in the vector database table as a pandas DataFrame.
table.to_pandas()

Unnamed: 0,vector,text,id
0,"[-0.0070581236, 0.0034781466, -0.00699448, -0....",Hello World,1
1,"[0.020712305, 0.019299496, 0.004311731, -0.001...",351Wave Optics\nChapter Ten\nWAVE OPTICS\n10.1...,6b886395-afcc-44ce-94f6-e540c8c22c37
2,"[0.008747051, 0.016323444, 0.003442918, -0.003...","the tremendous popularity of this book, the co...",977cd192-8a7c-49c0-94df-07387e81f768
3,"[0.007830143, 0.010499509, -0.007494, -0.00330...",would be less in the second medium. This is in...,6bc8a7fb-f25d-449e-aa78-8b54bae5be69
4,"[0.016653322, -0.006793313, -0.0043315454, -0....",© NCERT \nnot to be republished,32ee65ad-acb0-4829-b248-d7920a151bed
...,...,...,...
190,"[0.025289346, 0.0011775766, -0.013505305, 0.01...",of the obstacle. Explain wh y?\n(d)Two student...,ebdcd954-2c9c-435e-aa5e-8a00d628c401
191,"[0.012682365, 0.0014918466, -0.016420325, -0.0...","385Wave Optics\nobstacles, how is it that the ...",a53fc037-8657-466c-a5eb-7d9b9e39e003
192,"[0.012329977, -0.0045882817, -0.00632002, 0.00...",justification?\n10.18 Two towers on top of two...,12fea4e4-7e6d-429e-b4ed-af22c60e155d
193,"[0.020282593, 0.006688379, 3.619097e-05, -0.00...",mm from the centre of the screen. Find the wid...,6d3fd553-980b-43db-8f9c-7102cd51e822


Generating several different queries that are relevant to the original query given by the user.

In [36]:
def generate_queries_chatgpt(original_query, num_queries=4):
    """Ask the chat model for several search queries related to ``original_query``.

    Args:
        original_query: The user's query to expand.
        num_queries: Number of queries requested in the prompt (default 4).
            The model is only asked for this many; the count is not enforced
            on the reply.

    Returns:
        A list of query strings, one per non-empty line of the model's reply,
        each with surrounding whitespace removed. Any numbering the model adds
        (e.g. ``"1. "``) is left in place.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that generates multiple search queries based on a single input query.",
            },
            {
                "role": "user",
                "content": f"Generate multiple search queries related to: {original_query}",
            },
            {"role": "user", "content": f"OUTPUT ({num_queries} queries):"},
        ],
    )
    # The reply content may be None; treat that as an empty reply.
    reply = response.choices[0].message.content or ""
    # Drop blank lines: the model may separate items with empty lines, and an
    # empty string is not a usable search query downstream.
    generated_queries = [line.strip() for line in reply.splitlines() if line.strip()]
    return generated_queries

Searching the vector store for the documents most relevant to a query.

In [37]:
def vector_search(query):
    """Run a similarity search for ``query`` against the global ``vector_store``.

    Returns:
        A dict mapping each retrieved document's ``page_content`` to the
        ``_distance`` value found in its metadata, ordered by ascending
        distance. Documents with identical text share a single entry.
    """
    hits = vector_store.similarity_search(query)
    distance_by_text = {hit.page_content: hit.metadata["_distance"] for hit in hits}
    return dict(sorted(distance_by_text.items(), key=lambda pair: pair[1]))

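## Major Component of RAG Fusion - **Reciprocal Rank Fusion Algorithm**
> This algorithm merges the ranked result lists of all the generated queries into a single ranking: a document earns a reciprocal-rank score from every list it appears in, so documents that rank highly for several queries come out on top.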

In [38]:
def reciprocal_rank_fusion(search_results_dict, k=60, verbose=True):
    """Fuse per-query search results into one ranking (Reciprocal Rank Fusion).

    A document's fused score is the sum, over every query whose results
    contain it, of ``1 / (k + rank)``, where ``rank`` is the document's
    1-based position in that query's results ordered by ascending score.

    Args:
        search_results_dict: Mapping of query -> ``{document: score}``. Within
            each query the lowest score ranks first (this notebook passes
            distances).
        k: Smoothing constant of the RRF formula; 60 is the value taken from
            the paper.
        verbose: When True (the default), print the input results, every
            score update, and the final ranking.

    Returns:
        A dict of document -> fused score, ordered by descending fused score.
    """
    fused_scores = {}
    if verbose:
        print("Initial individual search result ranks:")
        for query, doc_scores in search_results_dict.items():
            print(f"For query '{query}': {doc_scores}")

    for query, doc_scores in search_results_dict.items():
        ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1])
        # Ranks are 1-based, as in the RRF formula 1 / (k + rank). Counting
        # from 0 would effectively use k - 1 and divide by zero when k == 0.
        for rank, (doc, _score) in enumerate(ranked_docs, start=1):
            previous_score = fused_scores.get(doc, 0)
            fused_scores[doc] = previous_score + 1 / (rank + k)
            if verbose:
                print(
                    f"Updating score for {doc} from {previous_score} to {fused_scores[doc]} based on rank {rank} in query '{query}'"
                )

    reranked_results = {
        doc: score
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    }
    if verbose:
        print("Final reranked results:", reranked_results)
    return reranked_results

Generating output based on the reranked documents.

In [39]:
def generate_output(original_query, reranked_results):
    """Answer ``original_query`` with the chat model, using the reranked documents as context.

    The keys of ``reranked_results`` (the document texts) are joined with
    newlines, in iteration order, and sent to the model together with the
    query.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    context = "\n".join(reranked_results)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers user's questions based on the context provided.\nDo not make up an answer if you do not know it, stay within the bounds of the context provided, if you don't know the answer, say that you don't have enough information on the topic!",
    }
    context_message = {
        "role": "user",
        "content": f"CONTEXT: {context}\nQUERY: {original_query}",
    }
    answer_cue = {"role": "user", "content": "ANSWER:"}

    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, context_message, answer_cue],
    )
    return completion.choices[0].message.content.strip()

Now on to the final generation step: answering the query given by the user.

In [55]:
# The user's question, and the related search queries the chat model derives from it.
original_query = "Huygens Principle"
generated_queries = generate_queries_chatgpt(original_query)

In [56]:
# Display the queries returned by the model.
generated_queries

['1. What is Huygens principle and how does it apply to wave propagation?',
 '2. Key contributions of Huygens principle in the field of optics.',
 '3. Examples of real-world phenomena that can be explained using Huygens principle.',
 '4. How does Huygens principle relate to the diffraction and interference of light?']

In [57]:
# Vector search and document retrieval for all the generated queries:
# each query maps to its own search results.
all_results = {query: vector_search(query) for query in generated_queries}

In [58]:
# Documents reranked according to RRF.
reranked_results = reciprocal_rank_fusion(all_results)

Initial individual search result ranks:
For query '1. What is Huygens principle and how does it apply to wave propagation?': {'Physics\n354small portion of the sphere can be considered as a plane and we have\nwhat is known as a plane wave  [Fig. 10.1(b)].\nNow, if we know the shape of the wavefront at t = 0, then Huygens\nprinciple allows us to determine the shape of the wavefront at a later\ntime τ. Thus, Huygens principle is essentially a geometrical construction,\nwhich given the shape of the wafefront at any time allows us to determine\nthe shape of the wavefront at a later time. Let us consider a diverging': 0.24516336619853973, 'consistent with experiment?\n10.13 You have lear nt in the text how Huygens’ principle leads to the laws\nof reflection and refraction. Use the same principle to deduce directly\nthat a point object placed in front of a plane mirror produces a\nvirtual image whose distance from the mirror is equal to the object\ndistance fr om the mirr or.\n10.14 Let us l

In [59]:
# Generate the final answer from the original query and the reranked documents.
final_output = generate_output(original_query, reranked_results)

In [60]:
# Display the generated answer.
final_output

"Huygens' principle is a principle of wave propagation that states that each point on a wavefront can be thought of as a source of secondary wavelets that spread out in all directions. These secondary wavelets combine together to form the shape of the wavefront at a later time. It can be used to explain phenomena such as reflection, refraction, interference, diffraction, and polarization."