<a href="https://colab.research.google.com/github/sunnysavita10/Indepth-GENAI/blob/main/Hybrid_Search_in_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Sample documents
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
query="keyword-based search"

In [None]:
import re
def preprocess_text(text):
    """Normalize *text* before it is vectorized.

    The input is lowercased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted.

    Note that punctuation is removed rather than replaced by a space, so a
    hyphenated term such as "keyword-based" becomes the single token
    "keywordbased".

    Returns the normalized string.
    """
    lowered = text.lower()
    # Strip punctuation in one pass; whitespace is left untouched.
    return re.sub(r'[^\w\s]', '', lowered)


In [None]:
preprocess_documents=[preprocess_text(doc) for doc in documents]

In [None]:
preprocess_documents

In [None]:
print("Preprocessed Documents:")
for doc in preprocess_documents:
    print(doc)

In [None]:
# Preprocess the query first so that what is printed under the
# "Preprocessed Query:" label really is the preprocessed text, not the raw query.
preprocessed_query = preprocess_text(query)
print("Preprocessed Query:")
print(preprocessed_query)

In [None]:
preprocessed_query

In [None]:
vector=TfidfVectorizer()

In [None]:
X=vector.fit_transform(preprocess_documents)

In [None]:
X.toarray()

In [None]:
X.toarray()[0]

In [None]:
query_embedding=vector.transform([preprocessed_query])

In [None]:
query_embedding.toarray()

In [None]:
similarities = cosine_similarity(X, query_embedding)

In [None]:
similarities

In [None]:
np.argsort(similarities,axis=0)

In [None]:
# Ranking: order document indices from most to least similar to the query.
# np.argsort sorts ascending, so the result is reversed; flatten() collapses
# the single-column result into a 1-D array of indices.
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()
# Built only after ranked_indices exists, so the notebook runs top to bottom
# without a NameError.
ranked_documents = [documents[i] for i in ranked_indices]

In [None]:
ranked_indices


In [None]:
# Output the ranked documents
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

In [None]:
query

In [None]:
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
#https://huggingface.co/sentence-transformers

In [None]:
# Hand-written 5-dimensional vectors standing in for dense document embeddings.
# NOTE(review): there are three rows here but four entries in `documents`, and
# these values are not produced by an embedding model in this notebook —
# presumably illustrative placeholders; confirm before mapping rows back to
# `documents`.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [None]:
# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [None]:
# Calculate cosine similarity between query and documents
similarities = cosine_similarity(document_embeddings, query_embedding)

In [None]:
similarities

In [None]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices

In [None]:
# Output the ranked documents
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

In [None]:
doc_path="/content/Retrieval-Augmented-Generation-for-NLP.pdf"

In [None]:
!pip install pypdf

In [None]:
!pip install langchain_community

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader=PyPDFLoader(doc_path)

In [None]:
docs=loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [None]:
chunks = splitter.split_documents(docs)

In [None]:
chunks

In [None]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [None]:
import os

# Read the Hugging Face API token from the HF_TOKEN environment variable so the
# secret is never saved inside the notebook. Falls back to "" (the previous
# placeholder value) when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

In [None]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [None]:
!pip install chromadb

In [None]:
from langchain.vectorstores import Chroma

In [None]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [None]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
vectorstore_retreiver

In [None]:
!pip install rank_bm25

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k =  3

In [None]:
# Hybrid retriever: combines the dense (Chroma) retriever with the sparse BM25 retriever.
# The weights are positional: 0.3 applies to vectorstore_retreiver and 0.7 to keyword_retriever.
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

# Mixing vector search and keyword search for Hybrid search

## hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [None]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install bitsandbytes

In [None]:
!pip install accelerate

In [None]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [None]:
# Load a causal language model with 4-bit quantization
def load_quantized_model(model_name: str):
    """Load *model_name* as a 4-bit quantized causal language model.

    The model is fetched with ``AutoModelForCausalLM.from_pretrained`` using a
    ``BitsAndBytesConfig`` that requests 4-bit NF4 weights, double
    quantization, and bfloat16 as the compute dtype.

    model_name: Name or path of the model to be loaded.
    return: The loaded quantized model.
    """
    compute_dtype = torch.bfloat16

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        torch_dtype=compute_dtype,
    )

In [None]:
# Build the tokenizer that pairs with the model
def initialize_tokenizer(model_name: str):
    """Create the tokenizer for *model_name*.

    The tokenizer is loaded with ``return_token_type_ids=False`` and its
    beginning-of-sentence token id is then set to 1.

    model_name: Name or path of the model for tokenizer initialization.
    return: The initialized tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    # NOTE(review): 1 is hard-coded; presumably the BOS id of this model's
    # vocabulary — confirm if a different model is used.
    tok.bos_token_id = 1
    return tok

In [None]:
tokenizer = initialize_tokenizer(model_name)

In [None]:
model = load_quantized_model(model_name)

In [None]:
# Build the text-generation pipeline under its own name so that the imported
# transformers `pipeline` factory is not shadowed and this cell can be re-run.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [None]:
response1 = normal_chain.invoke("What is Abstractive Question Answering?")

In [None]:
response1

In [None]:
print(response1.get("result"))

In [None]:
response2 = hybrid_chain.invoke("What is Abstractive Question Answering?")

In [None]:
response2

In [None]:
print(response2.get("result"))