In [None]:
! pip install pypdf langchain_community chromadb rank_bm25 bitsandbytes accelerate

^C


In [None]:
import os
import re
from getpass import getpass

import numpy as np
import torch
import transformers
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline)


Sample documents for the keyword-search example

In [None]:
# Toy corpus for the keyword-search demo; each string is treated as one document.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
query = "keyword-based search"

In [None]:
def preprocess_text(text):
    """Normalize ``text`` for keyword matching.

    The text is lower-cased and every character that is neither a word
    character (letter, digit, underscore) nor whitespace is dropped. Hyphens
    are removed as well, so "keyword-based" becomes "keywordbased".

    text: raw input string
    return: the normalized string
    """
    lowered = text.lower()
    # Anything outside \w and \s is treated as punctuation and removed
    return re.sub(r'[^\w\s]', "", lowered)

In [None]:
preprocess_data = [preprocess_text(doc) for doc in documents ]

In [None]:
preprocess_data

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [None]:
preprocess_query = preprocess_text(query)

In [None]:
vector = TfidfVectorizer()

In [None]:
data_vector = vector.fit_transform(preprocess_data)

In [None]:
data_vector.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [None]:
query_embedding = vector.transform([preprocess_query])

In [None]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [None]:
similarities = cosine_similarity(data_vector, query_embedding)

In [None]:
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [None]:
np.argsort(similarities, axis=0)[::-1]

array([[1],
       [3],
       [2],
       [0]], dtype=int64)

In [None]:
np.argsort(similarities, axis=0)[::-1].flatten()

array([1, 3, 2, 0], dtype=int64)

In [None]:
# Ranking: argsort is ascending, so flip along the document axis to put the best match first
ranked_indices = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [None]:
ranked_documents = [documents[i] for i in ranked_indices]

In [None]:
# Show the query followed by each document, best match first
for rank, doc in enumerate(ranked_documents, start=1):
    print(query)
    print(f"Rank {rank}: {doc}")

keyword-based search
Rank 1: Keywords are important for keyword-based search.
keyword-based search
Rank 2: Keyword-based search relies on sparse embeddings.
keyword-based search
Rank 3: Document analysis involves extracting keywords.
keyword-based search
Rank 4: This is a list which containig sample documents.


In [None]:
# Toy dense embeddings for the dense-search demo: one 5-dimensional row per document (3 documents)
document_embeddings_1 = np.array([
[0.634, 0.234, 0.867, 0.042, 0.249],
[0.123, 0.456, 0.789, 0.321, 0.654],
[0.987, 0.654, 0.321, 0.123, 0.456]
])

In [None]:
# Sample search query (represented as a dense vector); kept 2-D (1 x 5) so it can be
# passed straight to cosine_similarity alongside the document matrix
query_embedding_1 = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [None]:
similarities_1 = cosine_similarity(document_embeddings_1, query_embedding_1)

In [None]:
ranked_indices_1 = np.argsort(similarities_1, axis=0)[::-1].flatten()

In [None]:
ranked_indices_1

array([0, 2, 1], dtype=int64)

In [None]:
# Report the dense-search ranking using 1-based document numbers
for rank, idx in enumerate(ranked_indices_1, start=1):
    print(f"Rank: {rank}: Document: {idx+1}")

Rank: 1: Document: 1
Rank: 2: Document: 3
Rank: 3: Document: 2


In [None]:
doc_path = "/content/Retrieval-Augmented-Generation-for-NLP.pdf"

In [None]:
loader = PyPDFLoader(doc_path)

In [None]:
docs = loader.load()

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200 , chunk_overlap=30)

In [None]:
chunks = splitter.split_documents(docs)

In [None]:
# Never hard-code the token in the notebook: read it from the HF_TOKEN environment
# variable and fall back to an interactive (non-echoing) prompt when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [None]:
# Embedding model served through the Hugging Face Inference API.
# The checkpoint id ends in "v1.5" (digit one); "vl.5" does not name an existing model.
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [None]:
vector_store = Chroma.from_documents(chunks, embeddings)

In [None]:
vector_store_retriever = vector_store.as_retriever(search_kwargs={"k":3})

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k = 3

In [None]:
# Hybrid retriever: merges results from the dense (Chroma) retriever and the BM25 keyword
# retriever. Weights are listed in the same order as `retrievers`: 0.3 dense, 0.7 BM25.
ensembel_retriever = EnsembleRetriever(retrievers=[vector_store_retriever, keyword_retriever], weights=[0.3, 0.7])

In [None]:
model = "HuggingFaceH4/zephyr-7b-beta"

In [6]:
# Function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """Load a causal language model with 4-bit bitsandbytes quantization.

    Args:
        model_name: Name or path of the model to be loaded.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``,
        loaded with the quantization config built below.
    """
    # NF4 4-bit weights with double quantization; compute dtype is bfloat16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )

    return model

In [7]:
# Tokenizer setup
def initialize_tokenizer(model_name: str):
    """
    Build the tokenizer that belongs to ``model_name``.

    model_name: name or path of the model whose tokenizer is loaded
    return: the tokenizer, with its ``bos_token_id`` set to 1

    """
    tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    # Pin the beginning-of-sentence token id explicitly
    tok.bos_token_id = 1
    return tok

In [None]:
tokenizer = initialize_tokenizer(model)

In [None]:
model_1 = load_quantized_model(model)

In [None]:
# Build the text-generation pipeline around the already-loaded 4-bit model (`model_1`).
# `model` only holds the checkpoint *name*; passing it here would make transformers load
# the checkpoint again without the quantization config, leaving `model_1` unused.
# The factory is called as `transformers.pipeline` so the cell stays re-runnable even
# though its result is bound to the name `pipeline` (which later cells read).
pipeline = transformers.pipeline(
    "text-generation",
    model=model_1,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [None]:
llm = HuggingFacePipeline(pipeline=pipeline)

In [None]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vector_store_retriever
)

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensembel_retriever
)

In [None]:
response_1 = normal_chain.invoke("What is RAG token model?")

In [None]:
print(response_1.get("result"))

In [None]:
response_2 = hybrid_chain.invoke("What is RAG token model?")

In [None]:
print(response_2.get("result"))