<a href="https://colab.research.google.com/github/hargurjeet/Adhoc-Activities/blob/main/RAG_with_oops_concepts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q langchain langchain-community pypdf faiss-cpu sentence-transformers transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# start_rag_here.py

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os
from typing import List

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.document_loaders import PyPDFLoader



class DocumentIngestor:
    def __init__(self, chunk_size=500, chunk_overlap=50):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    def load_and_chunk(self, file_path: str) -> List[Document]:
        loader = PyPDFLoader(file_path)
        raw_docs = loader.load()
        return self.splitter.split_documents(raw_docs)


class Embedder:
    def __init__(self):
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def embed_documents(self, documents: List[Document]) -> FAISS:
        return FAISS.from_documents(documents, self.embedding_model)

llm_pipeline = pipeline("text-generation", model="google/flan-t5-small")
llm = HuggingFacePipeline(pipeline=llm_pipeline)

class Retriever:
    def __init__(self, vector_store: FAISS):
        self.retriever = vector_store.as_retriever()

    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
        return self.retriever.get_relevant_documents(query)[:top_k]


class Scorer:
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def score_documents(self, query: str, docs: List[Document]) -> List[Document]:
        query_emb = self.embedding_model.embed_query(query)
        scored = []
        for doc in docs:
            doc_emb = self.embedding_model.embed_query(doc.page_content)
            score = self.cosine_similarity(query_emb, doc_emb)
            doc.metadata['score'] = score
            scored.append(doc)
        return sorted(scored, key=lambda d: d.metadata['score'], reverse=True)

    def cosine_similarity(self, vec1, vec2):
        from numpy import dot
        from numpy.linalg import norm
        return dot(vec1, vec2) / (norm(vec1) * norm(vec2))


class RAGPipeline:
    def __init__(self, file_path: str):
        self.ingestor = DocumentIngestor()
        self.embedder = Embedder()

        print("Loading and chunking documents...")
        chunks = self.ingestor.load_and_chunk(file_path)

        print("Embedding documents...")
        self.vector_store = self.embedder.embed_documents(chunks)
        self.retriever = Retriever(self.vector_store)
        self.scorer = Scorer(self.embedder.embedding_model)

    def run(self, query: str, top_k: int = 3):
        print("Retrieving relevant documents...")
        retrieved_docs = self.retriever.retrieve(query, top_k=top_k * 2)

        print("Scoring documents...")
        scored_docs = self.scorer.score_documents(query, retrieved_docs)

        print("Top N Documents:")
        for i, doc in enumerate(scored_docs[:top_k]):
            print(f"\nRank {i+1} (Score: {doc.metadata['score']:.4f}):")
            print(doc.page_content[:300])  # limit output for brevity


# === Example Run ===
if __name__ == "__main__":
    file_path = "/content/Hargurjeet _GenAI_Resume.pdf"  # replace with your document
    query = "tell me about hargurjeet"
    pipeline = RAGPipeline(file_path)
    pipeline.run(query)


Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNe

Loading and chunking documents...
Embedding documents...
Retrieving relevant documents...
Scoring documents...


  return self.retriever.get_relevant_documents(query)[:top_k]


Top N Documents:

Rank 1 (Score: 0.3369):
Hargurjeet  Singh  Ganger  gurjeet333@gmail.com  |  +91  9035828125  |  Bangalore,  India  linkedin.com/in/hargurjeet/  |  github.com/hargurjeet |  gurjeet333.medium.com  
Summary  
 
Experienced  IT  professional  with  14+  years  in  the  industry,  specializing  in  data  science,  statistical  

Rank 2 (Score: 0.1609):
Warehousing/ETL
 
and
 
relational
 
Databases
.
 
 ●  Applied  statistics  techniques  such  as  classiﬁcation,  clustering,  statistical  inference ,  and  understanding  of  
central
 
tendency.
 
Worked
 
with
 
DL
 
libraries
 
scikit-learn,
 
TensorFlow,
 
Keras,
 
PyTorch...etc.
 ●  Experienc

Rank 3 (Score: 0.1379):
EDUCATION   LIVERPOOL  JOHN  MOORES  UNIVERSITY                                                                                       2023  -  2025  M.S.  in  Machine  Learning  &  Artiﬁcial  Intelligence    International  Institute  of  Information  Technology  Bangalore                            
