In [1]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import os 
os.environ['GROQ_API_KEY']=os.getenv("GROQ_API_KEY")
loader = TextLoader("../data/info.txt")
documents = loader.load()



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)



In [3]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectorstore = FAISS.from_documents(docs, embeddings)





  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")


In [4]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.invoke(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)


In [5]:
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",streaming=True
)

from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}
Helpful Answer:""",
    input_variables=["context", "question"]
)
rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)




In [7]:
# --- 8. Run with streaming ---
query = "Give me contact Info"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
To get in touch with Katta Sai Pranav Reddy, you can use the following contact information:

* Phone: +91 93475 41040
* Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
* GitHub: [github.com/ka1817](https://github.com/ka1817)
* LinkedIn: [linkedin.com/in/pranav-reddy-katta](https://www.linkedin.com/in/pranav-reddy-katta/)

Feel free to reach out to him through any of these channels.

In [10]:
query = "Give me the github link of pranav reddy"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
The GitHub link of Pranav Reddy is not explicitly provided, but the GitHub repository for his project, "BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping", can be accessed through the link: 🔍 GitHub Repo: BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping. 

However, a more direct link is not provided in the context. Nevertheless, another GitHub link for his personal project, "Netflix Customer Churn Prediction – End-to-End ML System", is mentioned as *[GitHub]*, but the actual link is not provided. 

If you are looking to access Pranav Reddy's GitHub profile or repositories, you may need to search for his username, which could be "pranavreddy123" based on his DockerHub repository (🐳 DockerHub: pranavreddy123/bigbasket-assistant).

In [4]:
query = "How He Deployed Netflix Churn Prediction Project"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
Katta Sai Pranav Reddy deployed the Netflix Customer Churn Prediction project using a containerized approach with Docker. The project utilizes a production-grade, explainable, and reproducible Machine Learning (ML) pipeline that incorporates various tools and technologies for efficient deployment.

Here's an overview of the deployment process:

1. **CI/CD**: The project implements Continuous Integration/Continuous Deployment (CI/CD) to ensure seamless and automated testing, building, and deployment of the ML model.
2. **Experiment Tracking (MLflow)**: MLflow is used to track experiments, manage models, and monitor performance. This allows for easy comparison of different models, hyperparameters, and experiment results.
3. **Data Versioning (DVC)**: DVC is used for data versioning, which enables the tracking of changes to the data and ensures reproducibility of the results.
4. **Docker**: The project is containerized using Docker, which provides a lightweight and por

In [None]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import faiss
import numpy as np
import os

if not os.getenv("GROQ_API_KEY"):
    raise ValueError("Please set GROQ_API_KEY in your environment")


loader = TextLoader("../data/info.txt")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)
texts = [d.page_content for d in docs]


embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectors = embeddings.embed_documents(texts)  
dim = len(vectors[0])


nlist = 5   
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)

index.train(np.array(vectors).astype("float32"))

# Add vectors
index.add(np.array(vectors).astype("float32"))

index.nprobe = 10


from langchain.vectorstores.faiss import FAISS

docstore = {}
index_to_docstore_id = {}
for i, doc in enumerate(docs):
    doc_id = str(i)
    docstore[doc_id] = doc
    index_to_docstore_id[i] = doc_id

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)


reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs).tolist()
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs


class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    class Config:
        arbitrary_types_allowed = True  

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]


base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)


llm = ChatGroq(model_name="llama-3.3-70b-versatile", streaming=True)


prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}

Helpful Answer:""",
    input_variables=["context", "question"]
)


rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)


if __name__ == "__main__":
    query = "What are Pranav's main technical skills?"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


  initial_docs = self.base_retriever.get_relevant_documents(query)


AttributeError: 'dict' object has no attribute 'search'

In [28]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever
from langchain.docstore import InMemoryDocstore
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import numpy as np
import faiss
import os 
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")
loader = TextLoader("../data/info.txt")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectors = embeddings.embed_documents([d.page_content for d in docs])
dim = len(vectors[0])
num_docs = len(vectors)
if num_docs > 20:  
    nlist = min(10, num_docs // 2)  
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)

    index.train(np.array(vectors).astype("float32"))
    index.add(np.array(vectors).astype("float32"))

    index.nprobe = max(1, nlist // 2)
else:
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

docstore = InMemoryDocstore()
index_to_docstore_id = {}

for i, doc in enumerate(docs):
    doc_id = str(i)
    docstore.add({doc_id: doc})
    index_to_docstore_id[i] = doc_id

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.invoke(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)

llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    streaming=True
)

prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand 
the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}
Helpful Answer:""",
    input_variables=["context", "question"]
)

rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)

if __name__ == "__main__":
    query = "What are Pranav's main technical skills?"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


Answer:
 Pranav's main technical skills can be categorized into several areas:

1. **Tools:** He is proficient in using tools such as:
	* MLflow
	* DVC
	* Docker
	* Git
	* GitHub Actions
	* AWS (EC2, S3, ECR)
	* FAISS
	* Pinecone
	* Hugging Face
	* LangChain
	* LangSmith
	* FastAPI
2. **Programming & Technical Skills:** He has expertise in:
	* Python
	* SQL
	* HTML
	* CSS
	* Scikit-learn
	* TensorFlow
	* Keras
	* Statistics
3. **Data Science & Machine Learning:** He is skilled in:
	* Data Preprocessing
	* Exploratory Data Analysis (EDA)
	* Feature Engineering
	* Model Training & Evaluation
	* Hyperparameter Tuning
	* Clustering
	* MLOps
	* Semantic Search
	* Retrieval-Augmented Generation (RAG)
	* CNN
	* RNN
	* GPT
	* Transformers
	* Fine-Tuning
	* Prompt Engineering
4. **Data Visualization & Analysis:** He is proficient in:
	* Pandas
	* NumPy
	* Matplotlib
	* Seaborn

These technical skills demonstrate Pranav's strong foundation in data science, machine learning, and programming, maki

In [11]:
if __name__ == "__main__":
    query = "How Did Pranav Reddy Deployed His BigBasket SmartCart Application"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


Answer:
 Pranav Reddy deployed his BigBasket SmartCart application using a combination of containerization, automation, and cloud computing. Here's a step-by-step overview of his deployment process:

1. **Containerization using Docker**: Pranav containerized his application using Docker, which allowed him to package the application and its dependencies into a single container. This ensured that the application was isolated and portable, making it easy to deploy on different environments.
2. **Automated CI/CD using GitHub Actions**: Pranav used GitHub Actions to automate his Continuous Integration and Continuous Deployment (CI/CD) pipeline. He created a workflow file (.github/workflows/ci-cd.yml) that defined the build, test, and deployment process. The workflow was triggered on push events to the main branch or pull requests.
3. **Building and Pushing Docker Image**: As part of the CI/CD pipeline, GitHub Actions built the Docker image and pushed it to DockerHub (pranavreddy123/bigbaske

In [17]:
import pinecone
print(pinecone.__version__)


7.3.0


In [15]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="pcsk_5fcYNp_SaAxqoMdnTB1CTT7Cm4hadkAYywfDLj94e3WDpWsHuU92XZSZJ6J2AakvyBHnrh")
index_name = "developer-quickstart-py"

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )


In [7]:
import os
from dotenv import load_dotenv
from pinecone import ServerlessSpec

from langchain_groq import ChatGroq
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as PineconeVectorStore

from langchain.chains import RetrievalQA
from pinecone import Pinecone

load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
index_name = "latest-portfolio1"  

llm = ChatGroq(model="llama-3.3-70b-versatile")

text_path = "../data/info.txt"
loader = TextLoader(text_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
print(f"Total Chunks: {len(docs)}")

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

if not PINECONE_API_KEY:
    raise ValueError("❌ Missing PINECONE_API_KEY in environment variables")

pc = Pinecone(api_key=PINECONE_API_KEY)




from pinecone import ServerlessSpec


if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)




from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

vector_store = PineconeVectorStore(index=index, embedding=embeddings)



Total Chunks: 62


In [8]:
from uuid import uuid4

from langchain_core.documents import Document

uuids = [str(uuid4()) for _ in range(len(docs))]

vector_store.add_documents(documents=docs, ids=uuids)


['bcf92a67-d961-4369-9c24-8c9ba79b927a',
 '60ff68ba-b880-44cd-9152-9476a4b7b8aa',
 'bb9853a9-f75d-48d1-a190-99851379d341',
 '117ef1ed-4552-44cf-b1d0-ba88f4fd021a',
 'b6165dae-763b-4549-b3a4-8a4361cebb4d',
 '9806238b-2e87-4be8-ba69-268bd541968b',
 '29c54d14-d027-4eb4-bec6-d57e40334f1e',
 '89deb20e-25af-41b8-ab04-263c4e51d4f9',
 '10626073-7590-4d78-8cd3-3c17108e52a1',
 '0d87b496-e327-4b6c-994a-00ffdfc1e472',
 '754587c7-d989-4211-8d16-b8855d5fa92e',
 '26d04d19-b806-4a60-870b-37a70aad66c1',
 'e89e3279-dc84-4357-8b73-110058b2c090',
 '88af8404-e669-4279-81bc-2083a39bd489',
 'a0e2895a-f054-4e00-b920-1d3c14333f3e',
 '4bcf0e29-a25a-4a6a-a721-32ced55bc5da',
 '7f5dad96-f46e-48f6-8b48-218bf8549e64',
 '28f0a4d3-6335-4c60-85ef-704f4412b6fe',
 '31cef1b8-f5bc-42f3-b055-8174e97f5f62',
 '37837cae-f9e4-4310-b5a2-5cae6cfcea0a',
 '1b1290b5-875a-4557-87b5-3003bdc20192',
 '314ac472-571d-4b51-b4b0-d1f6a9141852',
 '3c99d35e-c709-44be-8dca-778bb0bbe30e',
 '14450ced-fece-4a97-b6e0-9a9bdb62447d',
 '4b2df0e2-fe42-

In [9]:
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x19a29505630>

In [10]:
docs

[Document(metadata={'source': '../data/info.txt', 'text': "10th class Marks\n\n**Board of Secondary Education\nTelangana State, India**\n\n**SECONDARY SCHOOL CERTIFICATE**\n**REGULAR** PC/29/4222/04/256517/3\n**TS-EE 524495**\n\n---\n\n**CERTIFIED THAT**\n**KATTA SAI PRANAV REDDY**\n**Father's Name:** KATTA SRINIVAS REDDY\n**Mother's Name:** KATTA UMARANI\n**Roll No.:** 1929100642\n**Date of Birth:** 03/06/2003 (Zero Three June Two Zero Zero Three)\n**School:** EKALAVYA FOUNDATION SCL NALGONDA, NALGONDA DISTRICT\n**Medium:** ENGLISH\n\nHas appeared and **PASSED SSC EXAMINATION** held in **MARCH–2019**\n\n\n### **The Candidate Secured the Following Grade and Grade Points in Curricular Areas:**"}, page_content="10th class Marks\n\n**Board of Secondary Education\nTelangana State, India**\n\n**SECONDARY SCHOOL CERTIFICATE**\n**REGULAR** PC/29/4222/04/256517/3\n**TS-EE 524495**\n\n---\n\n**CERTIFIED THAT**\n**KATTA SAI PRANAV REDDY**\n**Father's Name:** KATTA SRINIVAS REDDY\n**Mother's Name

In [None]:
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever
from pydantic import BaseModel
from typing import List


reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)  # ✅ FIX
        if not initial_docs:
            return []
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=custom_retriever,
    chain_type="stuff"  
)

query = "Give me the contact details"
response = qa_chain.run(query)

print("\n=== Answer ===")
print(response)


=== Answer ===
Here are the contact details:

📞 Phone: +91 93475 41040
📧 Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
💻 GitHub: [github.com/ka1817](https://github.com/ka1817)
🔗 LinkedIn: [linkedin.com/in/pranav-reddy-katta](https://www.linkedin.com/in/pranav-reddy-katta/)


In [22]:
from sentence_transformers import CrossEncoder
from langchain.schema import Document, BaseRetriever
from pydantic import BaseModel
from typing import List
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)
        if not initial_docs:
            return []
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

prompt_template = """
You are an expert assistant. Use the following context to answer the user's question.

Context:
{context}

Question:
{question}

Answer in a clear, structured, and detailed manner. If the context does not contain the answer,
say "I don't know based on the given context."
"""

custom_prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=custom_retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt}
)

query = "what certifications pranav reddy have done?"
response = qa_chain.run(query)

print("\n=== Answer ===")
print(response)




=== Answer ===
Pranav Reddy has completed the following certifications, all of which are from Udemy:

1. **Python for Data Science and Machine Learning**
2. **The Complete SQL Bootcamp**
3. **Generative AI with LangChain and HuggingFace**
4. **End-To-End MLOps Bootcamp**

These certifications demonstrate his interest and expertise in areas such as data science, machine learning, SQL, and artificial intelligence.


In [23]:
from sentence_transformers import CrossEncoder
from langchain.schema import Document, BaseRetriever
from pydantic import BaseModel
from typing import List
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain.schema import StrOutputParser

# ✅ Reranker
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)
        if not initial_docs:
            return []
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

# ✅ Build reranking retriever
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

# ✅ Custom Prompt
prompt_template = """
You are an expert assistant. Use the following context to answer the user's question.

Context:
{context}

Question:
{question}

Answer in a clear, structured, and detailed manner. If the context does not contain the answer,
say "I don't know based on the given context."
"""
custom_prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# ✅ LCEL RAG pipeline
rag_chain = (
    RunnableParallel({
        "context": custom_retriever,   # returns reranked docs
        "question": RunnablePassthrough()
    })
    | custom_prompt
    | llm
    | StrOutputParser()
)

# ✅ Ask question
query = "what certifications pranav reddy have done?"
response = rag_chain.invoke(query)

print("\n=== Answer ===")
print(response)



=== Answer ===
Based on the given context, Pranav Reddy has completed the following certifications, all of which are from Udemy:

1. **Python for Data Science and Machine Learning**
2. **The Complete SQL Bootcamp**
3. **Generative AI with LangChain and HuggingFace**
4. **End-To-End MLOps Bootcamp**

These certifications are mentioned in multiple documents within the context, indicating that they are a part of Pranav Reddy's educational background and skills.


In [32]:
len(embeddings.embed_query("who is kohli"))

768

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

if not PINECONE_API_KEY:
    raise ValueError("❌ Missing PINECONE_API_KEY in environment variables")

pc = Pinecone(api_key=PINECONE_API_KEY)
