In [1]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import os 
os.environ['GROQ_API_KEY']=os.getenv("GROQ_API_KEY")
loader = TextLoader("../data/info.txt")
documents = loader.load()



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)



In [3]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectorstore = FAISS.from_documents(docs, embeddings)





  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")


In [4]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.invoke(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)


In [5]:
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",streaming=True
)

from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}
Helpful Answer:""",
    input_variables=["context", "question"]
)
rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)




In [7]:
# --- 8. Run with streaming ---
query = "Give me contact Info"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
To get in touch with Katta Sai Pranav Reddy, you can use the following contact information:

* Phone: +91 93475 41040
* Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
* GitHub: [github.com/ka1817](https://github.com/ka1817)
* LinkedIn: [linkedin.com/in/pranav-reddy-katta](https://www.linkedin.com/in/pranav-reddy-katta/)

Feel free to reach out to him through any of these channels.

In [10]:
query = "Give me the github link of pranav reddy"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
The GitHub link of Pranav Reddy is not explicitly provided, but the GitHub repository for his project, "BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping", can be accessed through the link: 🔍 GitHub Repo: BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping. 

However, a more direct link is not provided in the context. Nevertheless, another GitHub link for his personal project, "Netflix Customer Churn Prediction – End-to-End ML System", is mentioned as *[GitHub]*, but the actual link is not provided. 

If you are looking to access Pranav Reddy's GitHub profile or repositories, you may need to search for his username, which could be "pranavreddy123" based on his DockerHub repository (🐳 DockerHub: pranavreddy123/bigbasket-assistant).

In [4]:
query = "How He Deployed Netflix Churn Prediction Project"

print("Answer (streaming):")
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)


Answer (streaming):
Katta Sai Pranav Reddy deployed the Netflix Customer Churn Prediction project using a containerized approach with Docker. The project utilizes a production-grade, explainable, and reproducible Machine Learning (ML) pipeline that incorporates various tools and technologies for efficient deployment.

Here's an overview of the deployment process:

1. **CI/CD**: The project implements Continuous Integration/Continuous Deployment (CI/CD) to ensure seamless and automated testing, building, and deployment of the ML model.
2. **Experiment Tracking (MLflow)**: MLflow is used to track experiments, manage models, and monitor performance. This allows for easy comparison of different models, hyperparameters, and experiment results.
3. **Data Versioning (DVC)**: DVC is used for data versioning, which enables the tracking of changes to the data and ensures reproducibility of the results.
4. **Docker**: The project is containerized using Docker, which provides a lightweight and por

In [9]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import faiss
import numpy as np
import os

# --- 0. API Key ---
if not os.getenv("GROQ_API_KEY"):
    raise ValueError("Please set GROQ_API_KEY in your environment")


# --- 1. Load & Split Documents ---
loader = TextLoader("../data/info.txt")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)
texts = [d.page_content for d in docs]


# --- 2. Embeddings ---
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectors = embeddings.embed_documents(texts)  # list of embeddings
dim = len(vectors[0])


# --- 3. Build FAISS IVF Index ---
nlist = 5   # number of clusters
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)

# IVF requires training
index.train(np.array(vectors).astype("float32"))

# Add vectors
index.add(np.array(vectors).astype("float32"))

# Default search: probe only a few clusters
index.nprobe = 10


# --- 4. Wrap IVF Index into LangChain ---
from langchain.vectorstores.faiss import FAISS

# Map documents to their vector IDs
docstore = {}
index_to_docstore_id = {}
for i, doc in enumerate(docs):
    doc_id = str(i)
    docstore[doc_id] = doc
    index_to_docstore_id[i] = doc_id

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)


# --- 5. CrossEncoder Reranker ---
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs).tolist()
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs


# --- 6. Custom Reranker Retriever ---
class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    class Config:
        arbitrary_types_allowed = True  # allow FAISS retriever

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]


base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)


# --- 7. LLM ---
llm = ChatGroq(model_name="llama-3.3-70b-versatile", streaming=True)


# --- 8. Prompt ---
prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}

Helpful Answer:""",
    input_variables=["context", "question"]
)


# --- 9. RAG Chain ---
rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)


# --- 🔹 Example Usage ---
if __name__ == "__main__":
    query = "What are Pranav's main technical skills?"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


  initial_docs = self.base_retriever.get_relevant_documents(query)


AttributeError: 'dict' object has no attribute 'search'

In [28]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from typing import List
from pydantic import BaseModel
from sentence_transformers import CrossEncoder

from langchain.schema import Document, BaseRetriever
from langchain.docstore import InMemoryDocstore
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

import numpy as np
import faiss
import os 
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")
loader = TextLoader("../data/info.txt")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
docs = text_splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
vectors = embeddings.embed_documents([d.page_content for d in docs])
dim = len(vectors[0])
num_docs = len(vectors)
if num_docs > 20:  
    nlist = min(10, num_docs // 2)  
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)

    index.train(np.array(vectors).astype("float32"))
    index.add(np.array(vectors).astype("float32"))

    index.nprobe = max(1, nlist // 2)
else:
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

docstore = InMemoryDocstore()
index_to_docstore_id = {}

for i, doc in enumerate(docs):
    doc_id = str(i)
    docstore.add({doc_id: doc})
    index_to_docstore_id[i] = doc_id

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.invoke(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]

base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=3)

llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    streaming=True
)

prompt = PromptTemplate(
    template="""
You are a smart and knowledgeable AI assistant helping users understand 
the professional background, projects, skills, and certifications of Katta Sai Pranav Reddy.

Use the following context extracted from Pranav's profile and provide a clear, helpful, and detailed answer.

Context:
{context}

Question: {question}
Helpful Answer:""",
    input_variables=["context", "question"]
)

rag_chain = (
    RunnableParallel({
        "context": custom_retriever,
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)

if __name__ == "__main__":
    query = "What are Pranav's main technical skills?"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


Answer:
 Pranav's main technical skills can be categorized into several areas:

1. **Tools:** He is proficient in using tools such as:
	* MLflow
	* DVC
	* Docker
	* Git
	* GitHub Actions
	* AWS (EC2, S3, ECR)
	* FAISS
	* Pinecone
	* Hugging Face
	* LangChain
	* LangSmith
	* FastAPI
2. **Programming & Technical Skills:** He has expertise in:
	* Python
	* SQL
	* HTML
	* CSS
	* Scikit-learn
	* TensorFlow
	* Keras
	* Statistics
3. **Data Science & Machine Learning:** He is skilled in:
	* Data Preprocessing
	* Exploratory Data Analysis (EDA)
	* Feature Engineering
	* Model Training & Evaluation
	* Hyperparameter Tuning
	* Clustering
	* MLOps
	* Semantic Search
	* Retrieval-Augmented Generation (RAG)
	* CNN
	* RNN
	* GPT
	* Transformers
	* Fine-Tuning
	* Prompt Engineering
4. **Data Visualization & Analysis:** He is proficient in:
	* Pandas
	* NumPy
	* Matplotlib
	* Seaborn

These technical skills demonstrate Pranav's strong foundation in data science, machine learning, and programming, maki

In [11]:
if __name__ == "__main__":
    query = "How Did Pranav Reddy Deployed His BigBasket SmartCart Application"
    answer = rag_chain.invoke(query)
    print("Answer:\n", answer)


Answer:
 Pranav Reddy deployed his BigBasket SmartCart application using a combination of containerization, automation, and cloud computing. Here's a step-by-step overview of his deployment process:

1. **Containerization using Docker**: Pranav containerized his application using Docker, which allowed him to package the application and its dependencies into a single container. This ensured that the application was isolated and portable, making it easy to deploy on different environments.
2. **Automated CI/CD using GitHub Actions**: Pranav used GitHub Actions to automate his Continuous Integration and Continuous Deployment (CI/CD) pipeline. He created a workflow file (.github/workflows/ci-cd.yml) that defined the build, test, and deployment process. The workflow was triggered on push events to the main branch or pull requests.
3. **Building and Pushing Docker Image**: As part of the CI/CD pipeline, GitHub Actions built the Docker image and pushed it to DockerHub (pranavreddy123/bigbaske