In [None]:
# Kernel smoke test: prints a fixed string and has no effect on later cells.
print("Hey")

In [None]:
# IPython line magic: show the kernel's current working directory.
%pwd

In [None]:
import os

# Move the kernel to the parent directory so relative paths such as 'Data/'
# resolve from there.
# A bare os.chdir("../") climbs one more level every time the cell is re-run,
# so remember the directory the kernel started in and always navigate
# relative to that anchor instead. This makes the cell idempotent.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [None]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract the data from the PDF files

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files are selected with the
            "*.pdf" glob and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (the loaded documents).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
    

In [None]:
# Load the PDFs from the 'Data/' directory (relative to the current working directory).
extracted_data = load_pdf_file('Data/')

In [None]:
#extracted_data

In [None]:
# Split the data into text chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: The documents to split (e.g. the output of
            ``load_pdf_file``).
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.
            Defaults to 20, the value this notebook has always used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print('Length of the text chunks is : ',len(text_chunks))

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Download the embeddings from Hugging Face

def download_hugging_face_embedings():
    """Create the embedding model used for both indexing and querying.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance wrapping
        'sentence-transformers/all-MiniLM-L6-v2'.
    """
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        # The BGE wrapper prepends a BGE-specific retrieval instruction to
        # every query by default. all-MiniLM-L6-v2 is not a BGE model and the
        # documents are embedded without any instruction, so blank it out to
        # keep query and document embeddings consistent.
        query_instruction='',
    )
    return embeddings

In [None]:
# Instantiate the embedding model once; later cells reuse this `embeddings` object.
embeddings=download_hugging_face_embedings()

In [None]:
# Embed a sample query and print the vector length. The Pinecone index
# `dimension` configured later in this notebook has to match this number.
query_result = embeddings.embed_query("Hello world")
print('Length ', len(query_result))  # fixed the misspelled label ('Lenth')

In [None]:
# query_result

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) so the API keys can be read
# from os.environ in the following cell.
load_dotenv()

In [None]:
# Read both service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medbot"

# Creating an index that already exists is rejected by Pinecone, which made
# this cell fail on every run after the first (e.g. Restart & Run All).
# Only create the index when it is missing so the cell is re-runnable.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length (see the len(query_result)
        # check earlier in this notebook).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
import os
# Re-export the keys so client libraries that read os.environ can find them.
# Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType", so fail early with a message that
# names the missing key instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; add it to your .env file or export it "
            "before running this notebook."
        )
    os.environ[_key_name] = _key_value

In [None]:
# Embed each chunk and upsert the embeddings to Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunks, writing into the index named above.
# NOTE(review): re-running this cell presumably upserts the same chunks again;
# confirm whether that creates duplicates before running it more than once.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [None]:
#Load existing index

from langchain_pinecone import PineconeVectorStore
# Rebind `docsearch` to the already-populated index.
docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)

In [None]:
# Display the vector store object as a quick check that `docsearch` is bound.
docsearch

In [None]:
# Similarity-search retriever over the vector store, configured with k=5.
retriever = docsearch.as_retriever(
    search_kwargs=dict(k=5),
    search_type="similarity",
)

In [None]:
# Smoke-test the retriever on its own with a sample question.
retrived_docs=retriever.invoke("What is Acne?")

In [None]:
# Inspect what the retriever returned for the sample question.
retrived_docs

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used to generate the final answers.
# NOTE(review): confirm that max_output_tokens=500 leaves enough room for a
# complete answer with this model.
llm = ChatGoogleGenerativeAI(
    google_api_key=os.environ["GOOGLE_API_KEY"],
    model="gemini-2.5-flash",
    max_output_tokens=500,
    temperature=0.4,
)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message: the answering instructions, then a blank line, then the
# "{context}" placeholder that the prompt template fills in.
system_prompt = "\n\n".join(
    [
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, say that you don't know. "
        "Use three sentences maximum and keep the answer concise.",
        "{context}",
    ]
)


# Two-message chat template: the system instructions (with the context slot)
# followed by the user's question in the "{input}" slot.
chat_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)


In [None]:
# Stage 1: chain that feeds the documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Stage 2: put the retriever in front of it to form the full RAG pipeline.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Run the full RAG chain for one question; the generated text is read from
# the "answer" key of the result.
response=rag_chain.invoke({"input":"What is Colonic irrigation?"})
print(response["answer"])