In [None]:
# print("Okay!")

In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
import os
os.chdir("../")
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Extract Data from the MedBook PDF File

In [None]:
def loadpdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory to scan. Files matching the ``*.pdf``
            glob are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the 'Data/' folder (relative to the current working directory).
extracted_data= loadpdf_file(data='Data/')

In [None]:
# extracted_data

# Split the Data into Text Chunks

In [None]:
def txt_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split, as returned by the PDF loader.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [None]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = txt_split(extracted_data)
print("Length of Text Chunks: ", len(text_chunks))

In [None]:
# text_chunks

# Download the Embeddings from Hugging Face

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_hug_face_embeddings():
    """Build the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

In [None]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hug_face_embeddings()

In [None]:
# Sanity check: embed a sample string and print the length of the resulting vector.
# NOTE(review): this length is presumably the value the vector index `dimension`
# must match — confirm against the index configuration.
query_res= embeddings.embed_query("Hello World")
print("Length: ", len(query_res))
# print("Vector: ", query_res)

# Initialize the Pinecone Vector Database

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API keys
# can be read with os.environ below.
load_dotenv()

In [None]:
# Read the API keys from the environment; each is None when the variable is unset.
PINECONE_APIKEY = os.getenv("PINECONE_APIKEY")
OPENAI_APIKEY = os.getenv("OPENAI_APIKEY")

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_APIKEY)

index_name = "medicalbot"

# create_index raises when the index already exists, so only create it when it
# is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Replace with your model dimensions
        metric="cosine",  # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
import os
# Fail early with a readable message instead of the opaque TypeError that
# assigning None to os.environ would raise.
if PINECONE_APIKEY is None or OPENAI_APIKEY is None:
    raise ValueError(
        "PINECONE_APIKEY and OPENAI_APIKEY must both be set (e.g. in the .env file)."
    )

os.environ["PINECONE_APIKEY"] = PINECONE_APIKEY
os.environ["OPENAI_APIKEY"] = OPENAI_APIKEY

# langchain_pinecone and langchain_openai read the keys from PINECONE_API_KEY and
# OPENAI_API_KEY, so export them under those names too. setdefault leaves any
# value the user already configured untouched.
os.environ.setdefault("PINECONE_API_KEY", PINECONE_APIKEY)
os.environ.setdefault("OPENAI_API_KEY", OPENAI_APIKEY)

In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index
from langchain_pinecone import PineconeVectorStore

# from_documents takes the chunks through its `documents` parameter; passing
# them as `docs=` raises a TypeError for the missing required argument.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

# Load the existing index

In [None]:
from langchain_pinecone import PineconeVectorStore

# Connect to the index that already exists, without re-uploading any documents.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [None]:
# Display the vector store object.
docsearch

In [None]:
# Similarity-search retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Try the retriever on its own with a sample question, before any LLM is involved.
retrieved_data=retriever.invoke("What is Acne?")

In [None]:
# Display what the retriever returned for the sample question.
retrieved_data

# Initializing OpenAI Model

In [None]:
from langchain_openai import OpenAI
# OpenAI completion model used to generate the answers.
llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model. Adjacent string literals are concatenated
# as-is, so each fragment ends with a space to keep words from fusing together
# (e.g. "tasks.Use", "youdon't"). "{context}" is a template placeholder.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved content to answer "
    "the questions . If you don't know the answer , say that you "
    "don't know .  Use three sentences maximum and keep the "
    "answers concise ."
    "\n\n"
    "{context}"
)

# Chat prompt: the system message carries the instructions and the "{context}"
# placeholder, the human message carries the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [None]:
# Chain that combines the documents with the prompt and sends them to the LLM.
question_answering_chain = create_stuff_documents_chain(llm,prompt)
# Full pipeline: the retriever feeds the question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [None]:
# Ask a sample question and print the "answer" entry of the response.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

In [None]:
# NOTE(review): presumably an out-of-domain question to check that the model
# says it does not know — confirm the intent.
response = rag_chain.invoke({"input": "What is Operating System?"})
print(response["answer"])