In [8]:
# Show the kernel's current working directory before it is changed below.
%pwd

'c:\\Users\\patel\\OneDrive\\Desktop\\Projects\\CareBot\\research'

In [9]:
import os
os.chdir('../')
%pwd

'c:\\Users\\patel\\OneDrive\\Desktop\\Projects\\CareBot'

In [10]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

Extracting the Data (Gale Encyclopedia of Medicine).

In [11]:
def load_pdf(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [12]:
# Read the source PDFs from data/ (relative to the current working directory).
extracted_data = load_pdf('data/')

Chunking the data

In [13]:
def text_chunking(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, e.g. the value returned by
            ``load_pdf``.
        chunk_size: ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter`` (default 500).
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter`` (default 20).

    Returns:
        The result of ``split_documents`` applied to ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [14]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = text_chunking(extracted_data)
print('Length of the text chunks:', len(text_chunks))

Length of the text chunks: 5859


In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_huggingface_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Construct the HuggingFace embedding model used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook uses. The Pinecone index
            in this notebook is created with ``dimension=384``, so a
            different model needs to produce vectors of that length.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Backward-compatible alias: keeps callers that use the original
# (misspelled) name working.
donwload_huggingface_embeddings = download_huggingface_embeddings

In [16]:
# Build the embedding model once; the same object is reused for the test
# query and for the Pinecone vector store later in the notebook.
embeddings = donwload_huggingface_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name= 'sentence-transformers/all-MiniLM-L6-v2')


In [17]:
# Sanity check: embed a throwaway string and print the vector length.
# The Pinecone index below is created with dimension=384, so this is the
# number to compare against.
query_result = embeddings.embed_query("Hello World!")
print('length :' , len(query_result))


length : 384


Creating the index using Pinecone

In [18]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os

# Load variables from a .env file into the environment, then read the key.
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "carebot"

# Create the serverless index only when it does not exist yet.
# dimension=384 matches the vector length printed by the embedding check.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [19]:
import os

# Re-export the API keys into the process environment.
# os.environ only accepts str values: assigning None (what os.environ.get
# returns for an unset variable) raises TypeError, so only export keys
# that are actually present.
if PINECONE_API_KEY is not None:
    os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

# The LLM used later in this notebook is Ollama, so a missing OpenAI key
# must not stop the notebook here.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

Creating Vector Store

In [20]:
from langchain_pinecone import PineconeVectorStore

# from_documents embeds and upserts every chunk each time it runs, and no
# ids are supplied here, so re-executing this cell would insert the whole
# corpus a second time. Ingest only when the index is still empty;
# otherwise attach to the vectors that are already stored.
# NOTE(review): an index left partially filled by an interrupted run also
# counts as "not empty" — clear it manually before re-ingesting.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    vs = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    vs = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

Loading the Vector Database 

In [21]:
# Attach to the existing index by name; nothing is re-ingested here.
vs = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)
vs

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1f9a98d46d0>

In [22]:
# Similarity-search retriever configured with k=3 results per query.
retriever = vs.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

In [23]:
# Smoke-test retrieval: fetch the documents returned for a sample question
# and display them.
docs = retriever.invoke('What is back Acne?')
docs

[Document(id='86e68a25-4567-4f62-8079-34a06734c78b', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='b2195c5e-442e-4343-9cc3-9c5f2f3e53a8', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteri

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the RAG chain. "{context}" is the placeholder that
# receives the retrieved documents; "{input}" in the human message is the
# user's question (the key passed to rag_chain.invoke).
# Adjacent string literals inside the parentheses are concatenated, so no
# backslash continuation is needed.
system_prompt = (
    "You are an assistant for the question answer tasks. Use the following pieces of retrieved context to answer the question. If you don't "
    "know the answer, say that you don't know. Use three sentences maximum and keep the answer concise. \n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', '{input}'),
    ]
)

In [25]:
from langchain_community.llms import Ollama
# LLM used by the question-answering chain: the 'mistral' model via Ollama.
# NOTE(review): presumably needs a local Ollama server with this model
# already pulled — confirm before running.
llm = Ollama(model = 'mistral')

In [29]:
# Chain that fills the prompt with the retrieved documents and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
print(question_answer_chain)
# Full RAG pipeline: retrieve documents for the input, then answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for the question answer tasks. Use the following pieces of retrieved context to answer the question. If you don't know the asnwer, say that you don't know. Use three sentences maximum and keep the answer concise. \n\n{context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| Ollama(model='mistral')
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factories=[]


In [27]:
# Run the whole pipeline on a sample question; the generated text is stored
# under the 'answer' key of the result.
response = rag_chain.invoke({'input': 'What is Acne'})
print(response['answer'])

 Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne vulgaris, also known as common acne, is the most prevalent form and affects nearly 17 million people in the U.S.
