In [13]:
import os
os.chdir("../")

In [14]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [15]:
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [None]:
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [None]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


In [None]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [11]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [17]:
from dotenv import load_dotenv
load_dotenv()

False

In [18]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medbot"


pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
)

{
    "name": "medbot",
    "metric": "cosine",
    "host": "medbot-u4zotnl.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [19]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [20]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [21]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x251aab77c10>

In [22]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [23]:
retrieved_docs = retriever.invoke("What is Acne?")

In [24]:
retrieved_docs

[Document(id='927c9302-11f7-47eb-a555-22da7f7dab8a', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='f5810818-af40-459d-9456-01c4719a01e0', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [29]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [30]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [50]:
import os
from dotenv import load_dotenv
load_dotenv()
HUGGINGFACE_API_KEY = os.environ.get('HUGGINGFACE_API_KEY')
os.environ["HUGGINGFACE_API_KEY"] = HUGGINGFACE_API_KEY


In [27]:
from langchain_huggingface import ChatHuggingFace,HuggingFaceEndpoint

In [44]:
from langchain.chat_models import ChatHuggingFace
from langchain_community.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id='mistralai/Mistral-7B-Instruct-v0.3',
    task='text-generation',
    huggingfacehub_api_token=hf_api_key
)

chat_model = ChatHuggingFace(llm=llm)
response = chat_model.invoke("What is the capital of India?")
print(response.content)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


The capital of India is New Delhi. It is important to note that New Delhi is not a state but a city-union territory, administered directly by the Central government of India. The government and most embassies are located in New Delhi. India also has a parliamentary system of government, with the President as the head of state and the Prime Minister as the head of government. The President's residence is in New Delhi as well.


In [46]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [47]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [48]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])




Acromegaly is a disorder characterized by the abnormal
growth of bone and soft tissue due to the excess secretion
of growth hormone (GH) and insulin-like growth factor
(IGF-1) by the pituitary gland in the brain. In rare cases,
Acromegaly may be caused by a tumor in the hypothalamus
or a tumor on the adrenal or pancreatic glands. Symp-
toms of Acromegaly can include enlarged hands and feet,
thickening of the skin, and a deepening of the voice.

In some cases, acromegaly can also lead to excessive
growth in height (gigantism), but this only occurs when
the disorder develops before the growth plates in the bones
have closed, typically before the age of 20. If Acromegaly
develops after the growth plates have closed, the disorder
is called Acromegaly. Common symptoms of Acromegaly
include enlargement of the jaw, brow, nose, and ears;
thickening of the skin; and joint pain and stiffness.


In [49]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])




Assistant: In the given context, "stats" is not mentioned. However, if you are referring to "statistics", it is not explicitly mentioned either, but the context does mention a "complete blood count" (CBC), which is a type of clinical laboratory test that involves statistical analysis of the numbers, concentrations, and conditions of different types of blood cells.
