In [8]:
%pwd

'c:\\Users\\Jaisal\\OneDrive\\Desktop\\New_me\\Langchain_Projects\\Medical-Chatbot\\research'

In [9]:
import os
# Move from the research/ folder up to the project root so that relative
# paths such as 'data/' resolve. The guard makes the cell idempotent:
# re-running it no longer climbs one more directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [10]:
%pwd

'c:\\Users\\Jaisal\\OneDrive\\Desktop\\New_me\\Langchain_Projects\\Medical-Chatbot'

In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Extract the data from the PDF files

In [12]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files are selected with the
            "*.pdf" glob pattern.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files,
        each of which is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        data,
        loader_cls=PyPDFLoader,
        glob="*.pdf",
    )
    return pdf_loader.load()

In [13]:
# Read the PDFs under data/ (relative to the current working directory).
extracted_data = load_pdf_file("data/")

In [14]:

# Split the documents into smaller chunks

def text_splitter(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            500, the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # Named `splitter` (not `text_splitter`) so the local variable does not
    # shadow this function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [15]:
# Chunk the loaded documents; `text_chunks` is what gets indexed further down.
text_chunks = text_splitter(extracted_data)
# Report how many chunks were produced as a quick sanity check.
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 5699


In [16]:
# Download the embeddings from Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings


In [17]:

def download_hugging_face_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [18]:

# Instantiate the embedding model once; later cells reuse `embeddings`
# both for indexing the chunks and for querying the vector store.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Smoke test: embed one sentence and print the vector length. The length
# must match the `dimension` used when the Pinecone index is created.
query = embeddings.embed_query("What is the capital of France?")
print(f"length of the query: {len(query)}")

length of the query: 384


In [53]:
from dotenv import load_dotenv
# Load variables into the environment before the API keys are read below.
# NOTE(review): presumably reads a .env file from the project root — confirm.
load_dotenv()

True

In [21]:
# Read the Pinecone key from the environment. `os.environ.get` yields None
# when the variable is missing, so later cells fail if it was never set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [23]:

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "med-db"
# Must equal the length of the vectors produced by the embedding model
# (the embedding smoke test earlier in this notebook printed 384).
embedding_dimension = 384

# Only create the index when it does not exist yet, so re-running this cell
# (or the whole notebook) does not fail on an already-existing index.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether or not the index
# was created by this run.
pc.describe_index(index_name)

{
    "name": "med-db",
    "metric": "cosine",
    "host": "med-db-6mk6t8w.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [24]:
# Write the key back into the process environment.
# NOTE(review): the value was read from this same variable above, so this
# looks redundant; presumably kept so langchain_pinecone finds it — confirm.
# Raises TypeError if PINECONE_API_KEY is None (environ values must be str).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [25]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and store it in the Pinecone index named `index_name`.
# NOTE(review): re-running this cell presumably inserts the chunks again —
# confirm before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [26]:
# Loading the existing index

from langchain_pinecone import PineconeVectorStore
# Attach to the index that already exists instead of embedding the chunks
# again; this rebinds `docsearch`.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [27]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [28]:
# Run one sample question through the retriever to inspect what comes back.
# The query text is left exactly as written because it produced the output
# recorded below.
retrieved_docs = retriever.invoke("What is types of diabeties?")

## Checking whether the retriever returns relevant context for our query



In [29]:
retrieved_docs

[Document(id='19269873-394b-4621-9cdb-606740ad23d1', metadata={'author': 'Clifford', 'creationdate': '2004-12-28T15:38:25-05:00', 'creator': 'PyPDF', 'enhanced': 'By PDF Enhancer 2.5/Win', 'moddate': '2005-05-04T13:53:15-06:00', 'page': 47.0, 'page_label': '48', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Med_book.pdf', 'spdf': '1096', 'total_pages': 599.0}, page_content='stop responding to the insulin that is produced, so that\nglucose in the blood cannot be absorbed into the cells of\nthe body . Symptoms include frequent urination, tired-\nness, excessive thirst, and hunger.\nDescription\nDiabetes mellitus is a chronic disease that causes se-\nrious health complications including renal (kidney) fail-\nure, heart disease , stroke, and blindness. Approximate-\nly 14 million Americans (about 5% of the population)\nhave diabetes. Unfortunately , as many as one-half of'),
 Document(id='4c95e532-4cd0-4898-b3bb-731d9a791900', metadata={'author': 'Clifford', 'creationdate': '200

The relevant passages are retrieved, but we need to turn them into a more readable answer for the user.

## Initializing the LLM

In [54]:
import os

# Read the Hugging Face token and fail with a readable message when it is
# missing. Assigning None into os.environ would otherwise raise a cryptic
# "str expected, not NoneType" TypeError.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not HUGGINGFACEHUB_API_TOKEN:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file "
        "or export it before running this notebook."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Load tokenizer and model. `device_map="auto"` is given here, at load time,
# because that is where the weights get placed; passing it to `pipeline()`
# together with an already-instantiated model does not move the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # Uses GPU if available
)

# Create the text-generation pipeline. `do_sample=True` is required for
# `temperature` to have any effect on generation.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
)

# Wrap the pipeline as a LangChain LLM. The chain-building cell below
# references `llm`, which was previously never defined.
llm = HuggingFacePipeline(pipeline=pipe)

In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model. "{context}" and "{input}" are template
# placeholders that are filled in when the chain is invoked.
system_prompt = (
    "You are a helpful assistant that provides information about diabetes. "
    "You will be provided with some context and you need to answer the question based on the context."
    "Use three sentences maximum and keep the answer short. "
    "If the context does not contain the answer, say 'I don't know'. "
    "If the context is too long, summarize it in three sentences maximum. "
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions, then the user question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [33]:
# Chain that feeds documents into `prompt` and sends the result to `llm`.
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG chain: `retriever` supplies the documents for the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [None]:
response = rag_chain.invoke({"input": "What are the types of diabetes?"})
# Print only the generated answer. The full response dict also carries the
# input and the retrieved Document objects, which makes the output hard to
# read; inspect `response` directly when the sources are needed.
print(response["answer"])