In [1]:
!pwd

/f/python_venv/Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS/research


In [2]:
import os
os.chdir("../")
!pwd

/f/python_venv/Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS


In [3]:
# load the pdf data
from langchain.document_loaders import PyPDFLoader, DirectoryLoader


In [4]:
# extract text from pdf files.
def load_pdf_files(pdf_path):
    """Load the PDF files found under ``pdf_path`` and return the documents.

    The directory is scanned with a ``DirectoryLoader`` configured with:
      * path: the directory to read from (``pdf_path``).
      * glob: ``"*.pdf"``, the pattern used to pick files.
      * loader_cls: ``PyPDFLoader``, the loader applied to each matched file.

    Returns whatever ``DirectoryLoader.load()`` produces.
    """
    pdf_directory_loader = DirectoryLoader(
        path=pdf_path, glob="*.pdf", loader_cls=PyPDFLoader
    )
    return pdf_directory_loader.load()


In [5]:
extracted_pdf_data = load_pdf_files("data")

In [6]:
type(extracted_pdf_data)

list

In [7]:
extracted_pdf_data[:3]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 2, 'page_label': '3'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1')]

In [8]:
len(extracted_pdf_data)

637

In [9]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of every Document in ``docs``.

    Each returned Document keeps the original ``page_content``; its metadata is
    reduced to a single ``'source'`` entry (``None`` when the input metadata has
    no such key). The input list itself is left untouched.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]



In [10]:
minimal_docs = filter_to_minimal_docs(extracted_pdf_data)
print(len(minimal_docs))
minimal_docs[:2]

637


[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content=''),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION')]

In [11]:
minimal_docs[2:10]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A. Quin, Editor, Imaging and Multimedia Content\nLeitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,\nImage Catalogers\nPamela A. Reed, Imaging Coordinator\nRandy Bassett, Imaging S

In [12]:
# split the documents into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500 (characters per chunk), the value that used to be hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
            to 20 (characters shared between neighbouring chunks), the value
            that used to be hard-coded.

    Returns:
        The result of ``split_documents`` for ``minimal_docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [13]:
text_chunks = text_split(minimal_docs=minimal_docs)
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 5859


In [14]:
text_chunks[:10]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='Multimedia

In [15]:
# get the embeddings model.
# Embedding models are machine learning models that transform data 
# (like text, images, or audio) into numerical representations called embeddings. 
# These embeddings capture the semantic meaning and relationships within the data, 
# allowing machines to understand and compare them in a meaningful way.

from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Build and return the HuggingFace embeddings model.

    The default model is all-MiniLM-L6-v2, a sentence-transformers model: it
    maps sentences & paragraphs to a 384 dimensional dense vector space and can
    be used for tasks like clustering or semantic search.

    Args:
        model_name: Name passed to ``HuggingFaceEmbeddings(model_name=...)``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.

    NOTE(review): the cell output shows a warning pointing at the
    ``HuggingFaceEmbeddings(`` call - presumably a deprecation notice for this
    import path; confirm and consider migrating to the suggested package.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Backward-compatible alias: the function was first defined under this
# misspelled name and existing cells still call it.
download_mebeddings = download_embeddings

embedding_model = download_mebeddings()


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [17]:
"""
embed_query()
Compute query embeddings using a HuggingFace transformer model.

Args:
    text: The text to embed.

Returns
    out: Embeddings for the text.
"""
vector = embedding_model.embed_query("Hello World")
vector

[-0.03447727486491203,
 0.03102312609553337,
 0.006734980270266533,
 0.026108933612704277,
 -0.03936205804347992,
 -0.16030246019363403,
 0.06692394614219666,
 -0.006441438104957342,
 -0.047450482845306396,
 0.014758863486349583,
 0.07087534666061401,
 0.05552757531404495,
 0.019193356856703758,
 -0.02625126577913761,
 -0.01010954286903143,
 -0.026940442621707916,
 0.022307462990283966,
 -0.02222665585577488,
 -0.14969263970851898,
 -0.017493024468421936,
 0.007676282897591591,
 0.054352231323719025,
 0.0032544038258492947,
 0.03172588348388672,
 -0.08462139964103699,
 -0.029405992478132248,
 0.051595550030469894,
 0.048124078661203384,
 -0.003314835485070944,
 -0.05827915295958519,
 0.04196925833821297,
 0.022210702300071716,
 0.1281888633966446,
 -0.022338951006531715,
 -0.011656239628791809,
 0.06292837113142014,
 -0.03287634998559952,
 -0.09122604131698608,
 -0.03117534890770912,
 0.052699536085128784,
 0.04703483358025551,
 -0.08420310169458389,
 -0.030056182295084,
 -0.0207448396

In [18]:
print(len(vector))

384


In [19]:
# store the vectors to the Pinecone vector database.

In [20]:
# load the Pinecone API key
from dotenv import load_dotenv
import os
load_dotenv()


True

In [21]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): later in this notebook OPENAI_API_KEY is passed as the
# Cloudflare bearer token, so the variable name is misleading - confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail early with a readable message. Without this check a missing variable
# makes the os.environ assignments below raise an opaque TypeError, because
# environment values must be strings and os.getenv() returned None.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the .env file or in the shell environment."
    )

# Re-export so libraries that read the keys from the environment can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [77]:
# Confirm the key is loaded without echoing the secret: a saved notebook keeps
# cell output, and the previous output of this cell exposed the raw key.
# NOTE(review): treat the previously displayed key as compromised and rotate it.
bool(PINECONE_API_KEY)

In [22]:
# Confirm the key is loaded without echoing the secret: a saved notebook keeps
# cell output, and the previous output of this cell exposed the raw key.
# NOTE(review): treat the previously displayed key as compromised and rotate it.
bool(OPENAI_API_KEY)

In [23]:
# create an index in Pinecone.
from pinecone import Pinecone

# initialize a pinecone client with your API key.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [26]:
# create a database 
# Create a dense index with integrated embedding
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

In [None]:


"""
create_index(): Creates a Pinecone index.

Parameters
    name : str
        The name of the index to create. Must be unique within your project and cannot be 
        changed once created. Allowed characters are lowercase letters, numbers, and 
        hyphens and the name may not begin or end with hyphens. Maximum length is 45 
        characters.
    metric : str, optional
        Type of similarity metric used in the vector index when querying, one of 
        {"cosine", "dotproduct", "euclidean"}.
    spec : Dict
        A dictionary containing configurations describing how the index should be deployed. 
        For serverless indexes, specify region and cloud. For pod indexes, specify replicas, 
        shards, pods, pod_type, metadata_config, and source_collection. Alternatively, use 
        the ServerlessSpec, PodSpec, or ByocSpec objects to specify these configurations.
    dimension : int
        If you are creating an index with vector_type="dense" (which is the default), you 
        need to specify dimension to indicate the size of your vectors. This should match 
        the dimension of the embeddings you will be inserting. For example, if you are 
        using OpenAI's CLIP model, you should use dimension=1536. Dimension is a 
        required field when creating an index with vector_type="dense" and should not 
        be passed when vector_type="sparse".
"""
# Create the index on first use only; later runs reuse the existing one.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # dimension of the embeddings
        metric="cosine",  # cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, used for direct operations on it.
index = pc.Index(index_name)

In [None]:
# store the vector.
from langchain_pinecone import PineconeVectorStore

# take all of the text chunks, use the embedding model to convert the chunks 
# into vector embedding, and store it in Pinecone vector database.
# Only ingest while the index is still empty. Calling from_documents() again
# on a populated index embeds and inserts every chunk a second time, which
# leaves duplicate vectors competing for the top-k retrieval slots.
# NOTE(review): index statistics may lag briefly behind recent writes -
# confirm this is acceptable for how the notebook is re-run.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embedding_model,
        index_name=index_name,
    )
else:
    # Data is already stored: attach to the existing index instead.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model,
    )

In [27]:
from langchain_pinecone import PineconeVectorStore

# Once we are done storing the vectors in pinecone vector database, 
# we usually want to:
#     Run similarity search
#     Query existing data
#     Use retrieval for a chatbot or RAG pipeline
# To do that, we do not need to recreate the index from documents — 
# instead, we just connect to it using from_existing_index()

# Attach to the already-populated index; nothing is embedded or uploaded here.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding_model,
    index_name=index_name,
)

In [None]:
# # if we want to add more data to the existing pinecone index,
# # another pdf, etc:

# extra_document = Document(
#     page_content="karansinh padhiar is a passionate data scientist.",
#     metadata = {"source": "Personal"}
# )
# docsearch.add_documents(documents=[extra_document])
# # add_documents() appends the content above to the existing vector database
# # and returns a list of IDs, one for each document that was added.
# # Each ID is the unique key under which that document (sentence) is stored.

['e04deacd-1046-4080-becf-9d0054a65810']

In [28]:
# Everything is now stored in our knowledge base: the Pinecone vector database.
# Next, we create a retriever and connect the LLM.
# Similarity-search retriever; k=3 asks for the 3 most similar chunks per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)


In [29]:
# Invoke the retriever to get relevant documents.
retrieved_docs = retriever.invoke(input="What is Acne?")
# it will return 3 most similar responses
retrieved_docs

[Document(id='5b245592-fc55-4e5f-ab4e-0672d89b9215', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='e8324649-ad85-499c-ba1d-1efc0b6fc362', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='c5902bc6-2e29-4a94-8cfe-ef7c61360876', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged wi

In [None]:
# # connect the LLM
# from langchain_openai import ChatOpenAI
# chat_model = ChatOpenAI(model="@cf/openai/gpt-oss-120b")

from langchain_core.language_models import LLM
from typing import Optional, List
import requests
import json

class CloudflareLLM(LLM):
    """Minimal LangChain ``LLM`` that runs a prompt through Cloudflare's AI REST API.

    Each call POSTs ``{"input": prompt}`` to
    ``https://api.cloudflare.com/client/v4/accounts/<cloudflare_user_id>/ai/run/<model>``
    with ``api_key`` as a bearer token, and returns the text of the first
    content part of the last item in the response's ``result.output`` list.
    """

    # Account identifier interpolated into the request URL.
    cloudflare_user_id: str
    # Token sent as "Authorization: Bearer <api_key>".
    api_key: str
    # Model path appended after ".../ai/run/".
    model: str = "@cf/openai/gpt-oss-120b"
    # Seconds to wait for the HTTP call. Without a timeout a stalled
    # connection would block the notebook indefinitely.
    request_timeout: float = 120.0

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager=None,
        **kwargs,
    ) -> str:
        """Send ``prompt`` to the configured model and return the generated text.

        Args:
            prompt: Text sent as the ``"input"`` field of the JSON body.
            stop: Accepted for interface compatibility; it is not forwarded to
                the API and not applied to the returned text.
            run_manager: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            ``result["result"]["output"][-1]["content"][0]["text"]`` from the
            JSON response.

        Raises:
            ValueError: If the response body is not valid JSON.
            RuntimeError: If the response does not report success, or if it
                lacks the expected ``result.output[-1].content[0].text`` path.
        """
        url = f"https://api.cloudflare.com/client/v4/accounts/{self.cloudflare_user_id}/ai/run/{self.model}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = json.dumps({
            "input": prompt
        })
        response = requests.post(
            url, headers=headers, data=payload, timeout=self.request_timeout
        )

        # Parse the body before looking at the status code: the error details
        # are read from the JSON "errors" field below.
        try:
            result = response.json()
        except ValueError as exc:
            raise ValueError(
                f"Cloudflare returned a non-JSON response (HTTP {response.status_code})"
            ) from exc

        # A missing "success" key is treated as a failure instead of raising
        # a bare KeyError.
        if not isinstance(result, dict) or not result.get("success"):
            errors = result.get("errors") if isinstance(result, dict) else result
            raise RuntimeError(
                f"Cloudflare API call failed (HTTP {response.status_code}): {errors}"
            )

        # This path depends on Cloudflare's exact response structure for the
        # model; report a readable error if the shape differs.
        try:
            return result["result"]["output"][-1]["content"][0]["text"]
        except (KeyError, IndexError, TypeError) as exc:
            raise RuntimeError(
                "Unexpected Cloudflare response structure; expected "
                "result.output[-1].content[0].text"
            ) from exc

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "cloudflare_llm"

# NOTE(review): the account ID is hard-coded and OPENAI_API_KEY is used here as
# the Cloudflare bearer token - consider loading both from clearly named
# environment variables.
llm = CloudflareLLM(
    api_key=OPENAI_API_KEY,
    cloudflare_user_id="1e68d54f3b2a5e5c8b567a3574827073",
    model="@cf/openai/gpt-oss-120b",
)

# Smoke test: one direct call to the model, without retrieval.
response = llm.invoke("Who is Elon Musk?")
print(response)

Elon Musk is a South‑African‑born entrepreneur and engineer who has become one of the most prominent figures in technology and business. Here are some key points about him:

| Category | Details |
|----------|---------|
| **Full Name** | Elon Reeve Musk |
| **Birth** | June 28 1971, Pretoria, South Africa |
| **Citizenship** | South African (by birth), Canadian (through his mother), and U.S. (naturalized) |
| **Education** | Attended the University of Pretoria (briefly), Queen’s University (Ontario), then transferred to the University of Pennsylvania, where he earned a B.S. in Physics and a B.S. in Economics (Wharton). He also enrolled in a Ph.D. program at Stanford for applied physics but left after two days to pursue business ventures. |
| **Major Companies** | • **Zip2** (1996‑1999) – early online city guide, sold to Compaq for ~$300 M<br>• **X.com / PayPal** (1999‑2002) – online payment system, sold to eBay for $1.5 B<br>• **SpaceX** (2002‑present) – private aerospace manufacturer 

In [31]:
# create a langchain chain.
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the question-answering chain. The "{context}"
# placeholder stays literal here; it is a template variable of the prompt
# built from this string.
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you do not know the answer, say that you "
    "do not know. Use three sentences maximum and keep the "
    "answer concise, meaningful, useful and understandable."
    "\n\n"
    "{context}"
)

# Create a chat prompt template from a variety of message formats.
# Two-message template: the system instructions, then the user's question,
# which fills the "{input}" placeholder.
chat_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)


In [32]:
# Create a chain for passing a list of Documents to a model.
# Chain that passes the retrieved documents, via the prompt, to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve documents first, then hand them to the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [33]:
# now the chain is created.
# now, we can ask any kinds of questions.
response = rag_chain.invoke({"input" : "who is elon musk?"})
print(response["answer"])


I do not know.
