#NHS Healthcare Assistant

##Load Packages


In [None]:
!pip install langchain pypdf sentence-transformers ctransformers chromadb -q

##Data Loading & Processing
We will create a directory (`docs`) where we will load all the documents.

In [None]:
!mkdir docs

mkdir: cannot create directory ‘docs’: File exists


Now upload all the files to this directory

In [None]:
!pip install wget -q

The below code allows the user to import data from a github reporsitory or google drive link, which is optional.

In [98]:
import wget
import os
import requests

def get_github_files(repo_owner, repo_name, directory_path):
  """
  Fetches a list of PDF files from a GitHub repository directory.

  Args:
      repo_owner (str): The owner of the GitHub repository.
      repo_name (str): The name of the GitHub repository.
      directory_path (str): The path to the directory within the repository.

  Returns:
      list: A list of file URLs for the PDF files in the directory.
  """
  api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}"
  headers = {"Accept": "application/vnd.github+json"}  # For the latest API version
  response = requests.get(api_url, headers=headers)
  response.raise_for_status()  # Raise an exception for bad status codes

  pdf_files = []
  for file_data in response.json():
      if file_data["type"] == "file" and file_data["name"].endswith(".pdf"):
          # Use file_data['path'] to construct the correct download URL
          # to handle spaces and special characters in file names.
          download_url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/main/{file_data['path']}"
          pdf_files.append(download_url)
  return pdf_files

# --- Usage ---
repo_owner = "ishanv13"
repo_name = "NHS-Healthcare-Assistant"
directory_path = "Patient2"

pdf_urls = get_github_files(repo_owner, repo_name, directory_path)

# Create the 'docs' directory if it doesn't exist
os.makedirs("docs", exist_ok=True)

# Download the PDF files
for url in pdf_urls:
    filename = os.path.basename(url)
    wget.download(url, out=os.path.join("docs", filename))
    print(f"Downloaded: {filename}")

Downloaded: JD Current Symptoms 2025.pdf
Downloaded: JD Medical History 2020.pdf
Downloaded: JD Medical History 2021.pdf
Downloaded: JD Medical History 2022.pdf
Downloaded: JD Medical History 2023.pdf
Downloaded: JD Medical History 2024.pdf


In [None]:
!pip install pysqlite3



The user can upload the initial files or report here.

In [None]:
'''import sqlite3
import os

def upload_pdf_to_db(pdf_path, db_name="my_database.db"):
  """
  Uploads PDF content to a SQLite database, storing it in the 'docs' folder.

  Args:
      pdf_path (str): The path to the PDF file.
      db_name (str, optional): The name of the database file. Defaults to "my_database.db".
  """
  with open(pdf_path, "rb") as f:
      pdf_data = f.read()

  conn = sqlite3.connect(db_name)
  cursor = conn.cursor()

  # Create the 'docs' table if it doesn't exist
  cursor.execute("""
      CREATE TABLE IF NOT EXISTS docs (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          filename TEXT,
          data BLOB
      )
  """)

  # Insert the PDF data into the 'docs' table
  cursor.execute("INSERT INTO docs (filename, data) VALUES (?, ?)", (os.path.basename(pdf_path), pdf_data))

  conn.commit()
  conn.close()

# Example usage within Colab (replace with your file upload method)
from google.colab import files

# Create the directory if it doesn't exist
os.makedirs("/content/docs", exist_ok=True)

uploaded = files.upload()

for filename, data in uploaded.items():
    # Save the file to /content/docs
    file_path = os.path.join("/content/docs", filename)
    with open(file_path, "wb") as f:
        f.write(data)
    upload_pdf_to_db(file_path)  # Use the full path for the database

print("PDF files uploaded to the database (docs table).")'''

'import sqlite3\nimport os\n\ndef upload_pdf_to_db(pdf_path, db_name="my_database.db"):\n  """\n  Uploads PDF content to a SQLite database, storing it in the \'docs\' folder.\n\n  Args:\n      pdf_path (str): The path to the PDF file.\n      db_name (str, optional): The name of the database file. Defaults to "my_database.db".\n  """\n  with open(pdf_path, "rb") as f:\n      pdf_data = f.read()\n\n  conn = sqlite3.connect(db_name)\n  cursor = conn.cursor()\n\n  # Create the \'docs\' table if it doesn\'t exist\n  cursor.execute("""\n      CREATE TABLE IF NOT EXISTS docs (\n          id INTEGER PRIMARY KEY AUTOINCREMENT,\n          filename TEXT,\n          data BLOB\n      )\n  """)\n\n  # Insert the PDF data into the \'docs\' table\n  cursor.execute("INSERT INTO docs (filename, data) VALUES (?, ?)", (os.path.basename(pdf_path), pdf_data))\n\n  conn.commit()\n  conn.close()\n\n# Example usage within Colab (replace with your file upload method)\nfrom google.colab import files\n\n# Create 

In [None]:
!pip install langchain-community



In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

Read all the files in the directory

In [99]:
import os
# Assuming PDFs are in a 'docs' folder
pdf_folder_path = 'docs/'
if not os.path.exists(pdf_folder_path):
    print(f"Error: '{pdf_folder_path}' not found. Please upload your PDFs there.")
else:
    loaders = [PyPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path) if fn.endswith('.pdf')]
    print(f"Found {len(loaders)} PDF documents.")
    docs = []
    for loader in loaders:
        docs.extend(loader.load())
    print(f"Loaded {len(docs)} pages total.")

Found 8 PDF documents.
Loaded 13 pages total.


Split the documents into chunks that are overlapping.


In [100]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=25)
splits = text_splitter.split_documents(docs)
print(f"Split into {len(splits)} chunks.")

Split into 90 chunks.


In [101]:
for i, split in enumerate(splits):
    print(f"Chunk {i + 1}:\n{split.page_content}\n")

Chunk 1:
Synthetic Medical History - John Doe 
John Doe - Full Medical History (2020-2025) 
Extended Synthetic Medical History for John Doe (Age 27, AB+) 
Primary Profile:

Chunk 2:
Primary Profile: 
- Pre-existing Conditions: Asthma (since childhood), IBS (diagnosed 2021), Allergic Rhinitis

Chunk 3:
- Medications: Albuterol (PRN), Cetirizine, Dicyclomine (as needed), Vitamin D3 (since 2024) 
- Lifestyle: Office worker, sedentary with intermittent exercise; non-smoker; moderate alcohol use

Chunk 4:
- Family History: Mother has Type 2 Diabetes, father has hypertension 
Urgent Care Visit – April 2025 
- Complaint: Fever

Chunk 5:
- Complaint: Fever 
(101.8°F), malaise, itchy rash with vesicles starting on trunk, spreading to face and li
mbs over 24 hours.

Chunk 6:
mbs over 24 hours. 
- History: No prior varicella infection or vaccination confirmed; recent contact with co
worker’s child who had “a rash.”

Chunk 7:
- Exam Findings: Multiple small vesicular lesions on erythematous bases,

##Embedding and Indexing

In [None]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

We will use a sentence embedding model

In [None]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
print("Embedding model loaded.")

Embedding model loaded.


Create and populate the vector store

In [None]:
persist_directory = "db"  # Specify the directory for persistence
# Create or load the Chroma vector store, enabling persistence to disk.
vectorstore = Chroma.from_documents(
   documents=splits, embedding=embeddings, persist_directory=persist_directory)
vectorstore.persist()
print(f"Vector store created and populated with embeddings, persisted to {persist_directory}.")


Vector store created and populated with embeddings, persisted to db.


##Gemini LLM Setup

In [None]:
!pip install -q -U google-genai  # Install or update google-genai
!pip install -q -U google-generativeai  # Install or update google-generativeai

from google.colab import userdata
from google import genai

# Set your Google API key (ensure it's stored securely)
GOOGLE_API_KEY = userdata.get('Google_API')
client = genai.Client(api_key=GOOGLE_API_KEY)
MODEL = "gemini-2.0-flash"

SecretNotFoundError: Secret Google_API does not exist.

In [None]:
def answer_with_gemini(query):
    """
    Retrieves semantically similar chunks and uses Gemini to answer the query.

    Args:
        query (str): The user's question.

    Returns:
        str: Gemini's answer to the question.
    """

    # 1. Retrieval of semantically similar chunks:
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    retrieved_docs = retriever.invoke(query)

    # 2. Construct the prompt for Gemini:
    context = ""
    for doc in retrieved_docs:
        context += doc.page_content

    # System instructions for summarizing and structuring
    system_instructions = """
    You are a medical AI specialist trained to analyze both structured and unstructured patient health records. Based on the provided {context}, you must generate two distinct outputs:
    Only Give the output in the specified format and don't include the questions in the output:
    \n'Output 1: Patient-Facing Explanation'
    'Answer:
    1.
    2.
    3. '

    'Output 2: Doctor-Facing Summary'
    'Answer:
    1.
    2.
    3. '
    """

    prompt = f"""{system_instructions}

    Output 1: Patient-Facing Explanation

    Use simplified, empathetic language suitable for a non-medical audience. You are a medical assistant explaining to a patient.

    Prompt Format:

    You are a medical assistant explaining to a patient.
    Read the following medical record carefully and then answer the questions below in full sentences.

    {context}

    Questions:
    1. What is the patient’s main health issue?
    2. What do the test results indicate?
    3. {user_question}?

    Answer:

    ⸻

    Output 2: Doctor-Facing Summary

    Use formal medical language appropriate for a professional healthcare provider. Provide evidence-based insights and include clinical rationale when relevant.

    Prompt Format:

    You are a clinical assistant. Read the following patient record:

    {context}

    Now answer the questions in complete sentences, referencing the record:
    1. What is the primary diagnosis?
    2. What are the significant lab or imaging findings?
    3. What treatment plan is recommended, and what guidelines support it?

    Answer:

    """

    # 3. Generate the answer using Gemini:
    response = client.models.generate_content(
        model=MODEL,
        contents=prompt
    )

    return response.text

Uploading A new document to the database.

In [None]:
import os
import sqlite3
from google.colab import files
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma


def update_db_with_pdf(pdf_path, db_name="my_database.db", persist_directory="db"):
    """Updates the Chroma vector database with a new PDF file."""

    # 1. Upload PDF to the database (if not already present).
    try:
        upload_pdf_to_db(pdf_path, db_name)
        print(f"PDF '{pdf_path}' added to the database.")
    except sqlite3.IntegrityError:
        print(f"PDF '{pdf_path}' already exists in the database.")


    # 2. Load and process the new PDF.
    loader = PyPDFLoader(pdf_path)
    new_docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=25)
    new_splits = text_splitter.split_documents(new_docs)
    print(f"Loaded and split {len(new_splits)} chunks from the new PDF.")


    # 3. Update the Chroma vector store.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    vectorstore.add_documents(new_splits)
    vectorstore.persist()
    print("Vectorstore updated with the new PDF content.")


# Example usage (assuming you have a PDF file uploaded to '/content/docs'):

#new_pdf_path = "/content/docs/your_new_pdf.pdf"  # Replace with actual path
#update_db_with_pdf(new_pdf_path)


# --- Helper functions from original code (slightly modified) ---

def upload_pdf_to_db(pdf_path, db_name="my_database.db"):
    """Uploads PDF content to a SQLite database."""
    with open(pdf_path, "rb") as f:
        pdf_data = f.read()
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS docs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT UNIQUE,  -- Enforce unique filenames
            data BLOB
        )
    """)
    try:  # Handle potential IntegrityError if the filename exists
        cursor.execute("INSERT INTO docs (filename, data) VALUES (?, ?)", (os.path.basename(pdf_path), pdf_data))
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"File '{os.path.basename(pdf_path)}' already in database. Skipping.")
    conn.close()


# --- Code for file upload and database update ---

uploaded = files.upload()
for filename, data in uploaded.items():
  file_path = os.path.join("/content/docs", filename)
  with open(file_path, "wb") as f:
    f.write(data)
  update_db_with_pdf(file_path)


Saving John_Doe_Medical_History_Integrated_removed.pdf to John_Doe_Medical_History_Integrated_removed.pdf
PDF '/content/docs/John_Doe_Medical_History_Integrated_removed.pdf' added to the database.
Loaded and split 34 chunks from the new PDF.


  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


Vectorstore updated with the new PDF content.


##Medical Assistant
Ask questions based on newly uploaded data.

In [None]:
user_question = "Should I go visit a doctor right now?" #@param {type:"string"}
answer = answer_with_gemini(user_question)
print(f"Answer: {answer}")

NameError: name 'client' is not defined