<h1> Differential Diagnosis with Mistral 7B RAG vs. BioMistral 7B by ContactDoctor </h1>

In [1]:
!pip install -q streamlit langchain_community chromadb huggingface-hub bitsandbytes pypdf tiktoken

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask-oauthlib 0.9.6 requires oauthlib!=2.0.3,!=2.0.4,!=2.0.5,<3.0.0,>=1.1.2, but you have oauthlib 3.2.2 which is incompatible.
flask-oauthlib 0.9.6 requires requests-oauthlib<1.2.0,>=0.6.2, but you have requests-oauthlib 2.0.0 which is incompatible.
flet 0.7.4 requires httpx<0.24.0,>=0.23.3, but you have httpx 0.28.1 which is incompatible.
flet 0.7.4 requires watchdog<3.0.0,>=2.2.1, but you have watchdog 4.0.1 which is incompatible.
langchain-mistralai 0.2.10 requires langchain-core<1.0.0,>=0.3.49, but you have langchain-core 0.1.53 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os

# Take the Hugging Face token from the HF_TOKEN environment variable when it is
# set, so a real credential never has to be typed into (and saved with) the
# notebook. Without the variable the original placeholder value is written.
hf_token = os.environ.get("HF_TOKEN", "secret_token")

# Content of the Streamlit secrets file read by app.py through st.secrets.
secrets_toml = f"""
[huggingface]
token = "{hf_token}"

[models]
rag = "mistralai/Mistral-7B-Instruct-v0.2"
bio = "BioMistral/BioMistral-7B"
""".lstrip()

os.makedirs('.streamlit', exist_ok=True)
with open('.streamlit/secrets.toml', 'w', encoding='utf-8') as f:
    f.write(secrets_toml)

In [3]:
%%writefile app.py

# IMPORT LIBRARY
import streamlit as st
import pandas as pd
import os
import torch
import re

# FOR PARALLELIZATION
from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain import PromptTemplate
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import threading
import time
from tenacity import retry, stop_after_attempt, wait_fixed




# CODE BLOCK

# Shared prompt template with {context} and {question} placeholders. The RAG
# path fills {context} with retrieved passages; the Bio path passes an empty
# context.
PROMPT = """Answer the question based only on the following context,:{context}
Question:{question}
What are the top 10 most likely diagnoses? Be precise, listing one diagnosis per line, and try to cover many unique possibilities.
Ensure the order starts with the most likely. The top 10 diagnoses are."""
MAX_INPUT_TOKENS = 2048 # The sequence length limit of BioMistral-7B
# Default vector-store directory (internal medicine).
DB_DIR = "./db_im"

# Device name used for the embedding model and for choosing dtype/device_map.
HF_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Device index in the transformers pipeline convention (0 = first GPU, -1 = CPU).
# NOTE(review): not referenced anywhere else in the code shown here.
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1
# Credentials and model ids come from .streamlit/secrets.toml via st.secrets.
HF_TOKEN    = st.secrets["huggingface"]["token"]
model_id    = st.secrets["models"]["rag"]
bio_model_id= st.secrets["models"]["bio"]



# 4-bit NF4 quantization settings handed to both model loaders below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Only touch the CUDA allocator configuration when a GPU is actually present.
cuda_present = torch.cuda.is_available()
if cuda_present:
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

### HELPERS TO LOAD THE MODEL ###

# Lock held around each model generation call (and the CUDA cache clear or
# model move that accompanies it) in the code below.
gpu_lock = threading.Lock()

def unload_model_from_gpu(model):
    """Send ``model`` to the CPU (when it exposes ``.to``) and empty the CUDA cache.

    Objects without a ``to`` attribute are left untouched; the CUDA cache is
    cleared in either case.
    """
    supports_move = hasattr(model, "to")
    if supports_move:
        model.to("cpu")

    torch.cuda.empty_cache()

def safe_invoke(model_or_chain, *args, **kwargs):
    """Call a model or chain and report failures in the Streamlit UI.

    Objects that have an ``invoke`` attribute are run through it; anything else
    is called directly. All positional and keyword arguments are forwarded.

    Returns:
        The value produced by the call, or ``None`` after any exception (the
        exception text is shown with ``st.error``).
    """
    try:
        target = model_or_chain.invoke if hasattr(model_or_chain, "invoke") else model_or_chain
        return target(*args, **kwargs)
    except Exception as e:
        st.error(f"MODEL ERROR: {e}")
        return None


def choose_specialty(current_case, pipe, prompt_specialty):
    """Pick the vector-database directory for the medical specialty of a case.

    The case text is inserted into ``prompt_specialty``, the resulting prompt is
    sent to ``pipe`` through ``safe_invoke``, and the generated answer is mapped
    to one of the per-specialty database directories.

    Args:
        current_case (str): Case narrative to classify.
        pipe: Callable text-generation pipeline. It is expected to return a list
            whose first item is a dict with a ``"generated_text"`` entry.
        prompt_specialty: Prompt template exposing
            ``format_prompt(current_case=...).to_string()``.

    Returns:
        str: Directory of the vector database for the detected specialty. When
        the model call fails, or no known specialty can be found in the answer,
        the internal-medicine directory (``'./db_im'``) is returned.
    """
    db_dict = {'internal medicine': './db_im', 'obstetrics and gynecology': './db_og', 'pediatrics':'./db_p','surgery':'./db_surg','psychiatry':'./db_psy'}
    default_dir = db_dict['internal medicine']

    prompt_specialty = prompt_specialty.format_prompt(current_case=current_case).to_string()

    response_specialty = safe_invoke(pipe, prompt_specialty, max_new_tokens=128)
    # safe_invoke returns None when the model call raised; fall back instead of
    # failing on the subscript below.
    if not response_specialty:
        return default_dir

    raw_specialty = response_specialty[0]["generated_text"]
    # strip prompt echo
    specialty_out = raw_specialty[len(prompt_specialty):].lstrip() if raw_specialty.startswith(prompt_specialty) else raw_specialty

    # Drop list numbering such as "4." and normalise case/whitespace.
    result_specialty = re.sub(r'\d\.', '', specialty_out).strip().lower()

    # Ideal case: the answer is exactly one of the specialty names.
    if result_specialty in db_dict:
        return db_dict[result_specialty]

    # Free-text answers ("The specialty is Pediatrics because ..."): use the
    # specialty that is mentioned first in the generated text.
    mentions = [(result_specialty.find(name), name) for name in db_dict if name in result_specialty]
    if mentions:
        return db_dict[min(mentions)[1]]

    return default_dir


### CACHING HEAVY RESOURCES ###

@st.cache_resource(show_spinner=False)
def get_embedding_fn():
    """Build the sentence-embedding function used for the Chroma vector stores.

    The result is cached by Streamlit, so the embedding model is created once
    and placed on ``HF_DEVICE``.
    """
    embedding_model = "sentence-transformers/all-mpnet-base-v2"
    device_options = {"device": HF_DEVICE}
    return HuggingFaceEmbeddings(model_name=embedding_model, model_kwargs=device_options)



# Load Mistral 7B RAG
@st.cache_resource(show_spinner=False)
def _load_rag_pipeline():
    """Load the RAG language model once and wrap it in a text-generation pipeline.

    This loader takes no arguments on purpose: ``st.cache_resource`` keys its
    cache on the function arguments, so the model is loaded a single time
    instead of once per distinct case text.
    """
    # NOTE(review): the 4-bit quantization config is passed on CPU as well, as
    # in the original code - confirm this is supported by the installed stack.
    mod = AutoModelForCausalLM.from_pretrained(model_id,
                                               use_auth_token= HF_TOKEN,
                                               device_map='auto' if HF_DEVICE=="cuda" else "cpu",
                                               torch_dtype= torch.bfloat16 if HF_DEVICE=="cuda" else torch.float32,
                                               quantization_config=bnb_config)

    return pipeline(
        "text-generation",
        model=mod,
        tokenizer=AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN),
        use_fast=True,
        max_new_tokens=256,
    )


def get_rag_components(txt):
    """Return the pieces needed to answer one case with retrieval-augmented generation.

    Args:
        txt (str): Case narrative. It is used to choose the specialty whose
            vector database is queried.

    Returns:
        tuple: ``(pipe, retriever, prompt)`` - the cached text-generation
        pipeline, a retriever over the Chroma store of the chosen specialty,
        and the ``PromptTemplate`` built from ``PROMPT``.
    """
    # One shared pipeline serves both the specialty question and the final
    # answer; choose_specialty passes its own max_new_tokens on the call.
    pipe = _load_rag_pipeline()

    prompt = PromptTemplate(template=PROMPT, input_variables=["context", "question"])
    PROMPT_specialty_template = """"{current_case} What is the medical specialty of this case? Choose from this list 1. Internal Medicine, 2. Obstetrics and Gynecology, 3. Pediatrics, 4. Surgery 5. Psychiatry\n" """

    prompt_specialty = PromptTemplate(template=PROMPT_specialty_template, input_variables=["current_case"])

    # Local name chosen so the module-level DB_DIR default is not shadowed.
    specialty_db_dir = choose_specialty(txt, pipe, prompt_specialty)

    vs = Chroma(
        embedding_function=get_embedding_fn(),
        persist_directory=specialty_db_dir,
    )
    retriever = vs.as_retriever()
    return pipe, retriever, prompt


# Load Bio Model
@st.cache_resource(show_spinner=False)
def get_bio_pipeline():
    """Create the text-generation pipeline for the Bio model (``bio_model_id``).

    The pipeline is cached by Streamlit. It generates up to 256 new tokens by
    default unless a call overrides ``max_new_tokens``.
    """
    on_gpu = HF_DEVICE == "cuda"

    bio_model = AutoModelForCausalLM.from_pretrained(
        bio_model_id,
        use_auth_token=HF_TOKEN,
        device_map='auto' if on_gpu else "cpu",
        torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
        quantization_config=bnb_config,
    )
    bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_id, use_auth_token=HF_TOKEN)

    return pipeline(
        "text-generation",
        model=bio_model,
        tokenizer=bio_tokenizer,
        use_fast=True,
        max_new_tokens=256,
    )

@st.cache_resource
def get_tokenizer():
    """Return the Streamlit-cached tokenizer of the RAG model (``model_id``)."""
    rag_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN)
    return rag_tokenizer


# Tricks for Expensive File I/O: streamlit caching
@st.cache_data(show_spinner=False)
def build_vectorstore(uploaded_files, persist_directory):
    """Save uploaded PDFs to disk and index them into a persistent Chroma store.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``name`` and
            ``getbuffer()``.
        persist_directory (str): One of the known vector-database directories
            (``'./db_im'``, ``'./db_og'``, ``'./db_p'``, ``'./db_surg'``,
            ``'./db_psy'``).

    Returns:
        bool: ``True`` once the index has been written.

    Raises:
        KeyError: If ``persist_directory`` is not one of the known directories.
    """
    folder_dict = {'./db_im': 'im_folder', './db_og': 'og_folder', './db_p':'p_folder','./db_surg':'surg_folder','./db_psy':'psy_folder'}

    UploadedTextbook = folder_dict[persist_directory]

    os.makedirs(UploadedTextbook, exist_ok=True)
    paths = []
    for f in uploaded_files:
        # The file name is supplied by the client: keep only its final path
        # component so the upload cannot be written outside the target folder.
        safe_name = os.path.basename(f.name.replace("\\", "/"))
        if not safe_name:
            continue
        path = os.path.join(UploadedTextbook, safe_name)
        with open(path, "wb") as fp:
            fp.write(f.getbuffer())
        paths.append(path)

    # Load every page of every saved PDF.
    docs = []
    for pdf in paths:
        docs.extend(PyPDFLoader(pdf).load())

    # Split into token-measured chunks before embedding.
    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1500, chunk_overlap=200
    )
    splits = splitter.split_documents(docs)

    vs = Chroma.from_documents(
        splits,
        get_embedding_fn(),
        persist_directory=persist_directory,
    )
    vs.persist()
    return True

### HELPERS ###

# Make sure the token inputs are within the limit
def check_length(text):
    """Tell whether ``text`` fits within ``MAX_INPUT_TOKENS`` tokens.

    The text is tokenized with the tokenizer from ``get_tokenizer()``. When it
    is too long a Streamlit warning is shown.

    Returns:
        bool: ``True`` when the token count is within the limit, else ``False``.
    """
    token_count = len(get_tokenizer().encode(text))
    within_limit = token_count <= MAX_INPUT_TOKENS
    if not within_limit:
        st.warning(f"Your input is {token_count} tokens, over the {MAX_INPUT_TOKENS}-token limit. Please shorten it.")
    return within_limit

def safe_invoke(model_or_chain, *args, **kwargs):
    """Run a model/chain, returning ``None`` and showing the error if it fails.

    The object's ``invoke`` attribute is used when present; otherwise the object
    itself is called. Arguments are forwarded unchanged.

    NOTE(review): a function with this same name is also defined earlier in
    this file; being later, this definition is the one in effect once the
    module has finished loading.
    """
    try:
        runner = getattr(model_or_chain, "invoke", model_or_chain)
        result = runner(*args, **kwargs)
    except Exception as e:
        st.error(f"MODEL ERROR: {e}")
        return None
    return result


def _strip_prompt_echo(generated, prompt_text):
    """Return ``generated`` without a leading copy of ``prompt_text``."""
    if generated.startswith(prompt_text):
        return generated[len(prompt_text):].lstrip()
    return generated


def _generate_answer(pipe, prompt_text):
    """Run ``pipe`` on ``prompt_text`` under the GPU lock and return the answer text.

    Returns the string ``"Error"`` when ``safe_invoke`` yields no output.
    """
    with gpu_lock:
        raw_out = safe_invoke(pipe, prompt_text, max_new_tokens=128)
        #clear cache
        torch.cuda.empty_cache()

    if not raw_out:
        return "Error"
    return _strip_prompt_echo(raw_out[0]["generated_text"], prompt_text)


def process_case(txt):
    """Produce the RAG and Bio differential diagnoses for one case.

    The two models are run as independent stages: a failure in one stage is
    reported in that model's own column and does not prevent, or discard, the
    other model's answer.

    Args:
        txt (str): Case narrative.

    Returns:
        dict: ``{"Case": txt, "Mistral7B+RAG": <text>, "BioMistral7B": <text>}``.
        A stage that raises is reported as ``"ERROR: <message>"``; a stage
        whose model call produced no output is reported as ``"Error"``.
    """
    # 1) Retrieve context and 2) generate with RAG-LLM
    try:
        rag_pipe, rag_retriever, prompt = get_rag_components(txt)
        docs = rag_retriever.get_relevant_documents(txt)
        context = "\n\n".join(d.page_content for d in docs)
        prompt_text = prompt.format_prompt(context=context, question=txt).to_string()
        rag_out = _generate_answer(rag_pipe, prompt_text)
    except Exception as e:
        rag_out = f"ERROR: {e}"

    # 3) Generate with Bio-LLM (no retrieved context)
    try:
        bio_pipe = get_bio_pipeline()
        bio_prompt = PROMPT.format(context="", question=txt)
        bio_out = _generate_answer(bio_pipe, bio_prompt)
    except Exception as e:
        bio_out = f"ERROR: {e}"

    return {"Case": txt, "Mistral7B+RAG": rag_out, "BioMistral7B": bio_out}

# Then when generating, temporarily move to GPU:
def generate_bio(bio_pipe, prompt):
    """Generate text with ``bio_pipe`` while its model is temporarily on the GPU.

    Args:
        bio_pipe: Callable pipeline with a ``model`` attribute supporting
            ``.to(device)``; calling it returns a list whose first item holds
            ``"generated_text"``.
        prompt (str): Prompt passed to the pipeline.

    Returns:
        str: The ``"generated_text"`` of the first result.

    Any exception raised during generation propagates to the caller, but the
    model is moved back to the CPU first.
    """
    with gpu_lock:
        # move to GPU
        bio_pipe.model.to("cuda")
        try:
            out = bio_pipe(prompt)[0]["generated_text"]
        finally:
            # move back to CPU to free VRAM, even when generation failed
            bio_pipe.model.to("cpu")
    return out




### STREAMLIT UI ###
# Page title and caption shown at the top of the app.
st.title("Differential Diagnosis: Mistral 7B RAG vs. Bio Mistral 7B by BioMistral")
st.caption("Helps the doctor/nurse to develop their differential diagnosis using LLM models")

# Additional files
# Sidebar: choose the specialty whose index is built, upload PDFs for that
# index, and upload the CSV used by batch mode.
with st.sidebar:
    # Maps each selectable specialty to its vector-store directory.
    db_dict = {'internal medicine': './db_im', 'obstetrics and gynecology': './db_og', 'pediatrics':'./db_p','surgery':'./db_surg','psychiatry':'./db_psy'}
    # Initial value; it is overwritten two lines below from the selectbox choice.
    persist_directory = "./db_im"
    specialty = st.selectbox("Choose Specialty", ('internal medicine', 'obstetrics and gynecology', 'pediatrics','surgery','psychiatry'))
    persist_directory = db_dict[specialty]
    # if st.button("Set Specialty"):
    #     persist_directory = db_dict[specialty]
    #     print(persist_directory)

    st.header("Upload additional resources for RAG (type:.pdf)")
    UploadedFiles = st.file_uploader("Upload here and click on 'Upload'", type="pdf", accept_multiple_files=True)

    # Index the uploaded PDFs into the directory of the selected specialty.
    if st.button("Build Index"):
        if not UploadedFiles:
            st.error("Select at least one PDF first.")
        else:
            with st.spinner("Indexing…"):
                build_vectorstore(UploadedFiles, persist_directory)
            st.success("RAG index is ready!")

    st.markdown("---")
    st.header("Batch processing case upload (type:.csv)")
    # The uploaded CSV is read later by the batch-mode section of the script.
    csv_file = st.file_uploader(
        "Upload CSV",
        type="csv",
        accept_multiple_files=False)

### SINGLE CASE ###
# Free-text case entry; both models are run on it and shown in separate tabs.
st.subheader("SINGLE CASE")
question = st.text_area("Case Narrative:",
                        height=180,
                        placeholder="For example: 22-year-old patient with TB was admitted to hospital today. The patient has been to a country outside Sweden. The patient came back to Sweden from the other country. The patient has had a fever for two weeks and is admitted. The doctor has prescribed a medicine. ")
st.write(f"The number of characters are {len(question)} characters.")

if st.button('Start Processing'):
  if not question.strip():
    # An empty narrative would still be sent to both models; ask for input instead.
    st.warning("Please enter a case narrative before starting.")
  elif check_length(question):
    with st.spinner("Processing..."):

      tabs = st.tabs(["BIOMode", "RAGMode"])

      with tabs[0]:
        #Biomodel execution
        bio_pipe = get_bio_pipeline()
        bio_prompt = PROMPT.format(context="",question=question)

        with gpu_lock:
          raw_bio = bio_pipe(bio_prompt)[0]["generated_text"]
          torch.cuda.empty_cache()

        #remove the prompt echo
        if raw_bio.startswith(bio_prompt):
            bio_output = raw_bio[len(bio_prompt):].lstrip()
        else:
            bio_output = raw_bio

        st.markdown("**BioMistral 7B**")
        st.text(bio_output)

      with tabs[1]:
        # NOTE(review): this checks the default DB_DIR, while get_rag_components
        # queries the directory chosen by choose_specialty - confirm intent.
        if not os.path.isdir(DB_DIR) or not os.listdir(DB_DIR):
          st.error("Please upload and build your PDF index first!")
          st.stop()

        rag_pipe, rag_retriever, prompt = get_rag_components(question)
        docs = rag_retriever.get_relevant_documents(question)
        context = "\n\n".join([d.page_content for d in docs])
        prompt_text = prompt.format_prompt(context=context, question=question).to_string()

        st.markdown("**Mistral 7B + RAG**")

        with gpu_lock:
          raw = rag_pipe(prompt_text)[0]["generated_text"]
          torch.cuda.empty_cache()


        #remove the prompt echo
        if raw.startswith(prompt_text):
            answer = raw[len(prompt_text):].lstrip()
        else:
            answer = raw
        st.text(answer)

        #free up memory from RAG
        del rag_pipe, rag_retriever, prompt
        torch.cuda.empty_cache()

  else:
    # Input is over the token limit (check_length already showed a warning).
    st.stop()

### BATCH PROCESSING ###
# Runs process_case on every row of the uploaded CSV and offers the combined
# results as a CSV download.
st.markdown("---")
st.subheader("BATCH MODE")

if csv_file:
    df = pd.read_csv(csv_file)
    if "Case" not in df.columns:
        # Without this check a CSV lacking the column fails with a bare KeyError.
        st.error("The uploaded CSV must contain a 'Case' column.")
    elif st.button("Start Batch Processing"):
        results = []
        futures = []
        prog = st.progress(0)

        # One worker on CPU, two when a GPU is available.
        BATCH_WORKERS = 1 if HF_DEVICE != "cuda" else 2
        with ThreadPoolExecutor(max_workers=BATCH_WORKERS) as exe:
            for txt in df["Case"]:
                futures.append(exe.submit(process_case, txt))

            # as each case completes, update progress
            for i, fut in enumerate(as_completed(futures)):
                results.append(fut.result())
                prog.progress((i + 1) / len(futures))

        out_df = pd.DataFrame(results)
        st.download_button(
            "Download Results as CSV",
            data=out_df.to_csv(index=False),
            file_name="ddx_comparison.csv"
        )

Writing app.py


<h2> Install localtunnel </h2>

In [4]:
# Install the localtunnel npm package; this needs the npm command to be
# available on the PATH of the machine running the notebook.
!npm install localtunnel

'npm' is not recognized as an internal or external command,
operable program or batch file.


<h2> Run Streamlit in background </h2>

In [5]:
# Start the Streamlit app in the background with its output redirected to
# /content/logs.txt, expose port 8501 through localtunnel, and print this
# machine's public IPv4 address.
# NOTE(review): the `&>` and `&` syntax appears to assume a bash-like shell.
!streamlit run /content/app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

& was unexpected at this time.
