In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd
import numpy as np
from openai import OpenAI
from rouge import Rouge
import time
from sklearn.model_selection import train_test_split
import ollama
import warnings

warnings.filterwarnings("ignore")

2025-05-04 20:47:22.227276: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-04 20:47:22.412058: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746409642.499588    2848 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746409642.526339    2848 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746409642.688594    2848 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df.head(2)

Unnamed: 0,Master_Index,County,Health level,Years of Experience,Prompt,Nursing Competency,Clinical Panel,Clinician,GPT4.0,LLAMA,GEMINI,DDX SNOMED
0,ID_VBWWP,uasin gishu,sub county hospitals and nursing homes,18.0,i am a nurse with 18 years of experience in ge...,pediatric emergency burns,surgery,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,288514009 burn involving 5 percent of body sur...
1,ID_XMBBY,uasin gishu,national referral hospitals,17.0,i am a nurse with 17 years of experience in ge...,child health,paediatrics,summary 6 year old present with vomiting and a...,clinical summary • a 6 year old girl with know...,based on the symptoms and signs you ve describ...,based on the presentation the 6 year old girl ...,420270002 ketoacidosis due to type 1 diabetes ...


In [4]:
df = df.fillna(round(df['Years of Experience'].mean(),2))
df.rename(columns=lambda x: x.strip().replace(" ", ""), inplace=True)

In [5]:
# ---------------------------
# STEP 2: Generate Embeddings for Prompts
# ---------------------------
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
df["embedding"] = df["Prompt"].apply(lambda x: embed_model.encode(x))

embedding_matrix = np.vstack(df["embedding"].values)

In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
# ---------------------------
# STEP 3: Build FAISS Index
# ---------------------------
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)


In [8]:
# ---------------------------
# STEP 4: Retrieval Function with Metadata Filtering
# ---------------------------
def retrieve_similar_examples(df, query_prompt, years_experience, competency, panel, county, health_level, top_k=5):
    # Step 1: Filter dataset
    filtered_df = df[
        (df["YearsofExperience"].between(years_experience - 2, years_experience + 2)) &
        (df["NursingCompetency"] == competency) &
        (df["ClinicalPanel"] == panel) &
        (df["County"] == county) &
        (df["Healthlevel"] == health_level)
    ]
    
    if len(filtered_df) == 0:
        print("[Warning] No exact metadata matches. Falling back to full dataset.")
        filtered_df = df

    query_embedding = embed_model.encode([query_prompt])

    sub_indices = filtered_df.index.tolist()
    sub_embeddings = np.vstack(filtered_df["embedding"].values)
    temp_index = faiss.IndexFlatL2(sub_embeddings.shape[1])
    temp_index.add(sub_embeddings)
    _, retrieved_idx = temp_index.search(np.array(query_embedding), top_k)

    result_indices = [sub_indices[i] for i in retrieved_idx[0]]
    return df.loc[result_indices]


In [9]:
# ---------------------------
# STEP 5: Build Prompt for LLM
# ---------------------------
def build_few_shot_prompt(examples, query_meta):
    # print(examples.columns)
    prompt = ""
    for _, row in examples.iterrows():
        prompt += (
            f"Example Case:\n"
            f"Years of Experience: {row['YearsofExperience']}\n"
            f"Nursing Competency: {row['NursingCompetency']}\n"
            f"Clinical Panel: {row['ClinicalPanel']}\n"
            f"County: {row['County']}\n"
            f"Health Level: {row['Healthlevel']}\n"
            f"Prompt: {row['Prompt']}\n"
            f"Clinician Response: {row['Clinician']}\n\n"
        )

    prompt += (
        f"Now, complete the response to the following case:\n"
        f"Years of Experience: {query_meta['YearsofExperience']}\n"
        f"Nursing Competency: {query_meta['NursingCompetency']}\n"
        f"Clinical Panel: {query_meta['ClinicalPanel']}\n"
        f"County: {query_meta['County']}\n"
        f"Health Level: {query_meta['Healthlevel']}\n"
        f"Prompt: {query_meta['Prompt']}\n"
        f"Clinician Response: "
    )
    return prompt


In [13]:

# ---------------------------
# EXAMPLE USAGE
# ---------------------------
system_instruction = (
            "You are a licensed clinician in Kenya. "
            "Provide answers using a professional format including sections like 'Summary', 'Diagnosis', and 'Plan'. But you must follow the format of the example provided for the clinician"
    )

query = {
    "Prompt": "I am a nurse with 2 years of experience in General nursing working in a Dispensaries and Private Clinics in Kakamega county in Kenya. A three-month-old baby brought to the facility for immunization, which is due today. On exam, the baby was pale and febrile. The baby was crying a lot. Should I treat the baby first or just give the vaccine and treat later?",
    "YearsofExperience": 2,
    "NursingCompetency": "Child Health",
    "ClinicalPanel": "PAEDIATRICS",
    "County": "Kakamega",
    "Healthlevel": "Dispensaries and Private Clinics"
}

retrieved = retrieve_similar_examples(
    train_df,
    query["Prompt"], query["YearsofExperience"], query["NursingCompetency"],
    query["ClinicalPanel"], query["County"], query["Healthlevel"]
)

few_shot_prompt = build_few_shot_prompt(retrieved, query)

print("\n\n====== FEW-SHOT PROMPT FOR LLM ======\n")
print("few_shot_prompt done...")
print("\n====================================\n")

full_prompt = system_instruction + "\n\n" + few_shot_prompt

response = ollama.chat(
    model="qwen3:8b",
    messages=[
        {"role": "user", "content": full_prompt}
    ],
    stream=False
)
generated = response['message']['content'].strip()
print(generated)






few_shot_prompt done...


<think>
Okay, the user is a nurse in Kakamega county with 2 years of experience. They have a three-month-old baby coming for immunization, but the baby is pale, febrile, and crying a lot. The question is whether to treat first or give the vaccine and treat later.

First, I need to recall the standard protocols for immunization in Kenya. From previous examples, when a child has a mild infection with fever under 38°C, vaccines can be given after recovery. If fever is above 38°C, delay for 72 hours after treatment. The example case mentioned that high fever might denature vaccine proteins and the child is at risk due to low immunity.

In this case, the baby has a fever (presumably 38°C or higher since they mentioned "febrile"), so according to the protocol, the vaccine should be postponed. The nurse should treat the fever first. The summary should mention the diagnosis of possible upper respiratory tract infection or another mild infection. The plan would incl

In [11]:
rouge = Rouge()
results = []

In [12]:

# for idx, row in test_df.iterrows():
#     query = {
#         "Prompt": row["Prompt"],
#         "YearsofExperience": row["YearsofExperience"],
#         "NursingCompetency": row["NursingCompetency"],
#         "ClinicalPanel": row["ClinicalPanel"],
#         "County": row["County"],
#         "Healthlevel": row["Healthlevel"],
#     }

#     try:
#         examples = retrieve_similar_examples(
#             train_df,
#             query["Prompt"], query["YearsofExperience"], query["NursingCompetency"],
#             query["ClinicalPanel"], query["County"], query["Healthlevel"]
#         )


#         few_shot_prompt = build_few_shot_prompt(examples, query)

#         full_prompt = system_instruction + "\n\n" + few_shot_prompt

#         response = ollama.chat(
#             model="qwen3:8b",
#             messages=[
#                 {"role": "user", "content": full_prompt}
#             ],
#             stream=False
#         )
#         generated = response['message']['content'].strip()

#         scores = rouge.get_scores(generated, row["Clinician"])[0]
#         results.append({
#             "idx": idx,
#             "generated": generated,
#             "reference": row["Clinician"],
#             "rouge-1": scores["rouge-1"]["f"],
#             "rouge-2": scores["rouge-2"]["f"],
#             "rouge-l": scores["rouge-l"]["f"],
#         })

#     except Exception as e:
#         print(f"[ERROR] at index {idx}: {e}")
#         continue

#     time.sleep(0.5)  # throttle in case of system strain

# results_df = pd.DataFrame(results)
# print("Average ROUGE-1:", results_df['rouge-1'].mean())
# print("Average ROUGE-2:", results_df['rouge-2'].mean())
# print("Average ROUGE-L:", results_df['rouge-l'].mean())
