<a href="https://colab.research.google.com/github/kanchanraiii/AIML-Project-Series/blob/main/Combined_Generalized_(Gemini_%2B_MiniLm%2BBERT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install -q faiss-cpu sentence-transformers google-generativeai spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [36]:
import os
import json
import numpy as np
import faiss
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import spacy
import re
from google.colab import files

In [37]:
os.environ["GOOGLE_API_KEY"] = input("🔑 Enter your Google Gemini API Key (or press Enter to skip): ")
if os.environ["GOOGLE_API_KEY"]:
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

🔑 Enter your Google Gemini API Key (or press Enter to skip): AIzaSyDvDoLjfp7QJuX2UpNQ4G2c49iaZAIRblY


In [38]:
nlp = spacy.load("en_core_web_sm")

In [39]:
# --- Regex patterns for sensitive data ---
REGEX_PATTERNS = {
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "PHONE": r"\+?\d{7,15}",
    "DOB": r"\b\d{4}-\d{2}-\d{2}\b",
    "SSN": r"\b\d{3}-\d{2}-\d{4}\b"
}

In [40]:
# --- Upload your .jsonl file ---
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# --- Parse JSONL into text docs ---
docs = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        record_text = f"""
        Patient Name: {obj.get('patient_name', '')}
        Patient ID: {obj.get('patient_id', '')}
        DOB: {obj.get('dob', '')}
        Diagnosis: {obj.get('diagnosis', '')}
        Prescriptions: {obj.get('prescriptions', [])}
        Lab Reports: {obj.get('lab_reports', [])}
        Email: {obj.get('email', '')}
        Phone: {obj.get('phone', '')}
        Address: {obj.get('address', '')}
        """
        docs.append(record_text.strip())

print(f"✅ Loaded {len(docs)} records")
print("🔹 Example:\n", docs[0][:300])


Saving healthcare_dataset.jsonl to healthcare_dataset (1).jsonl
✅ Loaded 10000 records
🔹 Example:
 Patient Name: Ayush Dugal
        Patient ID: PID77302
        DOB: 2015A01-22
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Perspiciatis', 'dosage': '2 tablets 1 times a day'}, {'medicine': 'Deleniti', 'dosage': '1 tablets 1 times a day'}, {'medicine': 'Aut', 'dosage': '2 tabl


In [41]:
# --- Embedding model ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embed_model.encode(docs)
embeddings = np.array(embeddings).astype("float32")

# --- Build FAISS index ---
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

print(f"✅ FAISS index built with {len(docs)} records")


✅ FAISS index built with 10000 records


In [42]:
def input_filter(query: str, threshold: float = 0.5):
    """Check query for sensitive info before sending to RAG."""
    doc = nlp(query)
    pii_count = sum(1 for ent in doc.ents if ent.label_ in ["PERSON", "GPE", "LOC", "ORG"])
    regex_count = sum(len(re.findall(p, query)) for p in REGEX_PATTERNS.values())

    if (pii_count + regex_count) / max(1, len(query.split())) > threshold:
        return True, "❌ Query blocked (contains sensitive info)."
    return False, "✅ Query is safe."

def output_filter(response: str):
    """Redact sensitive info from LLM output."""
    doc = nlp(response)
    redacted = response
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "GPE", "LOC", "ORG"]:
            redacted = redacted.replace(ent.text, f"[REDACTED_{ent.label_}]")

    for k, p in REGEX_PATTERNS.items():
        redacted = re.sub(p, f"[REDACTED_{k}]", redacted)

    return redacted


In [43]:
def search_faiss(query, k=3):
    """Retrieve top-k docs from FAISS."""
    q_vec = embed_model.encode([query]).astype("float32")
    distances, indices = index.search(q_vec, k)
    return [docs[i] for i in indices[0]]

def rag_pipeline(query, model_choice="gemini"):
    """
    Run RAG with chosen model.
    model_choice = "gemini" or "minilm"
    """
    print(f"\n🔎 User Query: {query}")
    print(f"⚙️ Using model: {model_choice.upper()}")

    # --- Input Filter ---
    blocked, msg = input_filter(query)
    if blocked:
        print(msg)
        return msg

    # --- Retrieve from FAISS ---
    retrieved_docs = search_faiss(query, k=2)
    context = "\n".join(retrieved_docs)

    # --- Answer with Gemini ---
    if model_choice == "gemini":
        if not os.environ["GOOGLE_API_KEY"]:
            return "❌ Gemini API key not found!"
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
        response = model.generate_content(prompt).text

    # --- Answer with MiniLM (local fallback) ---
    elif model_choice == "minilm":
        # simple extractive answer: return most relevant chunk
        response = context

    else:
        return "❌ Invalid model choice. Use 'gemini' or 'minilm'."

    # --- Output Filter ---
    safe_response = output_filter(response)

    print("\n📝 Raw Response:", response)
    print("\n🛡️ Redacted Response:", safe_response)

    return safe_response


In [44]:
# --- Try with Gemini ---
ans1 = rag_pipeline("What medicine is prescribed for Bronchitis?", model_choice="gemini")

# --- Try with MiniLM fallback ---
ans2 = rag_pipeline("What medicine is prescribed for Bronchitis?", model_choice="minilm")



🔎 User Query: What medicine is prescribed for Bronchitis?
⚙️ Using model: GEMINI

📝 Raw Response: Labore, Ex, Illo, and Harum are prescribed for Bronchitis.


🛡️ Redacted Response: Labore, Ex, [REDACTED_ORG], and [REDACTED_PERSON] are prescribed for [REDACTED_GPE].


🔎 User Query: What medicine is prescribed for Bronchitis?
⚙️ Using model: MINILM

📝 Raw Response: Patient Name: Harinakshi Raju
        Patient ID: PID74882
        DOB: 1958-01-10
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Labore', 'dosage': '1 tablets 1 times a day'}, {'medicine': 'Ex', 'dosage': '1 tablets 2 times a day'}]
        Lab Reports: [{'test': 'X-Ray', 'date': '2025-02-17', 'result': 'Abnormal'}]
        Email: yochana55@exaHple.org
        Phone: 03422630289
        Address: H.No. 17, Parmer Zila, North Dumdum 789712
Patient Name: Chakradhar Dora
        Patient ID: PID37346
        DOB: 2023-10-22
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Illo', 'dosage': '

In [45]:
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Step 1: Load JSONL again (docs already used for FAISS, but we re-read here) ---
with open(filename, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

print(f"✅ Loaded {len(dataset)} records for filter evaluation")

# --- Step 2: Build gold labels from structured fields ---
gold_sensitive_texts = []
raw_texts = []

for record in dataset:
    # Collect sensitive fields that *must* be redacted
    pii_fields = {
        "EMAIL": record.get("email", ""),
        "PHONE": record.get("phone", ""),
        "DOB": record.get("dob", ""),
        "PERSON": record.get("patient_name", ""),
    }

    for label, value in pii_fields.items():
        if value:  # only consider non-empty
            gold_sensitive_texts.append(f"[REDACTED_{label}]")
            raw_texts.append(f"{label}: {value}")

# --- Step 3: Apply output filter to raw texts ---
filtered_texts = [output_filter(text) for text in raw_texts]

# --- Step 4: Convert to binary labels ---
y_true, y_pred = [], []
for gold, raw, filt in zip(gold_sensitive_texts, raw_texts, filtered_texts):
    y_true.append(1)   # expect redaction
    if gold in filt:
        y_pred.append(1)  # correctly redacted
    else:
        y_pred.append(0)  # leaked

# --- Step 5: Metrics ---
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
leakage_rate = 1 - recall

print("🔎 Filter Evaluation on Full Dataset")
print("Precision:", round(precision, 2))
print("Recall:", round(recall, 2))
print("F1-score:", round(f1, 2))
print("Leakage Rate:", round(leakage_rate, 2))

# --- Step 6: Show some before/after examples ---
print("\n📂 Examples (before -> after):")
for i in range(5):
    print(f"\nRaw:      {raw_texts[i]}")
    print(f"Filtered: {filtered_texts[i]}")


✅ Loaded 10000 records for filter evaluation
🔎 Filter Evaluation on Full Dataset
Precision: 1.0
Recall: 0.84
F1-score: 0.91
Leakage Rate: 0.16

📂 Examples (before -> after):

Raw:      EMAIL: mannyashoda@example.org
Filtered: EMAIL: [REDACTED_EMAIL]

Raw:      PHONE: 03088767595
Filtered: PHONE: [REDACTED_PHONE]

Raw:      DOB: 2015A01-22
Filtered: [REDACTED_ORG]: 2015A01-22

Raw:      PERSON: Ayush Dugal
Filtered: PERSON: Ayush Dugal

Raw:      EMAIL: mgera@example.org
Filtered: EMAIL: [REDACTED_EMAIL]


In [46]:
user_query = input("Enter your query: ")

# --- Try with Gemini ---
ans1 = rag_pipeline(user_query, model_choice="gemini")

# --- Try with MiniLM fallback ---
ans2 = rag_pipeline(user_query, model_choice="minilm")

print("\n--- RAG Pipeline Results ---")
print("\nGemini Response (Redacted):")
print(ans1)
print("\nMiniLM Response (Redacted):")
print(ans2)

Enter your query: Which patient has bronchitis

🔎 User Query: Which patient has bronchitis
⚙️ Using model: GEMINI

📝 Raw Response: Both Harinakshi Raju and Shivani Bh0tnagar have bronchitis.


🛡️ Redacted Response: Both [REDACTED_PERSON] and [REDACTED_GPE] have bronchitis.


🔎 User Query: Which patient has bronchitis
⚙️ Using model: MINILM

📝 Raw Response: Patient Name: Harinakshi Raju
        Patient ID: PID74882
        DOB: 1958-01-10
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Labore', 'dosage': '1 tablets 1 times a day'}, {'medicine': 'Ex', 'dosage': '1 tablets 2 times a day'}]
        Lab Reports: [{'test': 'X-Ray', 'date': '2025-02-17', 'result': 'Abnormal'}]
        Email: yochana55@exaHple.org
        Phone: 03422630289
        Address: H.No. 17, Parmer Zila, North Dumdum 789712
Patient Name: Shivani Bh0tnagar
        Patient ID: PID74731
        DOB: 2015-12-09
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Repellat', 'dosage': '2 

### **Comparing BERT, GEMINI 1.5 Flash, MiniLM**

In [47]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)

print(f"✅ Loaded BERT model: {bert_model_name}")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✅ Loaded BERT model: bert-large-uncased-whole-word-masking-finetuned-squad


In [None]:
def rag_pipeline(query, model_choice="gemini"):
    """
    Run RAG with chosen model.
    model_choice = "gemini", "minilm", or "bert_qa"
    """
    print(f"\n🔎 User Query: {query}")
    print(f"⚙️ Using model: {model_choice.upper()}")

    # --- Input Filter ---
    blocked, msg = input_filter(query)
    if blocked:
        print(msg)
        return msg

    # --- Retrieve from FAISS ---
    retrieved_docs = search_faiss(query, k=2)
    context = "\n".join(retrieved_docs)

    # --- Answer with Gemini ---
    if model_choice == "gemini":
        if not os.environ["GOOGLE_API_KEY"]:
            return "❌ Gemini API key not found!"
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
        response = model.generate_content(prompt).text

    # --- Answer with MiniLM (local fallback) ---
    elif model_choice == "minilm":
        # simple extractive answer: return most relevant chunk
        response = context

    # --- Answer with BERT QA ---
    elif model_choice == "bert_qa":
        # BERT QA logic will go here in the next step
        response = "BERT QA placeholder" # Placeholder response

    else:
        return "❌ Invalid model choice. Use 'gemini', 'minilm', or 'bert_qa'."

    # --- Output Filter ---
    safe_response = output_filter(response)

    print("\n📝 Raw Response:", response)
    print("\n🛡️ Redacted Response:", safe_response)

    return safe_response

In [None]:
def rag_pipeline(query, model_choice="gemini"):
    """
    Run RAG with chosen model.
    model_choice = "gemini", "minilm", or "bert_qa"
    """
    print(f"\n🔎 User Query: {query}")
    print(f"⚙️ Using model: {model_choice.upper()}")

    # --- Input Filter ---
    blocked, msg = input_filter(query)
    if blocked:
        print(msg)
        return msg

    # --- Retrieve from FAISS ---
    retrieved_docs = search_faiss(query, k=2)
    context = "\n".join(retrieved_docs)

    # --- Answer with Gemini ---
    if model_choice == "gemini":
        if not os.environ["GOOGLE_API_KEY"]:
            return "❌ Gemini API key not found!"
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
        response = model.generate_content(prompt).text

    # --- Answer with MiniLM (local fallback) ---
    elif model_choice == "minilm":
        # simple extractive answer: return most relevant chunk
        response = context

    # --- Answer with BERT QA ---
    elif model_choice == "bert_qa":
        # Tokenize the query and context
        inputs = bert_tokenizer(query, context, return_tensors="pt", truncation=True)

        # Get predictions from the BERT model
        outputs = bert_model(**inputs)

        # Extract the start and end token indices with the highest prediction scores
        answer_start_index = outputs.start_logits.argmax()
        answer_end_index = outputs.end_logits.argmax()

        # Convert the start and end token indices back to the corresponding text span
        answer_tokens = inputs.input_ids[0, answer_start_index:answer_end_index + 1]
        response = bert_tokenizer.decode(answer_tokens)

        # Handle cases where BERT might not find an answer
        if response.startswith("[CLS]") or response.endswith("[SEP]"):
             response = "Could not find a specific answer in the provided context."


    else:
        return "❌ Invalid model choice. Use 'gemini', 'minilm', or 'bert_qa'."

    # --- Output Filter ---
    safe_response = output_filter(response)

    print("\n📝 Raw Response:", response)
    print("\n🛡️ Redacted Response:", safe_response)

    return safe_response

In [None]:
user_input = input("Enter your query (and optionally, model: gemini, minilm, bert_qa, or mistral): ")

# Parse user input for query and model choice
parts = user_input.split(" model:")
user_query = parts[0].strip()
if len(parts) > 1:
    model_choice = parts[1].strip().lower()
else:
    model_choice = "gemini" # Default to gemini

print(f"Parsed Query: {user_query}")
print(f"Selected Model: {model_choice}")


# --- Try with the selected model ---
ans = rag_pipeline(user_query, model_choice=model_choice)

print("\n--- RAG Pipeline Results ---")
print(f"\n{model_choice.capitalize()} Response (Redacted):")
print(ans)

Enter your query (and optionally, model: gemini, minilm, bert_qa, or mistral): which patient has bronchitis, model:bert_qa
Parsed Query: which patient has bronchitis,
Selected Model: bert_qa

🔎 User Query: which patient has bronchitis,
⚙️ Using model: BERT_QA

📝 Raw Response: harinakshi raju

🛡️ Redacted Response: [REDACTED_PERSON] raju

--- RAG Pipeline Results ---

Bert_qa Response (Redacted):
[REDACTED_PERSON] raju


In [None]:
# Test with BERT QA
user_query_bert = "What medicine is prescribed for Bronchitis? model: bert_qa"
ans_bert = rag_pipeline(user_query_bert, model_choice="bert_qa")

print("\n--- Comparison ---")
print("\nBERT QA Response:")
print(ans_bert)

# Test with Gemini
user_query_gemini = "What medicine is prescribed for Bronchitis? model: gemini"
ans_gemini = rag_pipeline(user_query_gemini, model_choice="gemini")

print("\nGemini Response:")
print(ans_gemini)

# Test with MiniLM
user_query_minilm = "What medicine is prescribed for Bronchitis? model: minilm"
ans_minilm = rag_pipeline(user_query_minilm, model_choice="minilm")

print("\nMiniLM Response:")
print(ans_minilm)


🔎 User Query: What medicine is prescribed for Bronchitis? model: bert_qa
⚙️ Using model: BERT_QA

📝 Raw Response: perferendis

🛡️ Redacted Response: perferendis

--- Comparison ---

BERT QA Response:
perferendis

🔎 User Query: What medicine is prescribed for Bronchitis? model: gemini
⚙️ Using model: GEMINI

📝 Raw Response: Labore, Ex, and Qui are prescribed for Bronchitis.


🛡️ Redacted Response: Labore, Ex, and [REDACTED_PERSON] are prescribed for [REDACTED_GPE].


Gemini Response:
Labore, Ex, and [REDACTED_PERSON] are prescribed for [REDACTED_GPE].


🔎 User Query: What medicine is prescribed for Bronchitis? model: minilm
⚙️ Using model: MINILM

📝 Raw Response: Patient Name: Harinakshi Raju
        Patient ID: PID74882
        DOB: 1958-01-10
        Diagnosis: Bronchitis
        Prescriptions: [{'medicine': 'Labore', 'dosage': '1 tablets 1 times a day'}, {'medicine': 'Ex', 'dosage': '1 tablets 2 times a day'}]
        Lab Reports: [{'test': 'X-Ray', 'date': '2025-02-17', 'result': '