# Import Libraries

In [22]:
import json
import torch
import numpy as np
from tqdm import tqdm
import faiss
from sentence_transformers import SentenceTransformer
import os

# Load Sentence Transformer

In [12]:
# Identifier of the pretrained sentence-embedding model handed to SentenceTransformer.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Module-level encoder: get_batch_embeddings() and search_faiss() call model.encode() on it,
# so this cell must run before either of them is used.
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Load NHS Data

In [13]:
def load_text_data(json_file):
    """Load disease records from a JSON file and flatten each into one text block.

    Args:
        json_file: Path to a UTF-8 encoded JSON file holding a list of objects.
            Each object may carry "Disease", "Symptoms" and "Treatments" keys;
            the latter two are normally lists of strings.

    Returns:
        list[str]: One combined string per record, in file order, with the
        disease, symptoms and treatments each on their own labelled line.
    """
    def _as_text(value):
        # A JSON null (or missing key) becomes an empty field instead of
        # crashing str.join, and a bare string is kept whole instead of being
        # joined character by character.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        return " ".join(str(part) for part in value if part is not None)

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = []
    for item in data:
        # Combine Disease, Symptoms and Treatments into a single text block.
        disease = item.get("Disease")
        if disease is None:
            disease = ""
        symptoms = _as_text(item.get("Symptoms"))
        treatments = _as_text(item.get("Treatments"))
        combined_text = f"Disease: {disease}\nSymptoms: {symptoms}\nTreatments: {treatments}"
        texts.append(combined_text)
    return texts

# Get Embeddings in Batches

In [14]:
def get_batch_embeddings(texts, batch_size=16):
    """Encode ``texts`` with the module-level ``model``, one slice at a time.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        A float32 NumPy array assembled from the per-text embeddings, in
        input order.
    """
    vectors = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Computing embeddings"):
        chunk = texts[start : start + batch_size]
        # encode() returns one row per text; collect the rows individually so
        # the final array is built from a flat list of vectors.
        vectors += list(model.encode(chunk, convert_to_numpy=True))
    return np.array(vectors, dtype="float32")

# Build FAISS Index using IVFFlat

In [15]:
def build_faiss_index(texts, nlist=100):
    """Embed ``texts`` and build a trained FAISS IVF-Flat (L2) index over them.

    Args:
        texts: Non-empty sequence of strings to index.
        nlist: Requested number of IVF clusters. It is capped at the number of
            texts, because FAISS cannot train more centroids than it has
            training vectors.

    Returns:
        tuple: ``(index, texts)`` - the populated index and the same ``texts``
        object that was passed in.

    Raises:
        ValueError: If ``texts`` is empty or ``nlist`` is not positive.
    """
    # Fail early with a clear message instead of an IndexError on shape[1].
    if len(texts) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of texts.")
    if nlist < 1:
        raise ValueError("nlist must be a positive integer.")

    print("\nComputing text embeddings...")
    embeddings = get_batch_embeddings(texts)

    print("\nBuilding FAISS index...")
    dimension = embeddings.shape[1]
    # Small corpora would otherwise make index.train() fail: k-means needs at
    # least as many training vectors as clusters.
    n_clusters = min(nlist, embeddings.shape[0])
    quantizer = faiss.IndexFlatL2(dimension)  # Used for clustering
    index = faiss.IndexIVFFlat(quantizer, dimension, n_clusters)

    index.train(embeddings)  # Train FAISS with embeddings
    index.add(embeddings)  # Add embeddings to index

    return index, texts

# Save FAISS index and text data

In [16]:
def save_retrieval_system(index, texts, index_file, texts_file):
    """Persist the FAISS index and its companion texts to disk.

    Args:
        index: FAISS index, written with ``faiss.write_index``.
        texts: JSON-serialisable texts, dumped as UTF-8 JSON with a 4-space
            indent.
        index_file: Destination path for the index.
        texts_file: Destination path for the JSON texts.
    """
    print("\nSaving FAISS index and text data...")

    # The index is written first, then the texts.
    faiss.write_index(index, index_file)
    with open(texts_file, mode="w", encoding="utf-8") as out:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \u escapes.
        json.dump(texts, out, ensure_ascii=False, indent=4)

    print("✅ FAISS retrieval system built and saved successfully!")

# Search FAISS

In [17]:
def search_faiss(query, index, texts, k=5):
    """Return the stored texts nearest to ``query`` according to ``index``.

    Args:
        query: Query string, embedded with the module-level ``model``.
        index: FAISS index to search.
        texts: Sequence of texts; the ids returned by the index are used as
            positions into it.
        k: Maximum number of neighbours to return.

    Returns:
        list[tuple]: Up to ``k`` ``(text, distance)`` pairs, in the order
        returned by ``index.search``. Fewer than ``k`` pairs are returned when
        the index cannot supply ``k`` neighbours.
    """
    query_embedding = model.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(query_embedding, k)

    # FAISS pads the result with id -1 when it finds fewer than k neighbours
    # (e.g. a sparsely populated IVF cluster, or k larger than the index).
    # texts[-1] would silently return the last text as a bogus hit, so those
    # slots are skipped.
    return [
        (texts[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

# Build and Save the Retrieval System

In [18]:
def build_and_save_text_retrieval_system(json_file, index_file, texts_file):
    """Run the whole pipeline: load records, build the index, write both files.

    Args:
        json_file: Path to the source JSON records.
        index_file: Destination path for the FAISS index.
        texts_file: Destination path for the JSON list of indexed texts.
    """
    corpus = load_text_data(json_file)
    faiss_index, corpus = build_faiss_index(corpus)
    save_retrieval_system(faiss_index, corpus, index_file, texts_file)

# Main: Build the Index from the NHS Dataset

In [23]:
# Build the paths with os.path.join so the notebook also runs on non-Windows
# systems; on Windows they resolve to the same ..\..\dataset\nhsInform\ files.
DATA_DIR = os.path.join("..", "..", "dataset", "nhsInform")
text_data_file = os.path.join(DATA_DIR, "NHS_Data.json")
index_file = os.path.join(DATA_DIR, "faiss_index.bin")
texts_file = os.path.join(DATA_DIR, "texts.json")

build_and_save_text_retrieval_system(text_data_file, index_file, texts_file)


Computing text embeddings...


Computing embeddings: 100%|██████████| 26/26 [00:00<00:00, 37.25it/s]


Building FAISS index...

Saving FAISS index and text data...
✅ FAISS retrieval system built and saved successfully!





# Test: Query the Saved Index

In [24]:
# Reload both files from disk so the check exercises what was actually saved.
index = faiss.read_index(index_file)
with open(texts_file, "r", encoding="utf-8") as f:
    texts = json.loads(f.read())

query = "What are the symptoms of pneumonia?"
results = search_faiss(query, index, texts, k=3)

print("\n🔍 Search Results:")
for i, hit in enumerate(results):
    # Each hit is a (text, distance) pair from search_faiss.
    text, score = hit
    print(f"{i+1}. {text} (Score: {score:.4f})")


🔍 Search Results:
1. Disease: Pneumonia
Symptoms: Your GP may be able to diagnose pneumonia by asking about your symptoms and examining your chest. Further tests may be needed in some cases. Pneumonia can be difficult to diagnose because it has similar symptoms to other conditions. For example the common cold , bronchitis and asthma . To help make a diagnosis, your GP may ask you: whether you feel breathless if you’re breathing faster than usual how long you’ve had your cough whether you’re coughing up mucus and if so, what colour it is if the pain in your chest is worse when you breathe in or out
Treatments: You may need treatment in hospital if your symptoms are severe. You may be given antibiotics and fluids intravenously through a drip, and you may need oxygen if your blood oxygen levels are low. In very serious cases of pneumonia, breathing assistance through a ventilator in a high dependency unit (HDU) or an intensive care unit (ICU) may be required. (Score: 0.7243)
2. Disease: 