In [31]:
# Importing Libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [32]:
# Read the MedQuAD question/answer table, discard every row that contains a
# missing value, and renumber the remaining rows 0..n-1 so that positional
# and label-based lookups agree.
df = (
    pd.read_csv('./data-collection/qa/medquad.csv')
    .dropna()
    .reset_index(drop=True)
)
df


Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
...,...,...,...,...
16388,What is (are) Diabetic Neuropathies: The Nerve...,Focal neuropathy appears suddenly and affects ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16389,How to prevent Diabetic Neuropathies: The Nerv...,The best way to prevent neuropathy is to keep ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16390,How to diagnose Diabetic Neuropathies: The Ner...,Doctors diagnose neuropathy on the basis of sy...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16391,What are the treatments for Diabetic Neuropath...,The first treatment step is to bring blood glu...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...


In [33]:
# Load the sentence-embedding model used for both answers and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every answer in the dataset once; the result is kept as a torch
# tensor with one embedding row per row of `df`.
answer_embeddings = model.encode(df['answer'].tolist(), convert_to_tensor=True)

# Persist the embeddings to disk so they can be reloaded instead of recomputed.
# The tensor may live on an accelerator device (e.g. CUDA), and Tensor.numpy()
# only works for CPU tensors, so copy it to host memory first. `.cpu()` is a
# no-op when the tensor is already on the CPU.
np.save('answer_embeddings.npy', answer_embeddings.detach().cpu().numpy())

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [34]:
def _to_host_array(vectors):
    """Return ``vectors`` as a NumPy array, copying torch tensors to the CPU first."""
    # Duck-typed check so this works for torch tensors without importing torch:
    # anything exposing `.detach()` is treated as a tensor.
    if hasattr(vectors, "detach"):
        vectors = vectors.detach().cpu().numpy()
    return np.asarray(vectors)


def find_closest_answers(query, embeddings, answers, top_k=5):
    """Return the ``top_k`` answers whose embeddings are most similar to ``query``.

    The query is encoded with the module-level ``model`` and compared against
    ``embeddings`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text question to look up.
    embeddings : array-like or torch.Tensor
        One embedding row per entry of ``answers``. Tensors on any device are
        accepted; they are copied to host memory before the comparison.
    answers : sequence
        Answer texts, indexed in the same order as the rows of ``embeddings``.
    top_k : int or None, default 5
        Maximum number of results. ``None`` returns every answer; zero or a
        negative value returns an empty list.

    Returns
    -------
    list of tuple
        ``(answer, similarity)`` pairs ordered from most to least similar.
    """
    # Nothing to rank: avoid encoding the query, and avoid handing an empty
    # matrix to cosine_similarity.
    if (top_k is not None and top_k <= 0) or len(answers) == 0:
        return []

    # Encode the query into the same vector space as the answers. scikit-learn
    # works on host NumPy arrays, so both operands are normalised to that form.
    query_embedding = _to_host_array(model.encode([query], convert_to_numpy=True))
    answer_matrix = _to_host_array(embeddings)

    # Row 0 holds the similarities of the single query against every answer.
    similarities = cosine_similarity(query_embedding, answer_matrix)[0]

    # Indices of the highest-scoring answers, best first.
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [(answers[i], similarities[i]) for i in top_indices]

In [None]:
# Demo: look up the stored answers that best match a sample question.
query = "What is glaucoma?"
answer_texts = df['answer'].tolist()
closest_answers = find_closest_answers(query, answer_embeddings, answer_texts)

# Each result is an (answer, similarity) pair; only the answer text is printed.
for answer, score in closest_answers:
    print(f"* {answer}")