<a href="https://colab.research.google.com/github/iserveradmin/samentic/blob/main/Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers faiss-cpu scikit-learn gradio pandas gemini_model



In [None]:
import pandas as pd
import re

# Hand-written seed examples as (question, answer) pairs about finding doctors
# in Dwarka, Delhi, and other Indian cities.
seed_pairs = [
    ("I need a doctor for sugar patient in Dwarka tomorrow",
     "You can consult Dr. A (Endocrinologist, Dwarka) or Dr. B (Diabetologist, Dwarka Sector-6) for diabetes care."),
    ("Best dentist available in Dwarka today?",
     "Dr. Mehta Dental Clinic in Dwarka Sector-12 is available today for dental care."),
    ("Who is the best pediatrician in South Delhi?",
     "Dr. Sharma at Apollo Cradle, South Delhi, is a highly recommended pediatrician."),
    ("Where can I find a gynecologist in Saket, Delhi?",
     "You can consult Dr. Priya at Max Super Speciality Hospital, Saket."),
    ("I have stomach pain, which doctor should I consult?",
     "You should consult a gastroenterologist for stomach pain issues."),
    ("Need orthopedic doctor in Noida Sector 18",
     "Dr. R.K. Verma, Orthopedic Specialist, is available in Noida Sector 18."),
    ("Suggest a cardiologist in Gurgaon near Cyber City",
     "You may consult Dr. Kapoor, Senior Cardiologist at Fortis Hospital, Gurgaon."),
    ("Looking for a skin specialist in Dwarka",
     "Dr. Pooja, Dermatologist, runs a clinic in Dwarka Sector-10."),
    ("Who is a good neurologist in AIIMS Delhi?",
     "AIIMS Delhi has Dr. Anil Kumar, an experienced neurologist for brain and nerve-related issues."),
    ("ENT doctor available in Rohini today?",
     "Dr. Sinha, ENT Specialist, is available at Rohini Sector-9 clinic today."),
]
qa_data = [{"question": asked, "Answer": reply} for asked, reply in seed_pairs]

# Building blocks for the templated entries: every city is paired with every
# (condition, specialist) combination.
cities = [
    "Dwarka", "Saket", "Noida", "Gurgaon", "Rohini",
    "South Delhi", "East Delhi", "West Delhi", "Mumbai", "Bangalore",
]
specialties = [
    ("diabetes", "Endocrinologist/Diabetologist"),
    ("heart issues", "Cardiologist"),
    ("stomach pain", "Gastroenterologist"),
    ("skin problems", "Dermatologist"),
    ("children health", "Pediatrician"),
    ("bones pain", "Orthopedic"),
    ("ear pain", "ENT Specialist"),
    ("mental health", "Psychiatrist"),
    ("eyes checkup", "Ophthalmologist"),
    ("fever and cold", "General Physician"),
]

# Append the templated entries city by city (outer loop) and specialty by
# specialty (inner loop), after the seed examples.
qa_data.extend(
    {
        "question": f"I need a doctor for {condition} in {city}",
        "Answer": f"You should consult a {doctor} in {city} for {condition} treatment.",
    }
    for city in cities
    for condition, doctor in specialties
)

# Cap the dataset at 100 entries. 10 seed + 10 cities x 10 specialties = 110,
# so this drops the last 10 generated rows (all of the "Bangalore" entries).
qa_data = qa_data[:100]

# Tabular view of the Q&A pairs: columns "question" and "Answer".
df = pd.DataFrame(qa_data)

# Text normalisation applied to stored questions and to incoming queries
def preprocess_text(text):
    """Return ``text`` lowercased with punctuation removed.

    Every character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted outright, not replaced by a space, so
    ``"Sector-12"`` becomes ``"sector12"``.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Normalise every stored question in place; the "Answer" column is left as written.
df['question'] = df['question'].map(preprocess_text)

# Preview the first rows of the cleaned table.
display(df.head())

Unnamed: 0,question,Answer
0,i need a doctor for sugar patient in dwarka to...,"You can consult Dr. A (Endocrinologist, Dwarka..."
1,best dentist available in dwarka today,Dr. Mehta Dental Clinic in Dwarka Sector-12 is...
2,who is the best pediatrician in south delhi,"Dr. Sharma at Apollo Cradle, South Delhi, is a..."
3,where can i find a gynecologist in saket delhi,You can consult Dr. Priya at Max Super Special...
4,i have stomach pain which doctor should i consult,You should consult a gastroenterologist for st...


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re

# Load a pre-trained sentence transformer model
# You can try different models here, e.g., 'all-mpnet-base-v2' for potentially better results
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE: preprocess_text is intentionally NOT redefined here. It is defined once,
# in the data-preparation cell that also builds `df`; this cell already needs
# `df` from that cell, so the helper is reused instead of being shadowed by an
# identical copy.

# Generate embeddings for the (already preprocessed) questions
question_embeddings = model.encode(df['question'].tolist())

# The index must be created with the embedding width, taken from the array's second axis
dimension = question_embeddings.shape[1]

# Create a FAISS index (using a simple IndexFlatL2 for demonstration)
index = faiss.IndexFlatL2(dimension)

# Add the embeddings to the index; row i of the index corresponds to row i of `df`
index.add(question_embeddings)

print(f"Number of embeddings in the index: {index.ntotal}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Number of embeddings in the index: 100


In [None]:
# Query to look up, and how many nearest neighbours to retrieve for it
query_question = "stomach pain doctor near me"
k = 3

# Embed the query after the same preprocessing that was applied to the stored questions
query_embedding = model.encode([preprocess_text(query_question)])

# Search the index; both results have one row per query (a single row here)
distances, indices = index.search(query_embedding, k)

# Report the answer stored alongside each matched question, closest match first
print(f"Query: {query_question}")
print("\nBest matching questions and their answers:")
for row_position, distance in zip(indices[0], distances[0]):
    print(f"Answer: {df['Answer'].iloc[row_position]}")
    print(f"Distance: {distance:.4f}")
    print("-" * 20)

Query: stomach pain doctor near me

Best matching questions and their answers:
Answer: You should consult a gastroenterologist for stomach pain issues.
Distance: 0.4036
--------------------
Answer: You should consult a Gastroenterologist in Saket for stomach pain treatment.
Distance: 0.4591
--------------------
Answer: You should consult a Gastroenterologist in West Delhi for stomach pain treatment.
Distance: 0.4969
--------------------


Now, let's use the retrieved answers from the previous search to generate a response using the LLM. We'll create a prompt that includes the original query and the top answers from your similarity search, instructing the LLM to use only this information.

In [23]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

# Read GOOGLE_API_KEY from Colab secrets and hand it to the SDK.
# On failure, print a hint and re-raise so the rest of the cell does not run.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception as key_error:
    print(f"Error setting up Gemini API key: {key_error}")
    print("Please ensure you have set up the GOOGLE_API_KEY correctly in Colab secrets.")
    raise

# Create the Gemini model handle used for generation below.
# You can choose a different model if needed, e.g., 'gemini-pro'
try:
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')
except Exception as init_error:
    print(f"Error initializing Gemini model: {init_error}")
    # Re-raise: without a model there is nothing further this cell can do
    raise


# Get the original query and the top answers from the similarity-search cell
# (relies on `query_question`, `indices`, and `k` defined there, and on `df`).
query = query_question
retrieved_answers = [df['Answer'].iloc[indices[0][i]] for i in range(k)]

# Render the retrieved answers as a bulleted list, one answer per line.
# This is built outside the f-string on purpose: a backslash inside an f-string
# replacement field is a SyntaxError before Python 3.12, and on 3.12+ the
# previous '\\n- ' separator was a literal backslash followed by "n", so the
# answers reached the model joined by the two characters "\n" instead of by
# real line breaks.
retrieved_block = "\n".join(f"- {answer}" for answer in retrieved_answers)

# Craft a prompt for the LLM, emphasizing the use of only the provided context
prompt = f"""
Based ONLY on the following query and retrieved information, provide a helpful and conversational response.
Do NOT include any information not present in the provided context.

Query: "{query}"

Retrieved Information:
{retrieved_block}

Please synthesize this information and provide a clear answer to the user's query, strictly using the provided context.
"""

# Ask Gemini to answer from the prompt and show its reply. Any failure is
# reported with a setup hint instead of being raised.
try:
    response = gemini_model.generate_content(prompt)
    print("LLM's Response (based only on provided context):")
    reply_text = response.text
    print(reply_text)
except Exception as generation_error:
    print(
        f"An error occurred while generating the LLM response: {generation_error}",
        "Please ensure you have set up the GOOGLE_API_KEY correctly in Colab secrets and have run the cells to initialize the model.",
        sep="\n",
    )

LLM's Response (based only on provided context):
For stomach pain issues, you should consult a gastroenterologist. You can look for a Gastroenterologist in Saket or West Delhi for treatment.


In [22]:
# # Import the Python SDK
# import google.generativeai as genai

# import pprint
# for model in genai.list_models():
#     pprint.pprint(model)

Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko',
      description='Obtain a distributed representation of a text.',
      input_token_limit=1024,
      output_token_limit=1,
      supported_generation_methods=['embedText', 'countTextTokens'],
      temperature=None,
      max_temperature=None,
      top_p=None,
      top_k=None)
Model(name='models/gemini-2.5-pro-preview-03-25',
      base_model_id='',
      version='2.5-preview-03-25',
      display_name='Gemini 2.5 Pro Preview 03-25',
      description='Gemini 2.5 Pro Preview 03-25',
      input_token_limit=1048576,
      output_token_limit=65536,
      supported_generation_methods=['generateContent',
                                    'countTokens',
                                    'createCachedContent',
                                    'batchGenerateContent'],
      temperature=1.0,
      max_temperature=2.0,
      top_p=0.95,
      top_k=64)
Model(na

The embeddings of your questions are now stored in the `index` object using FAISS. You can now use this index to perform similarity searches.