In [1]:
import pandas as pd

# Load the specific dataset
filename = 'symptoms_with_remedies.xlsx'

try:
    df = pd.read_excel(filename)
    print("‚úÖ File loaded successfully!")
    
    # Check the column names to understand the text structure
    print("\n--- Column Names ---")
    print(df.columns.tolist())
    
    # Check the first few rows to understand the content
    print("\n--- First 3 Rows ---")
    print(df.head(3))

except FileNotFoundError:
    print(f"‚ùå Error: Could not find '{filename}'. Make sure it is in the same folder as your notebook.")

‚úÖ File loaded successfully!

--- Column Names ---
['Unnamed: 0', 'Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Remedies']

--- First 3 Rows ---
   Unnamed: 0           Disease   Symptom_1              Symptom_2  \
0           0  Fungal infection     itching              skin_rash   
1           1  Fungal infection   skin_rash   nodal_skin_eruptions   
2           2  Fungal infection     itching   nodal_skin_eruptions   

               Symptom_3             Symptom_4  \
0   nodal_skin_eruptions   dischromic _patches   
1    dischromic _patches                   NaN   
2    dischromic _patches                   NaN   

                                            Remedies  
0  Apply antifungal cream, keep area dry, avoid t...  
1  Apply antifungal cream, keep area dry, avoid t...  
2  Apply antifungal cream, keep area dry, avoid t...  


In [2]:
import pandas as pd
import numpy as np

# 1. CLEANING: Fill missing values with empty strings
df.fillna('', inplace=True)

# 2. NLP PRE-PROCESSING: Combine all symptoms into one text column for easier matching
# We create a 'Description' column that the NLP model will search against
df['Combined_Symptoms'] = df['Symptom_1'] + ", " + df['Symptom_2'] + ", " + df['Symptom_3'] + ", " + df['Symptom_4']

# 3. FEATURE ENGINEERING: Create the Missing Doctor Data
# We define a mapping logic: Disease Category -> Specialist
def assign_doctor(disease):
    disease = disease.lower()
    if 'heart' in disease or 'cardio' in disease:
        return {'Specialist': 'Cardiologist', 'Name': 'Dr. A. Sharma', 'Time': '10:00 AM - 2:00 PM', 'Location': 'City Heart Center, Delhi'}
    elif 'fungal' in disease or 'skin' in disease or 'rash' in disease:
        return {'Specialist': 'Dermatologist', 'Name': 'Dr. P. Verma', 'Time': '4:00 PM - 8:00 PM', 'Location': 'Skin Care Clinic, Mumbai'}
    elif 'stomach' in disease or 'digestion' in disease:
        return {'Specialist': 'Gastroenterologist', 'Name': 'Dr. R. Gupta', 'Time': '11:00 AM - 3:00 PM', 'Location': 'Digestive Care, Bangalore'}
    else:
        # Default General Physician for other diseases
        return {'Specialist': 'General Physician', 'Name': 'Dr. S. Kumar', 'Time': '9:00 AM - 5:00 PM', 'Location': 'City Hospital, Main Wing'}

# Apply this logic to create new columns
doctor_info = df['Disease'].apply(assign_doctor)
df['Doctor_Name'] = doctor_info.apply(lambda x: x['Name'])
df['Doctor_Specialist'] = doctor_info.apply(lambda x: x['Specialist'])
df['Doctor_Time'] = doctor_info.apply(lambda x: x['Time'])
df['Doctor_Location'] = doctor_info.apply(lambda x: x['Location'])

# Show the new enhanced dataset
print("‚úÖ Data Enriched with Doctor Details!")
print(df[['Disease', 'Combined_Symptoms', 'Doctor_Name', 'Doctor_Location']].head(3))

‚úÖ Data Enriched with Doctor Details!
            Disease                                  Combined_Symptoms  \
0  Fungal infection  itching,  skin_rash,  nodal_skin_eruptions,  d...   
1  Fungal infection   skin_rash,  nodal_skin_eruptions,  dischromic...   
2  Fungal infection  itching,  nodal_skin_eruptions,  dischromic _p...   

    Doctor_Name           Doctor_Location  
0  Dr. P. Verma  Skin Care Clinic, Mumbai  
1  Dr. P. Verma  Skin Care Clinic, Mumbai  
2  Dr. P. Verma  Skin Care Clinic, Mumbai  


In [4]:
# 1. Install the library for free embeddings
!pip install sentence-transformers

# 2. Import necessary libraries
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 3. Initialize the Free Embedding Model
# This runs locally on your CPU (no API key needed)
print("Loading free embedding model... (This might take a minute)")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 4. Create the Vector Database
try:
    print("Creating Vector Database...")
    # 'documents' is the list we created in the previous step
    vector_db = FAISS.from_documents(documents, embeddings)
    print("‚úÖ Vector Database created successfully using HuggingFace!")
    print("The AI is now ready to search your data for free.")
except Exception as e:
    print(f"‚ùå Error: {e}")

Loading free embedding model... (This might take a minute)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating Vector Database...
‚ùå Error: Could not import faiss python package. Please install it with `pip install faiss-gpu` (for CUDA supported GPU) or `pip install faiss-cpu` (depending on Python version).


In [1]:
# 1. Ensure FAISS is installed
!pip install faiss-cpu

# 2. Re-import everything (Necessary after restart)
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document

# 3. Re-load your dataframe (Since memory was cleared)
df = pd.read_excel('symptoms_with_remedies.xlsx')
df.fillna('', inplace=True)
df['Combined_Symptoms'] = df['Symptom_1'] + ", " + df['Symptom_2'] + ", " + df['Symptom_3'] + ", " + df['Symptom_4']

# Re-create the doctor logic
def assign_doctor(disease):
    disease = disease.lower()
    if 'heart' in disease or 'cardio' in disease:
        return {'Specialist': 'Cardiologist', 'Name': 'Dr. A. Sharma', 'Time': '10:00 AM - 2:00 PM', 'Location': 'City Heart Center, Delhi'}
    elif 'fungal' in disease or 'skin' in disease or 'rash' in disease:
        return {'Specialist': 'Dermatologist', 'Name': 'Dr. P. Verma', 'Time': '4:00 PM - 8:00 PM', 'Location': 'Skin Care Clinic, Mumbai'}
    elif 'stomach' in disease or 'digestion' in disease:
        return {'Specialist': 'Gastroenterologist', 'Name': 'Dr. R. Gupta', 'Time': '11:00 AM - 3:00 PM', 'Location': 'Digestive Care, Bangalore'}
    else:
        return {'Specialist': 'General Physician', 'Name': 'Dr. S. Kumar', 'Time': '9:00 AM - 5:00 PM', 'Location': 'City Hospital, Main Wing'}

doctor_info = df['Disease'].apply(assign_doctor)
df['Doctor_Name'] = doctor_info.apply(lambda x: x['Name'])
df['Doctor_Specialist'] = doctor_info.apply(lambda x: x['Specialist'])
df['Doctor_Time'] = doctor_info.apply(lambda x: x['Time'])
df['Doctor_Location'] = doctor_info.apply(lambda x: x['Location'])

# 4. Prepare Documents again
documents = []
for index, row in df.iterrows():
    searchable_text = f"Disease: {row['Disease']}. Symptoms: {row['Combined_Symptoms']}"
    meta_data = {
        "disease": row['Disease'],
        "remedies": row['Remedies'],
        "doc_name": row['Doctor_Name'],
        "doc_spec": row['Doctor_Specialist'],
        "doc_time": row['Doctor_Time'],
        "doc_loc": row['Doctor_Location']
    }
    doc = Document(page_content=searchable_text, metadata=meta_data)
    documents.append(doc)

# 5. Create Vector DB
print("Loading Model & Creating Database...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("‚úÖ Success! Vector Database is ready.")

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ihars\\anaconda3\\Lib\\site-packages\\~umpy\\.libs\\libopenblas64__v0.3.21-gcc_10_3_0.dll'
Consider using the `--user` option or check the permissions.



Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-cp39-win_amd64.whl (18.7 MB)
     ---------------------------------------- 18.7/18.7 MB 8.6 MB/s eta 0:00:00
Collecting numpy<3.0,>=1.25.0
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Installing collected packages: numpy, faiss-cpu
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
!pip install "numpy<2.0.0" --force-reinstall

Collecting numpy<2.0.0
  Downloading numpy-1.26.4-cp39-cp39-win_amd64.whl (15.8 MB)
     --------------------------------------- 15.8/15.8 MB 10.7 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ihars\\anaconda3\\Lib\\site-packages\\~umpy.libs\\libscipy_openblas64_-caad452230ae4ddb57899b8b3a33c55c.dll'
Consider using the `--user` option or check the permissions.



In [1]:
import pandas as pd
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.neighbors import NearestNeighbors

# 1. LOAD DATA (Standard steps)
df = pd.read_excel('symptoms_with_remedies.xlsx')
df.fillna('', inplace=True)
df['Combined_Symptoms'] = df['Symptom_1'] + ", " + df['Symptom_2'] + ", " + df['Symptom_3'] + ", " + df['Symptom_4']

# Re-apply Doctor Logic
def assign_doctor(disease):
    disease = disease.lower()
    if 'heart' in disease or 'cardio' in disease:
        return {'Specialist': 'Cardiologist', 'Name': 'Dr. A. Sharma', 'Time': '10:00 AM - 2:00 PM', 'Location': 'City Heart Center, Delhi'}
    elif 'fungal' in disease or 'skin' in disease or 'rash' in disease:
        return {'Specialist': 'Dermatologist', 'Name': 'Dr. P. Verma', 'Time': '4:00 PM - 8:00 PM', 'Location': 'Skin Care Clinic, Mumbai'}
    elif 'stomach' in disease or 'digestion' in disease:
        return {'Specialist': 'Gastroenterologist', 'Name': 'Dr. R. Gupta', 'Time': '11:00 AM - 3:00 PM', 'Location': 'Digestive Care, Bangalore'}
    else:
        return {'Specialist': 'General Physician', 'Name': 'Dr. S. Kumar', 'Time': '9:00 AM - 5:00 PM', 'Location': 'City Hospital, Main Wing'}

doctor_info = df['Disease'].apply(assign_doctor)
df['Doctor_Name'] = doctor_info.apply(lambda x: x['Name'])
df['Doctor_Specialist'] = doctor_info.apply(lambda x: x['Specialist'])
df['Doctor_Time'] = doctor_info.apply(lambda x: x['Time'])
df['Doctor_Location'] = doctor_info.apply(lambda x: x['Location'])

# 2. CREATE EMBEDDINGS (The "Brain")
print("Loading Embedding Model...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Generating vectors for all diseases... (This takes a moment)")
# We convert all symptoms text into numbers
symptom_vectors = embedding_model.embed_documents(df['Combined_Symptoms'].tolist())

# 3. BUILD SEARCH ENGINE (Using Scikit-Learn)
print("Building Search Engine...")
# We use NearestNeighbors to find the closest matching vector
knn = NearestNeighbors(n_neighbors=1, metric='cosine')
knn.fit(symptom_vectors)

print("‚úÖ Success! The chatbot brain is ready.")

Loading Embedding Model...


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Generating vectors for all diseases... (This takes a moment)
Building Search Engine...
‚úÖ Success! The chatbot brain is ready.


In [None]:
from deep_translator import GoogleTranslator
import time

def find_best_match(user_query):
    # 1. Translate Hindi Query -> English (for the database)
    translator_en = GoogleTranslator(source='auto', target='en')
    query_en = translator_en.translate(user_query)
    
    # 2. Convert Query to Numbers (Vector)
    query_vector = embedding_model.embed_query(query_en)
    
    # 3. Find Closest Match
    # reshapes vector to match format expected by knn
    distances, indices = knn.kneighbors([query_vector])
    
    # Get the best matching row from our Data
    best_match_index = indices[0][0]
    result = df.iloc[best_match_index]
    
    return result

def chat_interface():
    print("ü§ñ Sehat Sathi is Ready! (Type 'quit' to stop)")
    print("------------------------------------------------")
    
    translator_hi = GoogleTranslator(source='auto', target='hi')
    
    while True:
        # Get User Input
        user_input = input("\nüë§ You (Hindi/English): ")
        
        if user_input.lower() in ['quit', 'exit', 'stop']:
            print("ü§ñ Sehat Sathi: ‡§Ö‡§™‡§®‡§æ ‡§ñ‡•ç‡§Ø‡§æ‡§≤ ‡§∞‡§ñ‡•á‡§Ç! (Take care!)")
            break
        
        print("ü§ñ Thinking...")
        
        try:
            # Find the best medical match
            match = find_best_match(user_input)
            
            # Construct the English Answer
            response_en = (
                f"Based on your symptoms, it seems like you have {match['Disease']}.\n\n"
                f"üíä Remedy: {match['Remedies']}\n\n"
                f"üë®‚Äç‚öïÔ∏è Suggested Doctor Appointment:\n"
                f"   - Name: {match['Doctor_Name']} ({match['Doctor_Specialist']})\n"
                f"   - Time: {match['Doctor_Time']}\n"
                f"   - Location: {match['Doctor_Location']}\n"
            )
            
            # Translate Answer to Hindi
            response_hi = translator_hi.translate(response_en)
            
            # Display Output
            print(f"ü§ñ Sehat Sathi: {response_hi}")
            
        except Exception as e:
            print(f"‚ùå Error: {e}")

# START THE CHAT
chat_interface()

ü§ñ Sehat Sathi is Ready! (Type 'quit' to stop)
------------------------------------------------

üë§ You (Hindi/English): ‡§Æ‡•Å‡§ù‡•á ‡§§‡•ç‡§µ‡§ö‡§æ ‡§™‡§∞ ‡§¨‡§π‡•Å‡§§ ‡§ñ‡•Å‡§ú‡§≤‡•Ä ‡§π‡•ã ‡§∞‡§π‡•Ä ‡§π‡•à
ü§ñ Thinking...
ü§ñ Sehat Sathi: ‡§Ü‡§™‡§ï‡•á ‡§≤‡§ï‡•ç‡§∑‡§£‡•ã‡§Ç ‡§ï‡•á ‡§Ü‡§ß‡§æ‡§∞ ‡§™‡§∞, ‡§ê‡§∏‡§æ ‡§≤‡§ó‡§§‡§æ ‡§π‡•à ‡§ï‡§ø ‡§Ü‡§™‡§ï‡•ã ‡§ö‡§ø‡§ï‡§® ‡§™‡•â‡§ï‡•ç‡§∏ ‡§π‡•à‡•§

üíä‡§â‡§™‡§æ‡§Ø: ‡§â‡§ö‡§ø‡§§ ‡§â‡§™‡§ö‡§æ‡§∞ ‡§ï‡•á ‡§≤‡§ø‡§è ‡§°‡•â‡§ï‡•ç‡§ü‡§∞ ‡§∏‡•á ‡§™‡§∞‡§æ‡§Æ‡§∞‡•ç‡§∂ ‡§≤‡•á‡§Ç‡•§

üë®‚Äç‚öïÔ∏è ‡§∏‡•Å‡§ù‡§æ‡§è ‡§ó‡§è ‡§°‡•â‡§ï‡•ç‡§ü‡§∞ ‡§ï‡•Ä ‡§®‡§ø‡§Ø‡•Å‡§ï‡•ç‡§§‡§ø:
   - ‡§®‡§æ‡§Æ: ‡§°‡•â. ‡§è‡§∏. ‡§ï‡•Å‡§Æ‡§æ‡§∞ (‡§ú‡§®‡§∞‡§≤ ‡§´‡§ø‡§ú‡§ø‡§∂‡§ø‡§Ø‡§®)
   - ‡§∏‡§Æ‡§Ø: ‡§∏‡•Å‡§¨‡§π 9:00 ‡§¨‡§ú‡•á ‡§∏‡•á ‡§∂‡§æ‡§Æ 5:00 ‡§¨‡§ú‡•á ‡§§‡§ï
   - ‡§∏‡•ç‡§•‡§æ‡§®: ‡§∏‡§ø‡§ü‡•Ä ‡§π‡•â‡§∏‡•ç‡§™‡§ø‡§ü‡§≤, ‡§Æ‡•á‡§® ‡§µ‡§ø‡§Ç‡§ó

üë§ You (Hindi/English): ‡§Æ‡•Å‡§ù‡•á ‡§∏‡•Ä‡§®‡•á ‡§Æ‡•á‡§Ç ‡§¶‡§∞‡•ç‡§¶ ‡§π‡•à
ü§ñ Thinking...
ü§ñ Sehat Sathi: ‡§Ü‡§™‡§ï‡

2026-01-05 14:32:01.303 Session state does not function when running a script without `streamlit run`
