In [2]:
import pandas as pd
import requests
import json
import re
from pathlib import Path
from time import sleep

class DoctorMapper:
    """Assign a doctor title to every disease in a DataFrame.

    Each disease is first looked up on English Wikipedia, reading the
    ``specialty`` / ``field`` / ``medical_specialty`` entry of the page's
    wikitext.  When that yields nothing, a keyword heuristic on the disease
    name is used instead.  Wikipedia answers are kept in a JSON cache file so
    that reruns do not repeat the lookups.
    """

    _API_URL = "https://en.wikipedia.org/w/api.php"
    _DEFAULT_SPECIALTY = "General Medicine"
    _DEFAULT_TITLE = "General Physician"

    # Ordered (specialty, keyword regex) pairs.  The first pattern that is
    # found anywhere in the lower-cased disease name wins, so order matters.
    _SPECIALTY_PATTERNS = (
        ('Cardiology', re.compile(r'heart|cardiac|cardio|coronary|myocardial|angina|arrhythmia|hypertension')),
        ('Neurology', re.compile(r'brain|neuro|stroke|seizure|epilepsy|parkinson|alzheimer|dementia')),
        ('Pulmonology', re.compile(r'lung|respiratory|asthma|copd|pneumonia|bronchitis')),
        ('Gastroenterology', re.compile(r'stomach|gastro|digestive|intestinal|colon|liver|hepatitis')),
        ('Endocrinology', re.compile(r'diabetes|thyroid|hormone|insulin|metabolism')),
        ('Dermatology', re.compile(r'skin|dermat|acne|eczema|psoriasis|rash|herpes')),
        ('Psychiatry', re.compile(r'mental|psych|depression|anxiety|bipolar|schizo')),
        ('Oncology', re.compile(r'cancer|tumor|carcinoma|sarcoma|leukemia')),
        ('Orthopedic Surgery', re.compile(r'bone|joint|fracture|arthritis|spine')),
        ('Urology', re.compile(r'kidney|urinary|bladder|prostate')),
        ('Obstetrics & Gynecology', re.compile(r'pregnancy|gyneco|obstetric|uterine|ovarian|vaginal')),
        ('Infectious Disease', re.compile(r'infection|viral|bacterial|fungal|hiv|aids|std|sexually transmitted')),
    )

    _DOCTOR_TITLES = {
        'Cardiology': 'Cardiologist',
        'Neurology': 'Neurologist',
        'Pulmonology': 'Pulmonologist',
        'Gastroenterology': 'Gastroenterologist',
        'Endocrinology': 'Endocrinologist',
        'Dermatology': 'Dermatologist',
        'Psychiatry': 'Psychiatrist',
        'Oncology': 'Oncologist',
        'Orthopedic Surgery': 'Orthopedic Surgeon',
        'Urology': 'Urologist',
        'Obstetrics & Gynecology': 'Obstetrician / Gynecologist',
        'Infectious Disease': 'Infectious Diseases Specialist',
        'General Medicine': 'General Physician',
    }
    # Lookup table keyed the way _standardize_specialty spells names
    # ('&' written as 'and'), compared case-insensitively.
    _TITLES_BY_KEY = {
        name.replace('&', 'and').lower(): title
        for name, title in _DOCTOR_TITLES.items()
    }

    # [[Target|Label]] -> Label, [[Target]] -> Target.  The target part must
    # not run across ']]', otherwise text between two links is swallowed.
    _WIKI_LINK_RE = re.compile(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]')
    _PAREN_RE = re.compile(r'\(.*?\)')
    _TEMPLATE_RE = re.compile(r'\{\{[^}]+\}\}')
    _TAG_RE = re.compile(r'<[^>]+>')
    _SPACE_RE = re.compile(r'\s+')

    # Infobox value: stops at a newline or at a bare '|', but keeps going
    # through pipes that sit inside [[...]] links or {{...}} templates.
    _SPECIALTY_FIELD_RE = re.compile(
        r'\|\s*(?:specialty|field|medical_specialty)\s*=\s*'
        r'((?:\[\[[^\]\n]*\]\]|\{\{[^}\n]*\}\}|[^\n|])+)',
        re.I,
    )

    def __init__(self, cache_file="/content/doctor_cache.json"):
        """Load the cache from *cache_file* and prepare an HTTP session.

        Args:
            cache_file: Path of the JSON file holding cached lookups.
        """
        self.cache_file = cache_file
        self.cache = self._load_cache()
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "DoctorMapper/2.0"})
        self.api_calls = 0

    def _load_cache(self):
        """Return the cached mapping from disk, or ``{}`` if unusable.

        A missing, unreadable, malformed or non-dict cache file is treated
        as an empty cache.
        """
        path = Path(self.cache_file)
        if not path.exists():
            return {}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and invalid UTF-8.
            return {}
        return data if isinstance(data, dict) else {}

    def _save_cache(self):
        """Write the cache to ``self.cache_file`` as JSON.

        The data goes to a sibling ``.tmp`` file first and is then moved
        into place, so an interrupted write cannot leave a truncated cache.
        """
        target = Path(self.cache_file)
        target.parent.mkdir(parents=True, exist_ok=True)
        tmp = target.with_name(target.name + ".tmp")
        with open(tmp, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, indent=2, ensure_ascii=False)
        tmp.replace(target)

    def _clean_text(self, text):
        """Strip wiki markup from *text* and collapse whitespace.

        Removes ``[[link]]`` brackets (keeping the label), parenthesised
        text, ``{{templates}}`` and ``<tags>``.  Falsy or NaN input gives
        an empty string.
        """
        if not text or pd.isna(text):
            return ""
        text = str(text)
        text = self._WIKI_LINK_RE.sub(r'\1', text)
        text = self._PAREN_RE.sub('', text)
        text = self._TEMPLATE_RE.sub('', text)
        text = self._TAG_RE.sub('', text)
        text = self._SPACE_RE.sub(' ', text)
        return text.strip()

    def _standardize_specialty(self, text):
        """Return *text* as a title-cased specialty name, or ``None``.

        ``&`` becomes ``and``; the words ``and``, ``of`` and ``the`` stay
        lower-case.  ``None`` is returned when nothing is left after
        cleaning.
        """
        text = self._clean_text(text)
        if not text:
            return None
        text = text.replace('&', 'and').lower().strip('. ')
        words = [w if w in ('and', 'of', 'the') else w.capitalize() for w in text.split()]
        # An input such as "..." cleans down to nothing; report that as None
        # rather than as an empty specialty name.
        return ' '.join(words) or None

    def _heuristic_specialty(self, disease):
        """Guess a specialty from keywords in the disease name.

        Returns the first specialty whose keyword pattern occurs in the
        lower-cased name, or ``"General Medicine"`` when none does.
        """
        # str() so that non-string values (e.g. numeric codes) do not crash.
        name = str(disease).lower()
        for specialty, pattern in self._SPECIALTY_PATTERNS:
            if pattern.search(name):
                return specialty
        return self._DEFAULT_SPECIALTY

    def _extract_specialty(self, wikitext):
        """Return the standardized specialty named in *wikitext*, or ``None``."""
        match = self._SPECIALTY_FIELD_RE.search(wikitext)
        if not match:
            return None
        specialty = self._standardize_specialty(match.group(1))
        if specialty and "infectious" in specialty.lower():
            specialty = "Infectious Disease"
        return specialty

    def _api_get(self, params):
        """GET the Wikipedia API and return the decoded JSON payload.

        Raises on an HTTP error status, on a non-JSON body, and on a
        payload that carries an ``error`` entry.
        """
        response = self.session.get(self._API_URL, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
        if "error" in payload:
            raise ValueError(f"Wikipedia API error: {payload['error']}")
        return payload

    def _get_wikipedia_specialty(self, disease):
        """Look up the specialty for *disease* on Wikipedia, using the cache.

        Returns the specialty name, or ``None`` when the search has no
        results, the page names no specialty, or the lookup fails.  Failed
        lookups are not cached, so a later run can retry them.
        """
        key = self._clean_text(disease).lower()
        if not key:
            # Names that clean down to nothing would all share the "" key.
            return None
        if key in self.cache:
            return self.cache[key]

        try:
            found = self._api_get(
                {"action": "query", "list": "search", "srsearch": disease, "format": "json", "srlimit": 1}
            )
            results = found.get("query", {}).get("search", [])
            if not results:
                self.cache[key] = None
                return None

            page_title = results[0]["title"]
            sleep(0.3)  # polite rate limiting
            content = self._api_get(
                {"action": "parse", "prop": "wikitext", "page": page_title, "format": "json"}
            )
            wikitext = content.get("parse", {}).get("wikitext", {}).get("*", "")
        except Exception:
            # Best effort: network, HTTP and decoding problems must not abort
            # the run.  They are deliberately NOT cached, because a cached
            # None would be persisted and hide the disease from future runs.
            return None

        specialty = self._extract_specialty(wikitext)
        self.cache[key] = specialty
        self.api_calls += 1
        if self.api_calls % 100 == 0:
            try:
                self._save_cache()
            except OSError:
                # Periodic checkpoint only; the save at the end of
                # map_diseases_to_doctors still reports write failures.
                pass
        return specialty

    def _doctor_title(self, specialty):
        """Return the doctor title for *specialty* (``"General Physician"`` if unknown)."""
        if not specialty:
            return self._DEFAULT_TITLE
        key = str(specialty).replace('&', 'and').lower()
        return self._TITLES_BY_KEY.get(key, self._DEFAULT_TITLE)

    def map_diseases_to_doctors(self, df, disease_col='disease'):
        """Add a ``doctor`` column to *df* and return it.

        Args:
            df: DataFrame containing *disease_col*.  It is modified in place.
            disease_col: Name of the column holding disease names.

        Returns:
            The same DataFrame, with a ``doctor`` column.  Rows whose disease
            is missing or maps to an unknown specialty get
            ``"General Physician"``.
        """
        unique = df[disease_col].dropna().unique()
        total = len(unique)
        mapping = {}
        try:
            for count, disease in enumerate(unique, start=1):
                specialty = self._get_wikipedia_specialty(disease)
                if not specialty:
                    specialty = self._heuristic_specialty(disease)
                mapping[disease] = specialty
                if count % 200 == 0:
                    print(f"Processed {count}/{total} diseases...")
        finally:
            # Persist whatever was fetched even if the loop is interrupted.
            self._save_cache()

        df['doctor'] = df[disease_col].map(lambda d: self._doctor_title(mapping.get(d)))
        return df

# === Paths ===
# Source CSV of symptom/disease rows and the destination for the enriched copy.
input_file = "/content/drive/MyDrive/symptom-disease ds/symptom-disease-final.csv"
output_file = "/content/drive/MyDrive/symptom-disease ds/symptom-disease-doctor-final.csv"

# === Execute ===
# Build the mapper, load the data (rows pandas cannot parse are skipped),
# attach the 'doctor' column, then keep and write only the three columns.
mapper = DoctorMapper()
df = pd.read_csv(input_file, encoding='utf-8', on_bad_lines='skip')
df = mapper.map_diseases_to_doctors(df, disease_col='disease')
df = df.loc[:, ['symptom', 'disease', 'doctor']]
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"✅ Done! Clean file saved to: {output_file}")


Processed 200/8688 diseases...
Processed 400/8688 diseases...
Processed 600/8688 diseases...
Processed 800/8688 diseases...
Processed 1000/8688 diseases...
Processed 1200/8688 diseases...
Processed 1400/8688 diseases...
Processed 1600/8688 diseases...
Processed 1800/8688 diseases...
Processed 2000/8688 diseases...
Processed 2200/8688 diseases...
Processed 2400/8688 diseases...
Processed 2600/8688 diseases...
Processed 2800/8688 diseases...
Processed 3000/8688 diseases...
Processed 3200/8688 diseases...
Processed 3400/8688 diseases...
Processed 3600/8688 diseases...
Processed 3800/8688 diseases...
Processed 4000/8688 diseases...
Processed 4200/8688 diseases...
Processed 4400/8688 diseases...
Processed 4600/8688 diseases...
Processed 4800/8688 diseases...
Processed 5000/8688 diseases...
Processed 5200/8688 diseases...
Processed 5400/8688 diseases...
Processed 5600/8688 diseases...
Processed 5800/8688 diseases...
Processed 6000/8688 diseases...
Processed 6200/8688 diseases...
Processed 64