In [2]:
import pandas as pd
import requests
import logging
import re
from collections import defaultdict
import time

logging.basicConfig(level=logging.WARNING, format='%(message)s')
logger = logging.getLogger(__name__)


class EnhancedDiseaseScraper:
    """
    Multi-source disease scraper:
    - DOID (diseases + hierarchy)
    - HPO + HPOA (symptoms)
    - Orphanet (rare diseases)
    - OMIM (genetic diseases + symptoms)
    - DisGeNET (gene-disease associations + phenotypes)
    """

    def __init__(self, min_category_size=15):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        self.diseases = {}
        self.disease_symptoms = defaultdict(set)
        self.min_category_size = min_category_size

    # ==================== DOID ====================

    def load_disease_ontology(self):
        """Load DOID"""
        urls = [
            "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo",
            "https://raw.githubusercontent.com/teamUBUNTU/HumanDiseaseOntology/main/src/ontology/doid.obo"
        ]

        for url in urls:
            try:
                response = self.session.get(url, timeout=120)
                if response.status_code == 200:
                    self._parse_doid(response.text)
                    return True
            except:
                continue
        return False

    def _parse_doid(self, content):
        """Parse DOID OBO format"""
        current = {}

        for line in content.split('\n'):
            line = line.strip()

            if line == '[Term]':
                if self._is_valid_disease(current):
                    self.diseases[current['id']] = {
                        'name': current['name'],
                        'parents': current.get('parents', []),
                        'children': [],
                        'synonyms': current.get('synonyms', []),
                        'definition': current.get('definition', ''),
                        'xrefs': current.get('xrefs', [])
                    }
                current = {'parents': [], 'synonyms': [], 'xrefs': [], 'obsolete': False}

            elif line.startswith('id: '):
                current['id'] = line[4:].strip()
            elif line.startswith('name: '):
                current['name'] = line[6:].strip()
            elif line.startswith('def: '):
                match = re.search(r'"([^"]+)"', line)
                if match:
                    current['definition'] = match.group(1)
            elif line.startswith('is_a: '):
                current['parents'].append(line[6:].split('!')[0].strip())
            elif line.startswith('synonym: '):
                match = re.search(r'"([^"]+)"', line)
                if match:
                    current['synonyms'].append(match.group(1))
            elif line.startswith('xref: '):
                current['xrefs'].append(line[6:].strip())
            elif line.startswith('is_obsolete: true'):
                current['obsolete'] = True

        if self._is_valid_disease(current):
            self.diseases[current['id']] = {
                'name': current['name'],
                'parents': current.get('parents', []),
                'children': [],
                'synonyms': current.get('synonyms', []),
                'definition': current.get('definition', ''),
                'xrefs': current.get('xrefs', [])
            }

        # Build hierarchy
        for did, data in self.diseases.items():
            for parent in data['parents']:
                if parent in self.diseases:
                    self.diseases[parent]['children'].append(did)

    def _is_valid_disease(self, current):
        return (
            current.get('id') and
            current.get('name') and
            not current.get('obsolete') and
            'DOID' in current.get('id', '')
        )

    # ==================== HPO + HPOA ====================

    def load_hpo_symptoms(self):
        """Load HPO ontology and annotations"""
        print("  Loading HPO...")

        # Load HPO names
        hpo_url = "https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo"
        try:
            response = self.session.get(hpo_url, timeout=120)
            if response.status_code == 200:
                hpo_names = self._parse_hpo_names(response.text)
            else:
                return False
        except:
            return False

        # Load HPOA annotations
        hpoa_url = "http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa"
        try:
            response = self.session.get(hpoa_url, timeout=120)
            if response.status_code == 200:
                self._parse_hpoa(response.text, hpo_names)
                return True
        except:
            pass

        return False

    def _parse_hpo_names(self, content):
        """Parse HPO OBO for term names"""
        hpo_names = {}
        current_id = None
        current_name = None

        for line in content.split('\n'):
            line = line.strip()
            if line == '[Term]':
                if current_id and current_name:
                    hpo_names[current_id] = current_name
                current_id = None
                current_name = None
            elif line.startswith('id: HP:'):
                current_id = line[4:].strip()
            elif line.startswith('name: '):
                current_name = line[6:].strip()

        if current_id and current_name:
            hpo_names[current_id] = current_name

        return hpo_names

    def _parse_hpoa(self, content, hpo_names):
        """Parse HPOA annotations"""
        for line in content.split('\n'):
            if line.startswith('#') or not line.strip():
                continue

            parts = line.split('\t')
            if len(parts) >= 4:
                disease_name = parts[1].strip().lower()
                hpo_id = parts[3].strip()

                if hpo_id in hpo_names:
                    symptom = hpo_names[hpo_id]
                    if self._is_valid_symptom(symptom):
                        self.disease_symptoms[disease_name].add(symptom)

    # ==================== ORPHANET ====================

    def load_orphanet(self):
        """Load Orphanet rare diseases"""
        print("  Loading Orphanet...")

        try:
            url = "http://www.orphadata.org/data/xml/en_product1.xml"
            response = self.session.get(url, timeout=120)

            if response.status_code != 200:
                return False

            import xml.etree.ElementTree as ET
            root = ET.fromstring(response.content)

            for disorder in root.findall('.//Disorder'):
                name_elem = disorder.find('.//Name')
                if name_elem is not None:
                    disease_name = name_elem.text.lower()

                    for sign in disorder.findall('.//HPODisorderAssociation'):
                        hpo_term = sign.find('.//HPOTerm')
                        if hpo_term is not None and hpo_term.text:
                            symptom = hpo_term.text
                            if self._is_valid_symptom(symptom):
                                self.disease_symptoms[disease_name].add(symptom)

            return True
        except:
            return False

    # ==================== OMIM ====================

    def load_omim_via_hpo(self):
        """Load OMIM diseases via HPO cross-references"""
        print("  Loading OMIM mappings...")

        omim_count = 0
        for did, data in self.diseases.items():
            for xref in data.get('xrefs', []):
                if xref.startswith('OMIM:'):
                    # OMIM diseases often have genetic symptoms
                    # Extract from definition if available
                    definition = data.get('definition', '')
                    if definition:
                        symptoms = self._extract_symptoms_from_text(definition)
                        if symptoms:
                            name = data['name'].lower()
                            self.disease_symptoms[name].update(symptoms)
                            omim_count += 1

        print(f"    Added symptoms for {omim_count} OMIM diseases")
        return True

    # ==================== DISGENET ====================

    def load_disgenet(self):
        """Load DisGeNET disease-gene-symptom associations"""
        print("  Loading DisGeNET...")

        try:
            # DisGeNET disease mappings
            url = "https://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz"

            response = self.session.get(url, timeout=180, stream=True)

            if response.status_code != 200:
                print("    DisGeNET not available (requires authentication)")
                return False

            # Note: DisGeNET requires registration for downloads
            # Using alternative approach via disease descriptions

        except:
            pass

        return False

    # ==================== SYMPTOM EXTRACTION ====================

    def extract_definition_symptoms(self):
        """Extract symptoms from disease definitions"""
        print("  Extracting from definitions...")

        patterns = [
            r'symptoms?\s+(?:may\s+)?include\s+([^.;]+)',
            r'presents?\s+with\s+([^.;]+)',
            r'characterized\s+by\s+([^.;]+)',
            r'manifests?\s+(?:with|as)\s+([^.;]+)',
            r'features?\s+include\s+([^.;]+)',
            r'clinical\s+features?\s+(?:include|are)\s+([^.;]+)',
            r'signs?\s+(?:and\s+symptoms?\s+)?include\s+([^.;]+)',
        ]

        for did, data in self.diseases.items():
            definition = data.get('definition', '')
            if not definition:
                continue

            symptoms = self._extract_symptoms_from_text(definition)
            if symptoms:
                name = data['name'].lower()
                self.disease_symptoms[name].update(symptoms)

    def _extract_symptoms_from_text(self, text):
        """Extract symptoms from any text"""
        patterns = [
            r'symptoms?\s+(?:may\s+)?include\s+([^.;]+)',
            r'presents?\s+with\s+([^.;]+)',
            r'characterized\s+by\s+([^.;]+)',
            r'manifests?\s+(?:with|as)\s+([^.;]+)',
            r'features?\s+include\s+([^.;]+)',
        ]

        symptoms = set()

        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                parts = re.split(r'[,;]\s*|\s+and\s+', match)
                for part in parts[:10]:
                    symptom = self._clean_symptom(part.strip())
                    if symptom and self._is_valid_extracted_symptom(symptom):
                        symptoms.add(symptom)

        return symptoms

    def _is_valid_symptom(self, symptom):
        """Validate symptom (relaxed)"""
        if not symptom or len(symptom) < 3:
            return False

        reject = ['syndrome', 'disorder type', 'malformation of', 'deficiency disease']
        return not any(term in symptom.lower() for term in reject)

    def _is_valid_extracted_symptom(self, symptom):
        """Validate extracted symptom"""
        if not symptom or len(symptom) < 4 or len(symptom) > 120:
            return False

        reject_starts = ['which', 'that is', 'it is', 'also known', 'refers to', 'is a type']
        return not any(symptom.lower().startswith(start) for start in reject_starts)

    def _clean_symptom(self, symptom):
        """Clean symptom text"""
        symptom = re.sub(r'^(the|a|an)\s+', '', symptom, flags=re.IGNORECASE)
        symptom = re.sub(r'\s+(and|or|with)$', '', symptom, flags=re.IGNORECASE)
        symptom = ' '.join(symptom.split())

        if symptom and len(symptom) > 0:
            symptom = symptom[0].upper() + symptom[1:]

        return symptom.strip()

    # ==================== ENHANCED CATEGORY DETECTION ====================

    def find_categories(self):
        """Enhanced category detection to match MalaCards"""
        print("  Finding categories...")

        categories = {}

        # Detect all category candidates
        for did, data in self.diseases.items():
            child_count = len(data['children'])
            parent_count = len(data['parents'])

            if child_count >= self.min_category_size and parent_count <= 3:
                if data['name'].lower() != 'disease':
                    total_descendants = self._count_descendants(did)

                    categories[did] = {
                        'name': data['name'],
                        'clean_name': self._clean_category_name(data['name']),
                        'type': self._classify_category(data['name']),
                        'count': total_descendants,
                        'id': did
                    }

        # Sort by count
        categories = dict(sorted(categories.items(), key=lambda x: x[1]['count'], reverse=True))

        return categories

    def _classify_category(self, name):
        """Classify as Global or Anatomical"""
        name_lower = name.lower()

        # Global indicators
        global_keywords = [
            'cancer', 'carcinoma', 'tumor', 'neoplasm', 'malignant',
            'genetic', 'inherited', 'hereditary', 'congenital',
            'infectious', 'bacterial', 'viral', 'fungal', 'parasitic',
            'metabolic', 'syndrome', 'rare', 'fetal', 'pregnancy',
            'autoimmune', 'chromosomal', 'developmental'
        ]

        # Anatomical indicators
        anatomical_keywords = [
            'system', 'organ', 'blood', 'bone', 'muscle', 'skin',
            'cardiovascular', 'respiratory', 'digestive', 'gastrointestinal',
            'nervous', 'neurological', 'brain', 'mental',
            'endocrine', 'immune', 'reproductive', 'urinary', 'kidney',
            'liver', 'eye', 'ear', 'oral', 'dental', 'sensory'
        ]

        is_global = any(kw in name_lower for kw in global_keywords)
        is_anatomical = any(kw in name_lower for kw in anatomical_keywords)

        if is_global:
            return 'Global'
        elif is_anatomical:
            return 'Anatomical'
        else:
            # Default: use parent count heuristic
            return 'Anatomical'

    def _clean_category_name(self, name):
        """Clean category name"""
        name = name.lower()

        # Remove disease-related suffixes
        name = re.sub(r'\s+disease$', ' diseases', name)
        name = re.sub(r'^disease of\s+', '', name)
        name = re.sub(r'\s+system disease', ' diseases', name)

        # Special mappings
        mappings = {
            'hematopoietic': 'Blood',
            'integumentary': 'Skin',
            'urinary system': 'Nephrological',
            'nervous system': 'Neuronal',
            'musculoskeletal system': 'Bone',
        }

        for old, new in mappings.items():
            if old in name:
                name = name.replace(old, new)

        # Capitalize
        name = ' '.join(word.capitalize() for word in name.split())

        if not name.endswith(('diseases', 'Diseases')):
            name += ' diseases'

        return name

    def _count_descendants(self, node_id, visited=None):
        """Count all descendants recursively"""
        if visited is None:
            visited = set()
        if node_id in visited or node_id not in self.diseases:
            return 0

        visited.add(node_id)
        count = 1

        for child in self.diseases[node_id]['children']:
            count += self._count_descendants(child, visited)

        return count

    # ==================== DISEASE EXTRACTION ====================

    def get_descendants(self, node_id, visited=None):
        """Get all disease descendants"""
        if visited is None:
            visited = set()
        if node_id in visited or node_id not in self.diseases:
            return []

        visited.add(node_id)
        results = [{
            'id': node_id,
            'name': self.diseases[node_id]['name'],
            'synonyms': self.diseases[node_id].get('synonyms', [])
        }]

        for child_id in self.diseases[node_id]['children']:
            results.extend(self.get_descendants(child_id, visited))

        return results

    def extract_diseases(self, categories):
        """Extract all diseases by category"""
        print("  Extracting diseases...")

        all_diseases = []
        seen = set()

        for cat_id, cat_info in categories.items():
            descendants = self.get_descendants(cat_id)

            for disease in descendants:
                key = disease['name'].lower()
                if key not in seen:
                    seen.add(key)
                    all_diseases.append({
                        'disease_name': disease['name'],
                        'category': cat_info['clean_name'],
                        'category_type': cat_info['type'],
                        'synonyms': disease['synonyms']
                    })

        return all_diseases

    # ==================== SYMPTOM MATCHING ====================

    def match_symptoms(self, diseases):
        """Match symptoms to diseases with multiple strategies"""
        print("  Matching symptoms...")

        matched = 0

        for disease in diseases:
            name = disease['disease_name'].lower()
            synonyms = [s.lower() for s in disease.get('synonyms', [])]

            symptoms = set()

            # Strategy 1: Exact match
            if name in self.disease_symptoms:
                symptoms.update(self.disease_symptoms[name])

            # Strategy 2: Synonym match
            for syn in synonyms:
                if syn in self.disease_symptoms:
                    symptoms.update(self.disease_symptoms[syn])

            # Strategy 3: Partial match (substring)
            if not symptoms:
                for known_disease, known_symptoms in self.disease_symptoms.items():
                    if name in known_disease or known_disease in name:
                        symptoms.update(known_symptoms)
                        break

            # Strategy 4: Word overlap (for compound names)
            if not symptoms:
                disease_words = set(name.split())
                disease_words -= {'disease', 'syndrome', 'disorder', 'of', 'the', 'and', 'type', 'with'}

                if len(disease_words) >= 2:
                    best_match = None
                    best_score = 0

                    for known_disease, known_symptoms in self.disease_symptoms.items():
                        known_words = set(known_disease.split())
                        overlap = len(disease_words & known_words)

                        if overlap > best_score and overlap >= 2:
                            best_score = overlap
                            best_match = known_symptoms

                    if best_match:
                        symptoms.update(best_match)

            if symptoms:
                matched += 1

            clean = self._clean_final_symptoms(symptoms)
            disease['symptoms'] = '; '.join(sorted(clean))

        print(f"    Matched {matched}/{len(diseases)} ({100*matched/len(diseases):.1f}%)")
        return diseases

    def _clean_final_symptoms(self, symptoms):
        """Final symptom cleaning"""
        clean = []

        for s in symptoms:
            if not s or len(s) < 3 or len(s) > 100:
                continue

            # Skip IDs and URLs
            skip = ['HP:', 'OMIM:', 'DOID:', 'http:', 'www.']
            if any(s.upper().startswith(prefix.upper()) for prefix in skip):
                continue

            # Skip if too many numbers
            if sum(c.isdigit() for c in s) > len(s) * 0.4:
                continue

            clean.append(s)

        return list(set(clean))

    # ==================== MAIN ====================

    def run(self):
        """Main execution"""
        print("=" * 70)
        print("ENHANCED DISEASE SCRAPER (Multi-Source)")
        print("=" * 70)
        print("\nLoading data sources...")

        # Load DOID
        print("  Loading DOID...")
        if not self.load_disease_ontology():
            print("Error: Could not load DOID")
            return None
        print(f"    Loaded {len(self.diseases)} diseases")

        # Load HPO + HPOA
        self.load_hpo_symptoms()

        # Load Orphanet
        self.load_orphanet()

        # Load OMIM mappings
        self.load_omim_via_hpo()

        # Extract from definitions
        self.extract_definition_symptoms()

        print(f"\n  Total symptom database: {len(self.disease_symptoms)} diseases")

        # Find categories
        categories = self.find_categories()
        print(f"  Found {len(categories)} categories")

        # Extract diseases
        diseases = self.extract_diseases(categories)

        # Match symptoms
        diseases = self.match_symptoms(diseases)

        # Create DataFrame
        df = pd.DataFrame(diseases)
        df = df[['disease_name', 'category', 'category_type', 'symptoms']]
        df = df.sort_values(['category_type', 'category', 'disease_name']).reset_index(drop=True)

        # Save
        output = 'diseases_enhanced.csv'
        df.to_csv(output, index=False)

        # Summary
        total = len(df)
        with_symptoms = (df['symptoms'] != '').sum()

        print("\n" + "=" * 70)
        print("SUMMARY")
        print("=" * 70)
        print(f"Total diseases: {total:,}")
        print(f"Categories: {df['category'].nunique()}")
        print(f"  • Global: {len(df[df['category_type']=='Global']['category'].unique())}")
        print(f"  • Anatomical: {len(df[df['category_type']=='Anatomical']['category'].unique())}")
        print(f"With symptoms: {with_symptoms:,} ({100*with_symptoms/total:.1f}%)")
        print(f"Without symptoms: {total-with_symptoms:,} ({100*(total-with_symptoms)/total:.1f}%)")

        print("\n" + "-" * 40)
        print("CATEGORIES:")
        print("-" * 40)
        print(df.groupby(['category_type', 'category']).size().to_string())

        print(f"\n{output} saved!")
        print("=" * 70)

        return df


def main():
    scraper = EnhancedDiseaseScraper(min_category_size=15)
    df = scraper.run()
    return df


if __name__ == "__main__":
    df = main()

ENHANCED DISEASE SCRAPER (Multi-Source)

Loading data sources...
  Loading DOID...
    Loaded 12011 diseases
  Loading HPO...
  Loading Orphanet...
  Loading OMIM mappings...
    Added symptoms for 0 OMIM diseases
  Extracting from definitions...

  Total symptom database: 15681 diseases
  Finding categories...
  Found 165 categories
  Extracting diseases...
  Matching symptoms...
    Matched 9479/11320 (83.7%)

SUMMARY
Total diseases: 11,320
Categories: 36
  • Global: 13
  • Anatomical: 23
With symptoms: 9,332 (82.4%)
Without symptoms: 1,988 (17.6%)

----------------------------------------
CATEGORIES:
----------------------------------------
category_type  category                                   
Anatomical     Agnosia diseases                                 32
               Allergic Diseases                               125
               Blood Coagulation Diseases                       34
               Connective Tissue Diseases                      405
               Digeni