In [5]:
import pandas as pd
from collections import defaultdict

# Read the whole disorder table; low_memory=False is passed through to pandas' CSV reader
df = pd.read_csv('ns_disorders_map.csv', low_memory=False)

# Keep the ID column (as a one-column frame) apart from every other column
eid = df.loc[:, ['eid']]
feature_cols = df.drop(columns='eid')

# === Two-level mapping: disease column name -> (main clinical group, subgroup label) ===
# Keys are matched against the base names of the input columns.
# Several diseases may share one subgroup label (e.g. the four meningitis entries),
# in which case their columns are pooled into a single subgroup flag.
# Only the subgroup label (second tuple element) is read by the grouping loop; the
# main group actually applied to a column comes from FULL_CLINICAL_GROUPS.
# Subgroup labels and main-group names end up as columns of the same output table,
# so a subgroup label must never repeat a main-group name.
DISEASE_SUBGROUPS = {
    # === CNS Infection & Inflammation ===
    "Bacterial meningitis, not elsewhere classified": ("CNS_infection_inflammation", "Meningitis"),
    "Meningitis in bacterial diseases classified elsewhere": ("CNS_infection_inflammation", "Meningitis"),
    "Meningitis in other infectious and parasitic diseases classified elsewhere": ("CNS_infection_inflammation", "Meningitis"),
    "Meningitis due to other and unspecified causes": ("CNS_infection_inflammation", "Meningitis"),
    "Encephalitis, myelitis and encephalomyelitis": ("CNS_infection_inflammation", "Encephalitis"),
    "Encephalitis, myelitis and encephalomyelitis in diseases classified elsewhere": ("CNS_infection_inflammation", "Encephalitis"),
    "Intracranial and intraspinal abscess and granuloma": ("CNS_infection_inflammation", "CNS_abscess"),
    "Intracranial and intraspinal abscess and granuloma in diseases classified elsewhere": ("CNS_infection_inflammation", "CNS_abscess"),
    "Intracranial and intraspinal phlebitis and thrombophlebitis": ("CNS_infection_inflammation", "CNS_thrombophlebitis"),
    "Sequelae of inflammatory diseases of central nervous system": ("CNS_infection_inflammation", "CNS_infection_sequelae"),
    
    # === Neurodegenerative Disorders ===
    "Alzheimer's disease": ("Neurodegenerative_disorder", "Alzheimer"),
    "Parkinson's disease": ("Neurodegenerative_disorder", "Parkinson"),
    "Huntington's disease": ("Neurodegenerative_disorder", "Huntington"),
    "Hereditary ataxia": ("Neurodegenerative_disorder", "Ataxia"),
    "Spinal muscular atrophy and related syndromes": ("Neurodegenerative_disorder", "SMA"),
    "Systemic atrophies primarily affecting central nervous system in diseases classified elsewhere": ("Neurodegenerative_disorder", "Neurodegenerative_other"),
    "Postpolio syndrome": ("Neurodegenerative_disorder", "Postpolio"),
    "Secondary parkinsonism": ("Neurodegenerative_disorder", "Parkinsonism"),
    "Parkinsonism in diseases classified elsewhere": ("Neurodegenerative_disorder", "Parkinsonism"),
    "Other degenerative diseases of basal ganglia": ("Neurodegenerative_disorder", "Basal_ganglia_degen"),
    "Dystonia": ("Neurodegenerative_disorder", "Dystonia"),
    "Other extrapyramidal and movement disorders": ("Neurodegenerative_disorder", "Movement_disorder"),
    "Other degenerative diseases of nervous system, not elsewhere classified": ("Neurodegenerative_disorder", "Neurodegenerative_other"),
    "Other degenerative disorders of nervous system in diseases classified elsewhere": ("Neurodegenerative_disorder", "Neurodegenerative_other"),
    
    # === Demyelinating Diseases ===
    "Multiple sclerosis": ("Demyelinating_disease", "MS"),
    "Other acute disseminated demyelination": ("Demyelinating_disease", "ADEM"),
    "Other demyelinating diseases of central nervous system": ("Demyelinating_disease", "Demyelinating_other"),
    
    # === Epilepsy ===
    "Epilepsy": ("Epilepsy", "Epilepsy_dx"),  # label is "Epilepsy_dx", not "Epilepsy", so it cannot clash with the main group's column
    "Status epilepticus": ("Epilepsy", "Status_epilepticus"),
    
    # === Headache Disorders ===
    "Migraine": ("Headache_disorder", "Migraine"),
    "Other headache syndromes": ("Headache_disorder", "Other_headache"),
    
    # === Cerebrovascular Disease ===
    "Transient cerebral ischaemic attacks and related syndromes": ("Cerebrovascular_disease", "TIA"),
    "Vascular syndromes of brain in cerebrovascular diseases": ("Cerebrovascular_disease", "Stroke_syndrome"),
    
    # === Peripheral Nervous System ===
    # NOTE(review): some labels here are narrower than the disease name they stand for
    # (e.g. "Inflammatory polyneuropathy" -> "Guillain_Barre", "Disorders of trigeminal
    # nerve" -> "Trigeminal_neuralgia") — confirm the labels are meant as shorthand.
    "Sleep disorders": ("Peripheral_nervous_system_disorder", "Sleep_disorder"),
    "Disorders of trigeminal nerve": ("Peripheral_nervous_system_disorder", "Trigeminal_neuralgia"),
    "Facial nerve disorders": ("Peripheral_nervous_system_disorder", "Facial_palsy"),
    "Disorders of other cranial nerves": ("Peripheral_nervous_system_disorder", "Cranial_neuropathy"),
    "Nerve root and plexus disorders": ("Peripheral_nervous_system_disorder", "Radiculopathy"),
    "Mononeuropathies of upper limb": ("Peripheral_nervous_system_disorder", "Upper_mono"),
    "Mononeuropathies of lower limb": ("Peripheral_nervous_system_disorder", "Lower_mono"),
    "Hereditary and idiopathic neuropathy": ("Peripheral_nervous_system_disorder", "Hereditary_neuropathy"),
    "Inflammatory polyneuropathy": ("Peripheral_nervous_system_disorder", "Guillain_Barre"),
    "Other polyneuropathies": ("Peripheral_nervous_system_disorder", "Polyneuropathy"),
    "Myasthenia gravis and other myoneural disorders": ("Peripheral_nervous_system_disorder", "Myasthenia_gravis"),
    "Primary disorders of muscles": ("Peripheral_nervous_system_disorder", "Muscle_disorder"),
    
    # === Paralytic & Cerebral Palsy ===
    "Infantile cerebral palsy": ("Paralytic_or_cerebral_palsy_syndrome", "Cerebral_palsy"),
    "Hemiplegia": ("Paralytic_or_cerebral_palsy_syndrome", "Hemiplegia"),
    "Paraplegia and tetraplegia": ("Paralytic_or_cerebral_palsy_syndrome", "Paraplegia"),
    "Other paralytic syndromes": ("Paralytic_or_cerebral_palsy_syndrome", "Paralytic_other"),
    
    # === Other CNS Disorders ===
    "Hydrocephalus": ("Other_CNS_disorder", "Hydrocephalus"),
    "Toxic encephalopathy": ("Other_CNS_disorder", "Toxic_encephalopathy"),
    "Disorders of autonomic nervous system": ("Other_CNS_disorder", "Autonomic_disorder"),
    "Other disorders of brain": ("Other_CNS_disorder", "Brain_disorder"),
    "Other diseases of spinal cord": ("Other_CNS_disorder", "Spinal_cord_disorder"),
    "Other disorders of central nervous system": ("Other_CNS_disorder", "CNS_other"),
}

# === Full clinical group mapping: disease column name -> main clinical group ===
# Every input column whose base name appears here counts toward that group's flag;
# columns whose base name is missing from this dict are reported and skipped.
# The keys are a superset of the DISEASE_SUBGROUPS keys: the extra entries (mostly
# "Other ..." and "... in diseases classified elsewhere" categories) contribute to
# their main group only and get no subgroup column.
FULL_CLINICAL_GROUPS = {
    # CNS Infection & Inflammation
    "Bacterial meningitis, not elsewhere classified": "CNS_infection_inflammation",
    "Meningitis in bacterial diseases classified elsewhere": "CNS_infection_inflammation",
    "Meningitis in other infectious and parasitic diseases classified elsewhere": "CNS_infection_inflammation",
    "Meningitis due to other and unspecified causes": "CNS_infection_inflammation",
    "Encephalitis, myelitis and encephalomyelitis": "CNS_infection_inflammation",
    "Encephalitis, myelitis and encephalomyelitis in diseases classified elsewhere": "CNS_infection_inflammation",
    "Intracranial and intraspinal abscess and granuloma": "CNS_infection_inflammation",
    "Intracranial and intraspinal abscess and granuloma in diseases classified elsewhere": "CNS_infection_inflammation",
    "Intracranial and intraspinal phlebitis and thrombophlebitis": "CNS_infection_inflammation",
    "Sequelae of inflammatory diseases of central nervous system": "CNS_infection_inflammation",
    
    # Neurodegenerative Disorders
    "Huntington's disease": "Neurodegenerative_disorder",
    "Hereditary ataxia": "Neurodegenerative_disorder",
    "Spinal muscular atrophy and related syndromes": "Neurodegenerative_disorder",
    "Systemic atrophies primarily affecting central nervous system in diseases classified elsewhere": "Neurodegenerative_disorder",
    "Postpolio syndrome": "Neurodegenerative_disorder",
    "Parkinson's disease": "Neurodegenerative_disorder",
    "Secondary parkinsonism": "Neurodegenerative_disorder",
    "Parkinsonism in diseases classified elsewhere": "Neurodegenerative_disorder",
    "Other degenerative diseases of basal ganglia": "Neurodegenerative_disorder",
    "Dystonia": "Neurodegenerative_disorder",
    "Other extrapyramidal and movement disorders": "Neurodegenerative_disorder",
    "Alzheimer's disease": "Neurodegenerative_disorder",
    "Other degenerative diseases of nervous system, not elsewhere classified": "Neurodegenerative_disorder",
    "Other degenerative disorders of nervous system in diseases classified elsewhere": "Neurodegenerative_disorder",
    
    # Demyelinating Diseases
    "Multiple sclerosis": "Demyelinating_disease",
    "Other acute disseminated demyelination": "Demyelinating_disease",
    "Other demyelinating diseases of central nervous system": "Demyelinating_disease",
    
    # Epilepsy
    "Epilepsy": "Epilepsy",
    "Status epilepticus": "Epilepsy",
    
    # Headache Disorders
    "Migraine": "Headache_disorder",
    "Other headache syndromes": "Headache_disorder",
    
    # Cerebrovascular Disease
    "Transient cerebral ischaemic attacks and related syndromes": "Cerebrovascular_disease",
    "Vascular syndromes of brain in cerebrovascular diseases": "Cerebrovascular_disease",
    
    # Peripheral Nervous System Disorders
    # NOTE(review): "Sleep disorders" and the muscle / myoneural-junction entries are
    # counted under the peripheral nervous system group — confirm this is intended.
    "Sleep disorders": "Peripheral_nervous_system_disorder",
    "Disorders of trigeminal nerve": "Peripheral_nervous_system_disorder",
    "Facial nerve disorders": "Peripheral_nervous_system_disorder",
    "Disorders of other cranial nerves": "Peripheral_nervous_system_disorder",
    "Cranial nerve disorders in diseases classified elsewhere": "Peripheral_nervous_system_disorder",
    "Nerve root and plexus disorders": "Peripheral_nervous_system_disorder",
    "Nerve root and plexus compressions in diseases classified elsewhere": "Peripheral_nervous_system_disorder",
    "Mononeuropathies of upper limb": "Peripheral_nervous_system_disorder",
    "Mononeuropathies of lower limb": "Peripheral_nervous_system_disorder",
    "Other mononeuropathies": "Peripheral_nervous_system_disorder",
    "Mononeuropathy in diseases classified elsewhere": "Peripheral_nervous_system_disorder",
    "Hereditary and idiopathic neuropathy": "Peripheral_nervous_system_disorder",
    "Inflammatory polyneuropathy": "Peripheral_nervous_system_disorder",
    "Other polyneuropathies": "Peripheral_nervous_system_disorder",
    "Polyneuropathy in diseases classified elsewhere": "Peripheral_nervous_system_disorder",
    "Other disorders of peripheral nervous system": "Peripheral_nervous_system_disorder",
    "Myasthenia gravis and other myoneural disorders": "Peripheral_nervous_system_disorder",
    "Primary disorders of muscles": "Peripheral_nervous_system_disorder",
    "Other myopathies": "Peripheral_nervous_system_disorder",
    "Disorders of myoneural junction and muscle in diseases classified elsewhere": "Peripheral_nervous_system_disorder",
    
    # Paralytic & Cerebral Palsy Syndromes
    "Infantile cerebral palsy": "Paralytic_or_cerebral_palsy_syndrome",
    "Hemiplegia": "Paralytic_or_cerebral_palsy_syndrome",
    "Paraplegia and tetraplegia": "Paralytic_or_cerebral_palsy_syndrome",
    "Other paralytic syndromes": "Paralytic_or_cerebral_palsy_syndrome",
    
    # Other CNS Disorders
    "Disorders of autonomic nervous system": "Other_CNS_disorder",
    "Hydrocephalus": "Other_CNS_disorder",
    "Toxic encephalopathy": "Other_CNS_disorder",
    "Other disorders of brain": "Other_CNS_disorder",
    "Other disorders of brain in diseases classified elsewhere": "Other_CNS_disorder",
    "Other diseases of spinal cord": "Other_CNS_disorder",
    "Other disorders of central nervous system": "Other_CNS_disorder",
    "Postprocedural disorders of nervous system, not elsewhere classified": "Other_CNS_disorder",
    "Other disorders of nervous system, not elsewhere classified": "Other_CNS_disorder",
    "Other disorders of nervous system in diseases classified elsewhere": "Other_CNS_disorder",
}

# Helper: recover the disease name from a column header such as "Epilepsy.1"
def get_base_name(col):
    """Return `col` without a trailing ".<digits>" suffix.

    Only the text after the LAST dot is inspected; if it consists solely of
    digits, that suffix and the dot are removed. Any other name (no dot, or a
    non-numeric / empty tail) is returned unchanged.
    """
    stem, dot, tail = col.rpartition('.')
    if not dot:
        # No dot at all, so there is no suffix to strip
        return col
    return stem if tail.isdigit() else col

# === Assign each feature column to its clinical group and (optional) subgroup ===
clinical_group_cols = defaultdict(list)   # main group -> contributing input columns
subgroup_cols = {}                        # subgroup label -> contributing input columns

for col in feature_cols.columns:
    base = get_base_name(col)
    group = FULL_CLINICAL_GROUPS.get(base)

    # Columns without a main group are reported and ignored entirely
    if group is None:
        print(f"⚠️ Warning: '{base}' not in any clinical group — skipping")
        continue

    clinical_group_cols[group].append(col)

    # A subgroup is recorded only for diseases listed in the two-level mapping
    two_level = DISEASE_SUBGROUPS.get(base)
    if two_level is not None:
        subgroup_cols.setdefault(two_level[1], []).append(col)

# === Build phenotype dictionary ===
# Group flags, subgroup flags, 'eid' and the later 'healthy' flag all become columns
# of one table. A repeated name would make one flag silently overwrite another
# (that is why the mapping uses "Epilepsy_dx" rather than "Epilepsy"), so fail
# loudly instead of producing a corrupted table.
column_names = ['eid', 'healthy', *clinical_group_cols, *subgroup_cols]
name_clashes = sorted({name for name in column_names if column_names.count(name) > 1})
if name_clashes:
    raise ValueError(
        f"Output column names are used more than once: {name_clashes}; "
        "rename the clashing subgroup label(s)"
    )

pheno = {'eid': eid['eid'].values}

# One 0/1 column per main clinical group, then one per subgroup label:
# 1 when at least one contributing input column is non-null in that row.
for name, cols in [*clinical_group_cols.items(), *subgroup_cols.items()]:
    pheno[name] = feature_cols[cols].notna().any(axis=1).astype(int)

# Create DataFrame
pheno_df = pd.DataFrame(pheno)

# === Add 'healthy' flag ===
# Fixed list of main clinical groups; it also defines their order in the output
clinical_groups = [
    'CNS_infection_inflammation',
    'Neurodegenerative_disorder',
    'Demyelinating_disease',
    'Epilepsy',
    'Headache_disorder',
    'Cerebrovascular_disease',
    'Peripheral_nervous_system_disorder',
    'Paralytic_or_cerebral_palsy_syndrome',
    'Other_CNS_disorder'
]

# A group that received no input column still gets an all-zero column
absent_groups = [name for name in clinical_groups if name not in pheno_df.columns]
for group in absent_groups:
    pheno_df[group] = 0

# Healthy means every clinical-group flag in the row is zero
pheno_df['healthy'] = (pheno_df[clinical_groups] == 0).all(axis=1)

# === Final column order ===
# eid, then the main groups in their fixed order, then subgroup labels A-Z, then healthy
subgroup_names = sorted(subgroup_cols)
final_columns = ['eid', *clinical_groups, *subgroup_names, 'healthy']
pheno_df = pheno_df[final_columns]

# Write the table without the row index
pheno_df.to_csv("ns_phenotypes.csv", index=False)

# === Print summary ===
print(f"✅ Final shape: {pheno_df.shape}")

print("\n📊 Prevalence per clinical group:")
for group in clinical_groups:
    # The mean of a 0/1 column is the fraction of rows flagged; scale it to a percentage
    pct = 100 * pheno_df[group].mean()
    print(f"  {group}: {pct:.2f}%")

print(f"\n🟢 Healthy (no condition): {pheno_df['healthy'].mean() * 100:.2f}%")

# Show the 15 most prevalent subgroups (skipped when no subgroup was found)
print("\n🔍 Top subgroups (prevalence):")
subgroup_list = sorted(subgroup_cols)
if subgroup_list:
    subgroup_fraction = pheno_df[subgroup_list].mean()
    top_sub = subgroup_fraction.sort_values(ascending=False).head(15) * 100
    for name, pct in top_sub.items():
        print(f"  {name:<25} {pct:>6.2f}%")

✅ Final shape: (501981, 58)

📊 Prevalence per clinical group:
  CNS_infection_inflammation: 0.86%
  Neurodegenerative_disorder: 3.98%
  Demyelinating_disease: 0.60%
  Epilepsy: 1.82%
  Headache_disorder: 6.60%
  Cerebrovascular_disease: 2.18%
  Peripheral_nervous_system_disorder: 14.46%
  Paralytic_or_cerebral_palsy_syndrome: 1.48%
  Other_CNS_disorder: 3.28%

🟢 Healthy (no condition): 72.46%

🔍 Top subgroups (prevalence):
  Migraine                    5.58%
  Upper_mono                  5.23%
  Sleep_disorder              4.21%
  TIA                         2.13%
  Epilepsy_dx                 1.80%
  Brain_disorder              1.49%
  Other_headache              1.23%
  Movement_disorder           1.21%
  Polyneuropathy              1.17%
  Lower_mono                  1.08%
  Hemiplegia                  1.03%
  Alzheimer                   0.96%
  Parkinson                   0.93%
  Neurodegenerative_other     0.77%
  Facial_palsy                0.77%


In [6]:
# Preview the first five rows of the phenotype table
pheno_df.head(n=5)

Unnamed: 0,eid,CNS_infection_inflammation,Neurodegenerative_disorder,Demyelinating_disease,Epilepsy,Headache_disorder,Cerebrovascular_disease,Peripheral_nervous_system_disorder,Paralytic_or_cerebral_palsy_syndrome,Other_CNS_disorder,...,SMA,Sleep_disorder,Spinal_cord_disorder,Status_epilepticus,Stroke_syndrome,TIA,Toxic_encephalopathy,Trigeminal_neuralgia,Upper_mono,healthy
0,5574091,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
1,2695459,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
2,1909943,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
3,1832174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
4,3028420,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
