In [6]:
import pandas as pd
from collections import defaultdict

# Read the cleaned nervous-system disorder table from disk.
# NOTE(review): low_memory=False is presumably there to avoid mixed-dtype
# inference on this wide file — confirm against the pandas read_csv docs.
df = pd.read_csv('Nervous_system_disorders_clean_names.csv', low_memory=False)

# Separate the identifier from the disease columns:
#   eid          -> one-column DataFrame holding only 'eid'
#   feature_cols -> every remaining column of df
eid = df.loc[:, ['eid']]
feature_cols = df.drop(columns='eid')

# Clinical groups and the base disease names that belong to each of them.
# The table is written group-first so that each group's membership can be read
# at a glance; it is inverted below into the disease -> group lookup.
_DISEASES_BY_GROUP = {
    # 1. CNS Infection & Inflammation
    "CNS_infection_inflammation": (
        "Bacterial meningitis, not elsewhere classified",
        "Meningitis in bacterial diseases classified elsewhere",
        "Meningitis in other infectious and parasitic diseases classified elsewhere",
        "Meningitis due to other and unspecified causes",
        "Encephalitis, myelitis and encephalomyelitis",
        "Encephalitis, myelitis and encephalomyelitis in diseases classified elsewhere",
        "Intracranial and intraspinal abscess and granuloma",
        "Intracranial and intraspinal abscess and granuloma in diseases classified elsewhere",
        "Intracranial and intraspinal phlebitis and thrombophlebitis",
        "Sequelae of inflammatory diseases of central nervous system",
    ),
    # 2. Neurodegenerative Disorders
    "Neurodegenerative_disorder": (
        "Huntington's disease",
        "Hereditary ataxia",
        "Spinal muscular atrophy and related syndromes",
        "Systemic atrophies primarily affecting central nervous system in diseases classified elsewhere",
        "Postpolio syndrome",
        "Parkinson's disease",
        "Secondary parkinsonism",
        "Parkinsonism in diseases classified elsewhere",
        "Other degenerative diseases of basal ganglia",
        "Dystonia",
        "Other extrapyramidal and movement disorders",
        "Alzheimer's disease",
        "Other degenerative diseases of nervous system, not elsewhere classified",
        "Other degenerative disorders of nervous system in diseases classified elsewhere",
    ),
    # 3. Demyelinating Diseases
    "Demyelinating_disease": (
        "Multiple sclerosis",
        "Other acute disseminated demyelination",
        "Other demyelinating diseases of central nervous system",
    ),
    # 4. Epilepsy
    "Epilepsy": (
        "Epilepsy",
        "Status epilepticus",
    ),
    # 5. Headache Disorders
    "Headache_disorder": (
        "Migraine",
        "Other headache syndromes",
    ),
    # 6. Cerebrovascular Disease
    "Cerebrovascular_disease": (
        "Transient cerebral ischaemic attacks and related syndromes",
        "Vascular syndromes of brain in cerebrovascular diseases",
    ),
    # 7. Peripheral Nervous System Disorders
    "Peripheral_nervous_system_disorder": (
        "Sleep disorders",
        "Disorders of trigeminal nerve",
        "Facial nerve disorders",
        "Disorders of other cranial nerves",
        "Cranial nerve disorders in diseases classified elsewhere",
        "Nerve root and plexus disorders",
        "Nerve root and plexus compressions in diseases classified elsewhere",
        "Mononeuropathies of upper limb",
        "Mononeuropathies of lower limb",
        "Other mononeuropathies",
        "Mononeuropathy in diseases classified elsewhere",
        "Hereditary and idiopathic neuropathy",
        "Inflammatory polyneuropathy",
        "Other polyneuropathies",
        "Polyneuropathy in diseases classified elsewhere",
        "Other disorders of peripheral nervous system",
        "Myasthenia gravis and other myoneural disorders",
        "Primary disorders of muscles",
        "Other myopathies",
        "Disorders of myoneural junction and muscle in diseases classified elsewhere",
    ),
    # 8. Paralytic & Cerebral Palsy Syndromes
    "Paralytic_or_cerebral_palsy_syndrome": (
        "Infantile cerebral palsy",
        "Hemiplegia",
        "Paraplegia and tetraplegia",
        "Other paralytic syndromes",
    ),
    # 9. Other CNS Disorders
    "Other_CNS_disorder": (
        "Disorders of autonomic nervous system",
        "Hydrocephalus",
        "Toxic encephalopathy",
        "Other disorders of brain",
        "Other disorders of brain in diseases classified elsewhere",
        "Other diseases of spinal cord",
        "Other disorders of central nervous system",
        "Postprocedural disorders of nervous system, not elsewhere classified",
        "Other disorders of nervous system, not elsewhere classified",
        "Other disorders of nervous system in diseases classified elsewhere",
    ),
}

# Lookup from base disease name to its clinical group. Entries are inserted
# group by group, in the order the groups and their members are listed above.
CLINICAL_GROUPS = {
    disease_name: group_name
    for group_name, disease_names in _DISEASES_BY_GROUP.items()
    for disease_name in disease_names
}

# Helper: strip a trailing numeric suffix (".1", ".2", ...) from a column name
def get_base_name(col):
    """Return *col* without a trailing ``.<digits>`` suffix.

    Only the text after the last dot is examined. If that text is made up
    entirely of digits, everything before the last dot is returned;
    otherwise *col* is returned unchanged.

    NOTE(review): the numeric suffix is presumably the one pandas appends to
    repeated column headers — confirm against how the CSV was produced.
    """
    stem, dot, suffix = col.rpartition('.')
    if not dot or not suffix.isdigit():
        # No dot at all, or the tail is not purely numeric: nothing to strip.
        return col
    return stem

# Bucket the real column names of feature_cols under their clinical group.
# Suffixed variants of a column land in the same bucket as the plain name,
# because the lookup uses the base name returned by get_base_name.
group_columns = defaultdict(list)
for column in feature_cols.columns:
    base = get_base_name(column)
    clinical_group = CLINICAL_GROUPS.get(base)
    if clinical_group is None:
        # Column has no entry in the mapping: report it and leave it out.
        print(f"⚠️ Warning: '{base}' not in clinical grouping — skipping")
        continue
    group_columns[clinical_group].append(column)

# Build the per-row indicator table: one 'eid' column plus one 0/1 column for
# every clinical group that matched at least one feature column.
# A row gets 1 for a group when any of that group's columns is non-NaN.
simplified = {'eid': eid['eid'].values}
simplified.update({
    group_name: feature_cols[member_cols].notna().any(axis=1).astype(int)
    for group_name, member_cols in group_columns.items()
})

# Assemble the indicator columns into a DataFrame
pheno_clinical = pd.DataFrame(simplified)

# Canonical order of the 9 clinical-group columns in the output table.
clinical_groups_list = [
    'CNS_infection_inflammation',
    'Neurodegenerative_disorder',
    'Demyelinating_disease',
    'Epilepsy',
    'Headache_disorder',
    'Cerebrovascular_disease',
    'Peripheral_nervous_system_disorder',
    'Paralytic_or_cerebral_palsy_syndrome',
    'Other_CNS_disorder',
]

# A group with no matching input column has no indicator column yet;
# add it as all zeros so that every group is present downstream.
absent_groups = [
    name for name in clinical_groups_list
    if name not in pheno_clinical.columns
]
for name in absent_groups:
    pheno_clinical[name] = 0

# 'healthy' is True for rows where none of the 9 group indicators is set.
# Note that it is a boolean column, whereas the group columns hold 0/1 ints.
has_any_condition = pheno_clinical[clinical_groups_list].any(axis=1)
pheno_clinical['healthy'] = ~has_any_condition

# Final layout: eid + 9 groups + healthy -> 11 columns in total.
final_columns = ['eid', *clinical_groups_list, 'healthy']
pheno_clinical = pheno_clinical[final_columns]

# Write the table to disk without the DataFrame index
pheno_clinical.to_csv("nervous_system_clinical_groups_binary.csv", index=False)

# Report the table size, then the percentage of rows flagged in each group
print(f"✅ Final shape: {pheno_clinical.shape}")
print("\nPrevalence per clinical group:")
prevalence = pheno_clinical[clinical_groups_list].mean() * 100
for group_name, pct in prevalence.items():
    print(f"{group_name}: {pct:.2f}%")

# Percentage of rows with no group flagged at all
healthy_pct = pheno_clinical['healthy'].mean() * 100
print(f"\nHealthy (no condition): {healthy_pct:.2f}%")

✅ Final shape: (501981, 11)

Prevalence per clinical group:
CNS_infection_inflammation: 0.86%
Neurodegenerative_disorder: 3.98%
Demyelinating_disease: 0.60%
Epilepsy: 1.82%
Headache_disorder: 6.60%
Cerebrovascular_disease: 2.18%
Peripheral_nervous_system_disorder: 14.46%
Paralytic_or_cerebral_palsy_syndrome: 1.48%
Other_CNS_disorder: 3.28%

Healthy (no condition): 72.46%
