Sleep Data Processing

In [1]:
pip install pyedflib 

Collecting pyedflib
  Downloading pyedflib-0.1.40.tar.gz (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pyedflib
  Building wheel for pyedflib (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyedflib: filename=pyedflib-0.1.40-cp311-cp311-macosx_10_9_x86_64.whl size=2308891 sha256=43f130e83ee258449083b098a4af878e80064efeda5f152e574cf593a4e4c203
  Stored in directory: /Users/mattikey/Library/Caches/pip/wheels/8d/df/d6/88ce619bde055ebffebae5380645802eca490817853b60b45b
Successfully built pyedflib
Installing collected packages: pyedflib
Successfully installed pyedflib-0.1.40
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import mne
import pyedflib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Add these functions to your notebook - these are the missing pieces!

def process_hypnogram_enhanced(edf_path, subject_data=None):
    """
    Enhanced hypnogram processing with subject data integration
    """
    try:
        # Try MNE first
        raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
        annotations = raw.annotations
        
        # Extract metadata from filename
        filename = Path(edf_path).name
        file_metadata = parse_st_filename(filename)
        
        # Find matching subject data
        subject_info = find_subject_info(file_metadata, subject_data)
        
        # Process sleep stages
        hypnogram_data = []
        
        for i, (onset, duration, description) in enumerate(zip(
            annotations.onset, annotations.duration, annotations.description)):
            
            stage = map_sleep_stage(description)
            
            hypnogram_data.append({
                'epoch': i,
                'onset_seconds': float(onset),
                'duration_seconds': float(duration),
                'onset_minutes': float(onset) / 60,
                'onset_hours': float(onset) / 3600,
                'stage': stage,
                'stage_numeric': stage_to_numeric(stage),
                'raw_description': description.strip()
            })
        
        # Calculate comprehensive statistics
        summary_stats = calculate_comprehensive_stats(hypnogram_data)
        
        # Recording metadata
        recording_info = {
            'duration_hours': (annotations.onset[-1] + annotations.duration[-1]) / 3600 if len(annotations) > 0 else 0,
            'total_epochs': len(hypnogram_data),
            'start_time': raw.info['meas_date'].isoformat() if raw.info['meas_date'] else None,
            'sampling_rate': raw.info['sfreq'],
            'epoch_length_seconds': 30  # Standard sleep scoring epoch
        }
        
        return {
            'metadata': {
                'filename': filename,
                **file_metadata,
                'subject_info': subject_info,
                'study_type': 'Sleep Telemetry',
                'recording_info': recording_info
            },
            'hypnogram': hypnogram_data,
            'summary_stats': summary_stats
        }
        
    except Exception as e:
        print(f"Error processing {edf_path}: {str(e)}")
        return None

def parse_st_filename(filename):
    """
    Parse ST filename: ST7011JP-Hypnogram.edf
    ST = Study Type
    70 = Subject ID  
    1 = Night
    1 = Condition (0=placebo, 1=temazepam typically)
    J = Additional identifier
    P = Additional identifier
    """
    base = filename.replace('-Hypnogram.edf', '').replace('-PSG.edf', '')
    
    if base.startswith('ST7'):
        # Extract components
        subject_id = base[3:5] if len(base) >= 5 else None
        night_id = base[5:6] if len(base) >= 6 else None
        condition_code = base[6:7] if len(base) >= 7 else None
        
        # Map condition code
        condition_map = {'0': 'placebo', '1': 'temazepam', 'P': 'placebo', 'J': 'temazepam'}
        condition = condition_map.get(condition_code, condition_code)
        
        return {
            'subject_id': subject_id,
            'night_id': night_id,
            'condition': condition,
            'condition_code': condition_code
        }
    
    return {'subject_id': None, 'night_id': None, 'condition': None}

def find_subject_info(file_metadata, subject_data):
    """
    Find matching subject information from demographics data
    """
    if not subject_data or not file_metadata.get('subject_id'):
        return {'age': None, 'gender': None}
    
    subject_id = file_metadata['subject_id']
    night_id = file_metadata['night_id']
    condition = file_metadata['condition']
    
    # Look for matching record
    for record in subject_data:
        record_subj = str(record.get('subject', '')).zfill(2)
        record_night = str(record.get('night', ''))
        record_condition = str(record.get('condition', '')).lower()
        
        if (record_subj == subject_id and 
            record_night == night_id and
            condition and record_condition in condition.lower()):
            
            return {
                'age': record.get('age'),
                'gender': record.get('gender'),
                'condition_verified': record.get('condition')
            }
    
    # If no exact match, try just subject ID
    for record in subject_data:
        record_subj = str(record.get('subject', '')).zfill(2)
        if record_subj == subject_id:
            return {
                'age': record.get('age'),
                'gender': record.get('gender'),
                'condition_verified': None
            }
    
    return {'age': None, 'gender': None}

def map_sleep_stage(description):
    """Enhanced sleep stage mapping"""
    desc = description.lower().strip()
    
    # Comprehensive mapping
    if 'sleep stage w' in desc or desc == 'w':
        return 'Wake'
    elif 'sleep stage r' in desc or desc == 'r':
        return 'REM'
    elif 'sleep stage 1' in desc or desc == '1':
        return 'N1'
    elif 'sleep stage 2' in desc or desc == '2':
        return 'N2'
    elif 'sleep stage 3' in desc or desc == '3':
        return 'N3'
    elif 'sleep stage 4' in desc or desc == '4':
        return 'N4'
    elif 'movement' in desc or desc == 'm':
        return 'Movement'
    elif '?' in desc:
        return 'Unknown'
    else:
        return description  # Keep original if no mapping found

def stage_to_numeric(stage):
    """Convert stage to numeric for analysis"""
    mapping = {
        'Wake': 0, 'N1': 1, 'N2': 2, 'N3': 3, 'N4': 4, 
        'REM': 5, 'Movement': 6, 'Unknown': -1
    }
    return mapping.get(stage, -1)

def calculate_comprehensive_stats(hypnogram_data):
    """Calculate comprehensive sleep statistics"""
    if not hypnogram_data:
        return {
            'sleep_efficiency': 0.0,
            'rem_percentage': 0.0,
            'deep_sleep_percentage': 0.0,
            'light_sleep_percentage': 0.0,
            'wake_percentage': 100.0,
            'stage_percentages': {},
            'sleep_onset_minutes': None,
            'rem_onset_minutes': None,
            'total_sleep_time_hours': 0.0,
            'wake_after_sleep_onset_minutes': 0.0
        }
    
    # Calculate durations
    total_duration = sum(epoch['duration_seconds'] for epoch in hypnogram_data)
    stage_durations = {}
    
    for epoch in hypnogram_data:
        stage = epoch['stage']
        duration = epoch['duration_seconds']
        stage_durations[stage] = stage_durations.get(stage, 0) + duration
    
    # Stage percentages
    stage_percentages = {
        stage: (duration / total_duration) * 100 
        for stage, duration in stage_durations.items()
    }
    
    # Key metrics
    wake_time = stage_durations.get('Wake', 0)
    rem_time = stage_durations.get('REM', 0)
    n1_time = stage_durations.get('N1', 0)
    n2_time = stage_durations.get('N2', 0)
    n3_time = stage_durations.get('N3', 0)
    n4_time = stage_durations.get('N4', 0)
    
    total_sleep_time = total_duration - wake_time
    
    # Find sleep onset (first non-wake stage)
    sleep_onset_minutes = None
    rem_onset_minutes = None
    
    for epoch in hypnogram_data:
        if epoch['stage'] != 'Wake' and sleep_onset_minutes is None:
            sleep_onset_minutes = epoch['onset_minutes']
        if epoch['stage'] == 'REM' and rem_onset_minutes is None:
            rem_onset_minutes = epoch['onset_minutes']
    
    return {
        'sleep_efficiency': round((total_sleep_time / total_duration) * 100, 2) if total_duration > 0 else 0,
        'rem_percentage': round((rem_time / total_duration) * 100, 2),
        'deep_sleep_percentage': round(((n3_time + n4_time) / total_duration) * 100, 2),
        'light_sleep_percentage': round(((n1_time + n2_time) / total_duration) * 100, 2),
        'wake_percentage': round((wake_time / total_duration) * 100, 2),
        'stage_percentages': {k: round(v, 2) for k, v in stage_percentages.items()},
        'sleep_onset_minutes': round(sleep_onset_minutes, 1) if sleep_onset_minutes else None,
        'rem_onset_minutes': round(rem_onset_minutes, 1) if rem_onset_minutes else None,
        'total_sleep_time_hours': round(total_sleep_time / 3600, 2),
        'wake_after_sleep_onset_minutes': round((wake_time - (sleep_onset_minutes * 60 if sleep_onset_minutes else 0)) / 60, 1) if sleep_onset_minutes else 0
    }

# Fix your ST_SUBJECTS_DATA - it needs proper formatting
ST_SUBJECTS_DATA = """
Subject age sex Placebo_night Temazepam_night Nr Age M1F2 night_nr lights_off night_nr lights_off
1 60 1 1 23:01 2 23:48
2 35 2 2 23:27 1 0:00
4 18 2 1 23:53 2 22:37
5 32 2 2 23:23 1 23:34
6 35 2 1 23:28 2 23:26
7 51 2 1 0:02 2 23:24
8 66 2 2 23:20 1 23:53
9 47 1 2 0:30 1 23:42
10 20 2 1 23:21 2 23:28
11 21 2 2 23:52 1 23:38
12 21 1 1 23:46 2 23:56
13 22 1 2 0:31 1 0:38
14 20 1 1 0:40 2 0:53
15 66 2 1 23:42 2 23:33
16 79 2 2 23:21 1 23:18
17 48 2 1 23:40 2 23:48
18 53 2 2 23:38 1 23:24
19 28 2 2 23:22 1 23:44
20 24 1 1 23:47 2 0:01
21 34 2 2 23:44 1 23:10
22 56 1 1 23:22 2 23:44
24 48 2 1 23:27 2 23:36
"""

In [16]:
# ============================================================================
# MAIN PROCESSING PIPELINE
# ============================================================================

def process_st_subjects_from_text(text_data):
    """
    Process ST subjects data from the provided text format
    """
    lines = text_data.strip().split('\n')
    
    subjects = []
    
    # Skip header line and process each subject
    for line in lines[1:]:  # Skip the header
        if not line.strip():
            continue
            
        parts = line.split()
        if len(parts) >= 8:
            subject_num = int(parts[0])
            age = int(parts[1])
            sex_code = int(parts[2])  # 1=Male, 2=Female
            gender = 'M' if sex_code == 1 else 'F'
            
            placebo_night = int(parts[3])
            placebo_time = parts[4]
            temazepam_night = int(parts[5]) 
            temazepam_time = parts[6]
            
            # Create records for both nights
            subjects.append({
                'subject': f"{subject_num:02d}",
                'age': age,
                'gender': gender,
                'night': placebo_night,
                'condition': 'placebo',
                'lights_off_time': placebo_time,
                'study': 'ST'
            })
            
            subjects.append({
                'subject': f"{subject_num:02d}",
                'age': age,
                'gender': gender,
                'night': temazepam_night,
                'condition': 'temazepam',
                'lights_off_time': temazepam_time,
                'study': 'ST'
            })
    
    print(f"Processed {len(subjects)} subject records from {len(lines)-1} subjects")
    return subjects

In [17]:
def create_dataset_summary(processed_data, subject_data):
    """
    Create comprehensive dataset summary for your project
    """
    summary = {
        'total_subjects': len(set(record['metadata']['subject_info'].get('age') for record in processed_data if record['metadata']['subject_info'].get('age'))),
        'total_recordings': len(processed_data),
        'conditions': {},
        'age_distribution': [],
        'gender_distribution': {},
        'sleep_stats_by_condition': {},
        'sleep_stats_by_age_group': {},
        'sleep_stats_by_gender': {}
    }
    
    # Group by conditions
    condition_groups = {}
    age_groups = {'18-30': [], '31-50': [], '51-70': [], '70+': []}
    gender_groups = {'M': [], 'F': []}
    
    for record in processed_data:
        if not record:
            continue
            
        condition = record['metadata'].get('condition', 'unknown')
        age = record['metadata']['subject_info'].get('age')
        gender = record['metadata']['subject_info'].get('gender')
        stats = record['summary_stats']
        
        # Condition grouping
        if condition not in condition_groups:
            condition_groups[condition] = []
        condition_groups[condition].append(stats)
        
        # Age grouping
        if age:
            if age <= 30:
                age_groups['18-30'].append(stats)
            elif age <= 50:
                age_groups['31-50'].append(stats)
            elif age <= 70:
                age_groups['51-70'].append(stats)
            else:
                age_groups['70+'].append(stats)
        
        # Gender grouping
        if gender in gender_groups:
            gender_groups[gender].append(stats)
    
    # Calculate averages for each group
    summary['conditions'] = {k: len(v) for k, v in condition_groups.items()}
    summary['sleep_stats_by_condition'] = {
        condition: calculate_group_averages(stats_list)
        for condition, stats_list in condition_groups.items()
    }
    
    summary['sleep_stats_by_age_group'] = {
        age_group: calculate_group_averages(stats_list)
        for age_group, stats_list in age_groups.items() if stats_list
    }
    
    summary['sleep_stats_by_gender'] = {
        gender: calculate_group_averages(stats_list)
        for gender, stats_list in gender_groups.items() if stats_list
    }
    
    # Age and gender distributions
    ages = [record['metadata']['subject_info'].get('age') for record in processed_data 
            if record and record['metadata']['subject_info'].get('age')]
    genders = [record['metadata']['subject_info'].get('gender') for record in processed_data 
               if record and record['metadata']['subject_info'].get('gender')]
    
    summary['age_distribution'] = ages
    summary['gender_distribution'] = {gender: genders.count(gender) for gender in set(genders)}
    
    return summary


In [18]:
def calculate_group_averages(stats_list):
    """Calculate average statistics for a group"""
    if not stats_list:
        return {}
    
    avg_stats = {}
    
    # Calculate averages for numeric fields
    numeric_fields = ['sleep_efficiency', 'rem_percentage', 'deep_sleep_percentage', 
                     'light_sleep_percentage', 'total_sleep_time_hours']
    
    for field in numeric_fields:
        values = [stats.get(field, 0) for stats in stats_list if stats.get(field) is not None]
        avg_stats[field] = round(sum(values) / len(values), 2) if values else 0
    
    # Calculate average stage percentages
    all_stages = set()
    for stats in stats_list:
        if 'stage_percentages' in stats:
            all_stages.update(stats['stage_percentages'].keys())
    
    avg_stage_percentages = {}
    for stage in all_stages:
        values = [stats['stage_percentages'].get(stage, 0) 
                 for stats in stats_list if 'stage_percentages' in stats]
        avg_stage_percentages[stage] = round(sum(values) / len(values), 2) if values else 0
    
    avg_stats['stage_percentages'] = avg_stage_percentages
    
    return avg_stats

In [19]:
def run_complete_processing(edf_data_dir, output_dir, st_subjects_text):
    """
    Run the complete processing pipeline
    """
    print("=== Starting Complete Sleep Data Processing ===")
    
    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # Process subject demographics
    print("\n1. Processing subject demographics...")
    subject_data = process_st_subjects_from_text(st_subjects_text)
    
    # Save subject data
    with open(output_path / "st_subjects_processed.json", 'w') as f:
        json.dump(subject_data, f, indent=2)
    
    # Process hypnogram files
    print("\n2. Processing hypnogram files...")
    data_path = Path(edf_data_dir)
    hypnogram_files = list(data_path.glob("*Hypnogram.edf"))
    
    print(f"Found {len(hypnogram_files)} hypnogram files")
    
    processed_data = []
    failed_files = []
    
    for file_path in hypnogram_files:
        print(f"Processing {file_path.name}...")
        
        result = process_hypnogram_enhanced(file_path, subject_data)
        
        if result:
            processed_data.append(result)
            
            # Save individual file
            output_file = output_path / f"{file_path.stem}.json"
            with open(output_file, 'w') as f:
                json.dump(result, f, indent=2)
        else:
            failed_files.append(file_path.name)
    
    print(f"\n3. Processing complete!")
    print(f"   Successfully processed: {len(processed_data)} files")
    print(f"   Failed to process: {len(failed_files)} files")
    
    if failed_files:
        print(f"   Failed files: {failed_files}")
    
    # Create dataset summary
    print("\n4. Creating dataset summary...")
    dataset_summary = create_dataset_summary(processed_data, subject_data)
    
    # Save all results
    with open(output_path / "combined_sleep_data.json", 'w') as f:
        json.dump(processed_data, f, indent=2)
    
    with open(output_path / "dataset_summary.json", 'w') as f:
        json.dump(dataset_summary, f, indent=2)
    
    # Print summary statistics
    print(f"\n=== PROCESSING SUMMARY ===")
    print(f"Total recordings processed: {len(processed_data)}")
    print(f"Subjects with demographics: {len(subject_data)}")
    print(f"Conditions found: {list(dataset_summary['conditions'].keys())}")
    print(f"Age range: {min(dataset_summary['age_distribution'])}-{max(dataset_summary['age_distribution'])} years")
    print(f"Gender distribution: {dataset_summary['gender_distribution']}")
    
    return processed_data, dataset_summary


In [22]:
# Your ST subjects data
ST_SUBJECTS_DATA = """
Subject - age - sex Placebo night Temazepam night Nr Age M1/F2 night nr lights off night nr lights off 1 60 1 1 23:01 2 23:48 2 35 2 2 23:27 1 0:00 4 18 2 1 23:53 2 22:37 5 32 2 2 23:23 1 23:34 6 35 2 1 23:28 2 23:26 7 51 2 1 0:02 2 23:24 8 66 2 2 23:20 1 23:53 9 47 1 2 0:30 1 23:42 10 20 2 1 23:21 2 23:28 11 21 2 2 23:52 1 23:38 12 21 1 1 23:46 2 23:56 13 22 1 2 0:31 1 0:38 14 20 1 1 0:40 2 0:53 15 66 2 1 23:42 2 23:33 16 79 2 2 23:21 1 23:18 17 48 2 1 23:40 2 23:48 18 53 2 2 23:38 1 23:24 19 28 2 2 23:22 1 23:44 20 24 1 1 23:47 2 0:01 21 34 2 2 23:44 1 23:10 22 56 1 1 23:22 2 23:44 24 48 2 1 23:27 2 23:36
"""

if __name__ == "__main__":
    # Run the complete processing
    edf_directory = "Data/raw-sleep-telemetry"  # Directory containing EDF files
    output_directory = "processed_sleep_data"
    
    processed_data, summary = run_complete_processing(
        edf_directory, 
        output_directory, 
        ST_SUBJECTS_DATA
    )
    
    print("\nProcessing complete! Check the output directory for:")
    print("- combined_sleep_data.json (all processed recordings)")
    print("- dataset_summary.json (statistics and summaries)")
    print("- st_subjects_processed.json (clean subject demographics)")
    print("- Individual JSON files for each recording")

# ============================================================================
# SUBJECT DEMOGRAPHICS PROCESSING
# ============================================================================

def process_st_subjects(file_path):
    """
    Process ST-subjects file (handles .xls, .xlsx, .csv)
    Returns cleaned subject demographics
    """
    file_path = Path(file_path)
    
    try:
        # Try different file formats
        if file_path.suffix in ['.xls', '.xlsx']:
            df = pd.read_excel(file_path)
        elif file_path.suffix == '.csv':
            df = pd.read_csv(file_path)
        else:
            print(f"Unsupported file format: {file_path.suffix}")
            return None
            
        print(f"Loaded subject data with columns: {list(df.columns)}")
        print(f"Shape: {df.shape}")
        print("\nFirst few rows:")
        print(df.head())
        
        # Clean and standardize the data
        # Common column name variations to look for
        column_mappings = {
            'subject': ['subject', 'subject_id', 'subj', 'id'],
            'age': ['age', 'age_years'],
            'gender': ['gender', 'sex', 'male_female', 'm_f'],
            'condition': ['condition', 'treatment', 'drug', 'medication'],
            'night': ['night', 'session', 'recording']
        }
        
        # Standardize column names
        df_clean = df.copy()
        for standard_name, variations in column_mappings.items():
            for col in df.columns:
                if col.lower().strip() in [v.lower() for v in variations]:
                    df_clean = df_clean.rename(columns={col: standard_name})
                    break
        
        return df_clean.to_dict('records')
        
    except Exception as e:
        print(f"Error processing subject file: {e}")
        print("Creating mock subject data based on known ST study structure...")
        return create_mock_st_subjects()

def create_mock_st_subjects():
    """
    Create mock subject data based on ST study documentation
    22 subjects, 2 nights each (placebo/temazepam)
    """
    subjects = []
    
    # Generate 22 subjects (ages 20-60, mixed gender)
    np.random.seed(42)  # For reproducible results
    
    for subj_num in range(1, 23):  # subjects 1-22
        age = np.random.randint(20, 61)
        gender = np.random.choice(['M', 'F'])
        
        # Each subject has 2 nights
        for night in [1, 2]:
            # Randomize which night gets temazepam vs placebo
            condition = 'temazepam' if (subj_num + night) % 2 == 0 else 'placebo'
            
            subjects.append({
                'subject': f"{subj_num:02d}",
                'age': age,
                'gender': gender,
                'night': night,
                'condition': condition,
                'study': 'ST'
            })
    
    print(f"Created mock data for {len(subjects)} recordings")
    return subjects



=== Starting Complete Sleep Data Processing ===

1. Processing subject demographics...
Processed 0 subject records from 0 subjects

2. Processing hypnogram files...
Found 40 hypnogram files
Processing ST7152JA-Hypnogram.edf...
Processing ST7071JA-Hypnogram.edf...
Processing ST7081JW-Hypnogram.edf...
Processing ST7042JO-Hypnogram.edf...
Processing ST7052JA-Hypnogram.edf...
Processing ST7221JA-Hypnogram.edf...
Processing ST7171JA-Hypnogram.edf...
Processing ST7112JE-Hypnogram.edf...
Processing ST7192JR-Hypnogram.edf...
Processing ST7062JR-Hypnogram.edf...
Processing ST7142JE-Hypnogram.edf...
Processing ST7121JE-Hypnogram.edf...
Processing ST7242JO-Hypnogram.edf...
Processing ST7211JJ-Hypnogram.edf...
Processing ST7202JO-Hypnogram.edf...
Processing ST7132JR-Hypnogram.edf...
Processing ST7091JE-Hypnogram.edf...
Processing ST7182JR-Hypnogram.edf...
Processing ST7102JE-Hypnogram.edf...
Processing ST7162JM-Hypnogram.edf...
Processing ST7222JA-Hypnogram.edf...
Processing ST7051JA-Hypnogram.edf

ValueError: min() arg is an empty sequence

In [21]:
# ============================================================================
# ENHANCED EDF PROCESSING
# ============================================================================

def process_hypnogram_enhanced(edf_path, subject_data=None):
    """
    Enhanced hypnogram processing with subject data integration
    """
    try:
        # Try MNE first
        raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
        annotations = raw.annotations
        
        # Extract metadata from filename
        filename = Path(edf_path).name
        file_metadata = parse_st_filename(filename)
        
        # Find matching subject data
        subject_info = find_subject_info(file_metadata, subject_data)
        
        # Process sleep stages
        hypnogram_data = []
        
        for i, (onset, duration, description) in enumerate(zip(
            annotations.onset, annotations.duration, annotations.description)):
            
            stage = map_sleep_stage(description)
            
            hypnogram_data.append({
                'epoch': i,
                'onset_seconds': float(onset),
                'duration_seconds': float(duration),
                'onset_minutes': float(onset) / 60,
                'onset_hours': float(onset) / 3600,
                'stage': stage,
                'stage_numeric': stage_to_numeric(stage),
                'raw_description': description.strip()
            })
        
        # Calculate comprehensive statistics
        summary_stats = calculate_comprehensive_stats(hypnogram_data)
        
        # Recording metadata
        recording_info = {
            'duration_hours': (annotations.onset[-1] + annotations.duration[-1]) / 3600 if len(annotations) > 0 else 0,
            'total_epochs': len(hypnogram_data),
            'start_time': raw.info['meas_date'].isoformat() if raw.info['meas_date'] else None,
            'sampling_rate': raw.info['sfreq'],
            'epoch_length_seconds': 30  # Standard sleep scoring epoch
        }
        
        return {
            'metadata': {
                'filename': filename,
                **file_metadata,
                'subject_info': subject_info,
                'study_type': 'Sleep Telemetry',
                'recording_info': recording_info
            },
            'hypnogram': hypnogram_data,
            'summary_stats': summary_stats
        }
        
    except Exception as e:
        print(f"Error processing {edf_path}: {str(e)}")
        return None

def parse_st_filename(filename):
    """
    Parse ST filename: ST7011JP-Hypnogram.edf
    ST = Study Type
    70 = Subject ID  
    1 = Night
    1 = Condition (0=placebo, 1=temazepam typically)
    J = Additional identifier
    P = Additional identifier
    """
    base = filename.replace('-Hypnogram.edf', '').replace('-PSG.edf', '')
    
    if base.startswith('ST7'):
        # Extract components
        subject_id = base[3:5] if len(base) >= 5 else None
        night_id = base[5:6] if len(base) >= 6 else None
        condition_code = base[6:7] if len(base) >= 7 else None
        
        # Map condition code
        condition_map = {'0': 'placebo', '1': 'temazepam', 'P': 'placebo', 'J': 'temazepam'}
        condition = condition_map.get(condition_code, condition_code)
        
        return {
            'subject_id': subject_id,
            'night_id': night_id,
            'condition': condition,
            'condition_code': condition_code
        }
    
    return {'subject_id': None, 'night_id': None, 'condition': None}

def find_subject_info(file_metadata, subject_data):
    """
    Find matching subject information from demographics data
    """
    if not subject_data or not file_metadata.get('subject_id'):
        return {'age': None, 'gender': None}
    
    subject_id = file_metadata['subject_id']
    night_id = file_metadata['night_id']
    condition = file_metadata['condition']
    
    # Look for matching record
    for record in subject_data:
        record_subj = str(record.get('subject', '')).zfill(2)
        record_night = str(record.get('night', ''))
        record_condition = str(record.get('condition', '')).lower()
        
        if (record_subj == subject_id and 
            record_night == night_id and
            condition and record_condition in condition.lower()):
            
            return {
                'age': record.get('age'),
                'gender': record.get('gender'),
                'condition_verified': record.get('condition')
            }
    
    # If no exact match, try just subject ID
    for record in subject_data:
        record_subj = str(record.get('subject', '')).zfill(2)
        if record_subj == subject_id:
            return {
                'age': record.get('age'),
                'gender': record.get('gender'),
                'condition_verified': None
            }
    
    return {'age': None, 'gender': None}

def map_sleep_stage(description):
    """Enhanced sleep stage mapping"""
    desc = description.lower().strip()
    
    # Comprehensive mapping
    if 'sleep stage w' in desc or desc == 'w':
        return 'Wake'
    elif 'sleep stage r' in desc or desc == 'r':
        return 'REM'
    elif 'sleep stage 1' in desc or desc == '1':
        return 'N1'
    elif 'sleep stage 2' in desc or desc == '2':
        return 'N2'
    elif 'sleep stage 3' in desc or desc == '3':
        return 'N3'
    elif 'sleep stage 4' in desc or desc == '4':
        return 'N4'
    elif 'movement' in desc or desc == 'm':
        return 'Movement'
    elif '?' in desc:
        return 'Unknown'
    else:
        return description  # Keep original if no mapping found

def stage_to_numeric(stage):
    """Convert stage to numeric for analysis"""
    mapping = {
        'Wake': 0, 'N1': 1, 'N2': 2, 'N3': 3, 'N4': 4, 
        'REM': 5, 'Movement': 6, 'Unknown': -1
    }
    return mapping.get(stage, -1)

def calculate_comprehensive_stats(hypnogram_data):
    """Calculate comprehensive sleep statistics"""
    if not hypnogram_data:
        return {
            'sleep_efficiency': 0.0,
            'rem_percentage': 0.0,
            'deep_sleep_percentage': 0.0,
            'light_sleep_percentage': 0.0,
            'wake_percentage': 100.0,
            'stage_percentages': {},
            'sleep_onset_minutes': None,
            'rem_onset_minutes': None,
            'total_sleep_time_hours': 0.0,
            'wake_after_sleep_onset_minutes': 0.0
        }
    
    # Calculate durations
    total_duration = sum(epoch['duration_seconds'] for epoch in hypnogram_data)
    stage_durations = {}
    
    for epoch in hypnogram_data:
        stage = epoch['stage']
        duration = epoch['duration_seconds']
        stage_durations[stage] = stage_durations.get(stage, 0) + duration
    
    # Stage percentages
    stage_percentages = {
        stage: (duration / total_duration) * 100 
        for stage, duration in stage_durations.items()
    }
    
    # Key metrics
    wake_time = stage_durations.get('Wake', 0)
    rem_time = stage_durations.get('REM', 0)
    n1_time = stage_durations.get('N1', 0)
    n2_time = stage_durations.get('N2', 0)
    n3_time = stage_durations.get('N3', 0)
    n4_time = stage_durations.get('N4', 0)
    
    total_sleep_time = total_duration - wake_time
    
    # Find sleep onset (first non-wake stage)
    sleep_onset_minutes = None
    rem_onset_minutes = None
    
    for epoch in hypnogram_data:
        if epoch['stage'] != 'Wake' and sleep_onset_minutes is None:
            sleep_onset_minutes = epoch['onset_minutes']
        if epoch['stage'] == 'REM' and rem_onset_minutes is None:
            rem_onset_minutes = epoch['onset_minutes']
    
    return {
        'sleep_efficiency': round((total_sleep_time / total_duration) * 100, 2) if total_duration > 0 else 0,
        'rem_percentage': round((rem_time / total_duration) * 100, 2),
        'deep_sleep_percentage': round(((n3_time + n4_time) / total_duration) * 100, 2),
        'light_sleep_percentage': round(((n1_time + n2_time) / total_duration) * 100, 2),
        'wake_percentage': round((wake_time / total_duration) * 100, 2),
        'stage_percentages': {k: round(v, 2) for k, v in stage_percentages.items()},
        'sleep_onset_minutes': round(sleep_onset_minutes, 1) if sleep_onset_minutes else None,
        'rem_onset_minutes': round(rem_onset_minutes, 1) if rem_onset_minutes else None,
        'total_sleep_time_hours': round(total_sleep_time / 3600, 2),
        'wake_after_sleep_onset_minutes': round((wake_time - (sleep_onset_minutes * 60 if sleep_onset_minutes else 0)) / 60, 1) if sleep_onset_minutes else 0
    }


In [23]:
# Add this cell to debug and fix the issues

# 1. First, let's test the subject parsing with a simpler format
def test_subject_parsing():
    """Test subject parsing with debug output"""
    
    # Try the properly formatted data
    test_data = """
1 60 1 1 23:01 2 23:48
2 35 2 2 23:27 1 0:00
4 18 2 1 23:53 2 22:37
5 32 2 2 23:23 1 23:34
"""
    
    print("Testing subject parsing...")
    print(f"Input data:\n{test_data}")
    
    lines = test_data.strip().split('\n')
    print(f"Found {len(lines)} lines")
    
    subjects = []
    
    # Process each line (no header to skip)
    for i, line in enumerate(lines):
        if not line.strip():
            continue
            
        print(f"Processing line {i}: '{line}'")
        parts = line.split()
        print(f"  Split into {len(parts)} parts: {parts}")
        
        if len(parts) >= 7:  # Need at least 7 parts
            try:
                subject_num = int(parts[0])
                age = int(parts[1])
                sex_code = int(parts[2])  # 1=Male, 2=Female
                gender = 'M' if sex_code == 1 else 'F'
                
                placebo_night = int(parts[3])
                placebo_time = parts[4]
                temazepam_night = int(parts[5]) 
                temazepam_time = parts[6]
                
                print(f"  Parsed: Subject {subject_num}, Age {age}, Gender {gender}")
                
                # Create records for both nights
                subjects.append({
                    'subject': f"{subject_num:02d}",
                    'age': age,
                    'gender': gender,
                    'night': placebo_night,
                    'condition': 'placebo',
                    'lights_off_time': placebo_time,
                    'study': 'ST'
                })
                
                subjects.append({
                    'subject': f"{subject_num:02d}",
                    'age': age,
                    'gender': gender,
                    'night': temazepam_night,
                    'condition': 'temazepam',
                    'lights_off_time': temazepam_time,
                    'study': 'ST'
                })
                
            except Exception as e:
                print(f"  Error parsing line: {e}")
        else:
            print(f"  Skipping line - not enough parts ({len(parts)})")
    
    print(f"\nCreated {len(subjects)} subject records:")
    for subj in subjects[:4]:  # Show first 4
        print(f"  {subj}")
    
    return subjects

# Test the parsing
test_subjects = test_subject_parsing()

# 2. Fixed subject parsing function
def process_st_subjects_from_text_fixed(text_data):
    """
    Fixed version of subject parsing
    """
    # Clean the data - remove the messy header and use clean format
    clean_data = """
1 60 1 1 23:01 2 23:48
2 35 2 2 23:27 1 0:00
4 18 2 1 23:53 2 22:37
5 32 2 2 23:23 1 23:34
6 35 2 1 23:28 2 23:26
7 51 2 1 0:02 2 23:24
8 66 2 2 23:20 1 23:53
9 47 1 2 0:30 1 23:42
10 20 2 1 23:21 2 23:28
11 21 2 2 23:52 1 23:38
12 21 1 1 23:46 2 23:56
13 22 1 2 0:31 1 0:38
14 20 1 1 0:40 2 0:53
15 66 2 1 23:42 2 23:33
16 79 2 2 23:21 1 23:18
17 48 2 1 23:40 2 23:48
18 53 2 2 23:38 1 23:24
19 28 2 2 23:22 1 23:44
20 24 1 1 23:47 2 0:01
21 34 2 2 23:44 1 23:10
22 56 1 1 23:22 2 23:44
24 48 2 1 23:27 2 23:36
"""
    
    lines = clean_data.strip().split('\n')
    subjects = []
    
    for line in lines:
        if not line.strip():
            continue
            
        parts = line.split()
        if len(parts) >= 7:
            try:
                subject_num = int(parts[0])
                age = int(parts[1])
                sex_code = int(parts[2])  # 1=Male, 2=Female
                gender = 'M' if sex_code == 1 else 'F'
                
                placebo_night = int(parts[3])
                placebo_time = parts[4]
                temazepam_night = int(parts[5]) 
                temazepam_time = parts[6]
                
                # Create records for both nights
                subjects.append({
                    'subject': f"{subject_num:02d}",
                    'age': age,
                    'gender': gender,
                    'night': placebo_night,
                    'condition': 'placebo',
                    'lights_off_time': placebo_time,
                    'study': 'ST'
                })
                
                subjects.append({
                    'subject': f"{subject_num:02d}",
                    'age': age,
                    'gender': gender,
                    'night': temazepam_night,
                    'condition': 'temazepam',
                    'lights_off_time': temazepam_time,
                    'study': 'ST'
                })
                
            except Exception as e:
                print(f"Error parsing line '{line}': {e}")
    
    print(f"Processed {len(subjects)} subject records from {len(lines)} subjects")
    return subjects

# 3. Fixed summary function to handle empty age data
def create_dataset_summary_fixed(processed_data, subject_data):
    """
    Fixed version that handles empty age distributions
    """
    summary = {
        'total_subjects': len(set(record['metadata']['subject_info'].get('age') for record in processed_data if record and record['metadata']['subject_info'].get('age'))),
        'total_recordings': len(processed_data),
        'conditions': {},
        'age_distribution': [],
        'gender_distribution': {},
        'sleep_stats_by_condition': {},
        'sleep_stats_by_age_group': {},
        'sleep_stats_by_gender': {}
    }
    
    # Group by conditions
    condition_groups = {}
    age_groups = {'18-30': [], '31-50': [], '51-70': [], '70+': []}
    gender_groups = {'M': [], 'F': []}
    
    for record in processed_data:
        if not record:
            continue
            
        condition = record['metadata'].get('condition', 'unknown')
        age = record['metadata']['subject_info'].get('age')
        gender = record['metadata']['subject_info'].get('gender')
        stats = record['summary_stats']
        
        # Condition grouping
        if condition not in condition_groups:
            condition_groups[condition] = []
        condition_groups[condition].append(stats)
        
        # Age grouping
        if age:
            if age <= 30:
                age_groups['18-30'].append(stats)
            elif age <= 50:
                age_groups['31-50'].append(stats)
            elif age <= 70:
                age_groups['51-70'].append(stats)
            else:
                age_groups['70+'].append(stats)
        
        # Gender grouping
        if gender in gender_groups:
            gender_groups[gender].append(stats)
    
    # Calculate averages for each group
    summary['conditions'] = {k: len(v) for k, v in condition_groups.items()}
    summary['sleep_stats_by_condition'] = {
        condition: calculate_group_averages(stats_list)
        for condition, stats_list in condition_groups.items()
    }
    
    summary['sleep_stats_by_age_group'] = {
        age_group: calculate_group_averages(stats_list)
        for age_group, stats_list in age_groups.items() if stats_list
    }
    
    summary['sleep_stats_by_gender'] = {
        gender: calculate_group_averages(stats_list)
        for gender, stats_list in gender_groups.items() if stats_list
    }
    
    # Age and gender distributions
    ages = [record['metadata']['subject_info'].get('age') for record in processed_data 
            if record and record['metadata']['subject_info'].get('age')]
    genders = [record['metadata']['subject_info'].get('gender') for record in processed_data 
               if record and record['metadata']['subject_info'].get('gender')]
    
    summary['age_distribution'] = ages
    summary['gender_distribution'] = {gender: genders.count(gender) for gender in set(genders) if gender}
    
    return summary

# 4. Test with your actual processed data
def reprocess_with_demographics(processed_data_file="processed_sleep_data/combined_sleep_data.json"):
    """
    Reprocess to add demographics to existing processed data
    """
    import json
    
    # Load your existing processed data
    try:
        with open(processed_data_file, 'r') as f:
            processed_data = json.load(f)
        print(f"Loaded {len(processed_data)} existing records")
    except:
        print("Could not load existing data")
        return
    
    # Get the fixed subject data
    subject_data = process_st_subjects_from_text_fixed("")
    print(f"Created {len(subject_data)} subject records")
    
    # Update each record with demographics
    updated_count = 0
    for record in processed_data:
        if not record:
            continue
            
        file_metadata = record['metadata']
        subject_info = find_subject_info(file_metadata, subject_data)
        
        if subject_info.get('age'):
            record['metadata']['subject_info'] = subject_info
            updated_count += 1
    
    print(f"Updated {updated_count} records with demographics")
    
    # Create new summary
    summary = create_dataset_summary_fixed(processed_data, subject_data)
    
    # Save updated data
    with open("processed_sleep_data/combined_sleep_data_fixed.json", 'w') as f:
        json.dump(processed_data, f, indent=2)
    
    with open("processed_sleep_data/dataset_summary_fixed.json", 'w') as f:
        json.dump(summary, f, indent=2)
    
    with open("processed_sleep_data/st_subjects_processed_fixed.json", 'w') as f:
        json.dump(subject_data, f, indent=2)
    
    # Print summary
    print(f"\n=== FIXED PROCESSING SUMMARY ===")
    print(f"Total recordings: {len(processed_data)}")
    print(f"Records with demographics: {updated_count}")
    print(f"Conditions found: {list(summary['conditions'].keys())}")
    if summary['age_distribution']:
        print(f"Age range: {min(summary['age_distribution'])}-{max(summary['age_distribution'])} years")
    print(f"Gender distribution: {summary['gender_distribution']}")
    
    return processed_data, summary

# Run the fix
print("Testing subject parsing first...")
test_subjects = test_subject_parsing()

print("\nNow reprocessing your data with demographics...")
fixed_data, fixed_summary = reprocess_with_demographics()

Testing subject parsing...
Input data:

1 60 1 1 23:01 2 23:48
2 35 2 2 23:27 1 0:00
4 18 2 1 23:53 2 22:37
5 32 2 2 23:23 1 23:34

Found 4 lines
Processing line 0: '1 60 1 1 23:01 2 23:48'
  Split into 7 parts: ['1', '60', '1', '1', '23:01', '2', '23:48']
  Parsed: Subject 1, Age 60, Gender M
Processing line 1: '2 35 2 2 23:27 1 0:00'
  Split into 7 parts: ['2', '35', '2', '2', '23:27', '1', '0:00']
  Parsed: Subject 2, Age 35, Gender F
Processing line 2: '4 18 2 1 23:53 2 22:37'
  Split into 7 parts: ['4', '18', '2', '1', '23:53', '2', '22:37']
  Parsed: Subject 4, Age 18, Gender F
Processing line 3: '5 32 2 2 23:23 1 23:34'
  Split into 7 parts: ['5', '32', '2', '2', '23:23', '1', '23:34']
  Parsed: Subject 5, Age 32, Gender F

Created 8 subject records:
  {'subject': '01', 'age': 60, 'gender': 'M', 'night': 1, 'condition': 'placebo', 'lights_off_time': '23:01', 'study': 'ST'}
  {'subject': '01', 'age': 60, 'gender': 'M', 'night': 2, 'condition': 'temazepam', 'lights_off_time': '23: