# Data Processing

Extracts continuous glucose monitoring (CGM) and heart rate time series from each patient's JSON file and writes them out as one CSV file per patient for each data type.

In [3]:
import json
import pandas as pd
import os
from pathlib import Path

# Prep CGM Data

In [14]:
# Define paths (relative to the notebook's working directory).
# dexcom_g6_path holds one sub-folder per patient; output_dir receives one CSV per patient.
dexcom_g6_path = Path('dataset/wearable_blood_glucose/continuous_glucose_monitoring/dexcom_g6')
output_dir = Path('cgm_csv')

# Create output directory if it doesn't exist.
# parents=True keeps this cell working if output_dir is later changed to a nested path.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Source directory: {dexcom_g6_path}")
print(f"Output directory: {output_dir}")

Source directory: dataset/wearable_blood_glucose/continuous_glucose_monitoring/dexcom_g6
Output directory: cgm_csv


In [15]:
def process_cgm_json(json_file_path, patient_id):
    """
    Process a single patient's JSON file and extract CGM time series data.

    Parameters:
    - json_file_path: Path to the JSON file
    - patient_id: Patient ID (folder name). Not written into the output rows;
      kept in the signature so existing callers keep working.

    Returns:
    - DataFrame with one row per entry of body.cgm and the columns
      start_date_time, end_date_time, glucose_level_mg_dl, transmitter_time.
      The columns are always present, even when the file has no CGM records;
      fields missing from a record are left empty.
    """
    columns = [
        'start_date_time',
        'end_date_time',
        'glucose_level_mg_dl',
        'transmitter_time',
    ]

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract CGM data from the body.
    # The `or` fallbacks cover keys that are present but null, not just missing keys.
    cgm_records = (data.get('body') or {}).get('cgm') or []

    # Parse each CGM record
    time_series_data = []
    for record in cgm_records:
        time_frame = record.get('effective_time_frame') or {}
        time_interval = time_frame.get('time_interval') or {}
        blood_glucose_data = record.get('blood_glucose') or {}
        transmitter_time_data = record.get('transmitter_time') or {}

        time_series_data.append({
            'start_date_time': time_interval.get('start_date_time'),
            'end_date_time': time_interval.get('end_date_time'),
            'glucose_level_mg_dl': blood_glucose_data.get('value'),
            'transmitter_time': transmitter_time_data.get('value'),
        })

    # Passing `columns` fixes the schema so an empty record list still yields
    # a CSV with a proper header row downstream.
    return pd.DataFrame(time_series_data, columns=columns)

In [16]:
# Get all patient folders (each sub-folder name is used as the patient ID).
# Sorted once so the processing order is the same on every run.
patient_folders = sorted(f for f in dexcom_g6_path.iterdir() if f.is_dir())
print(f"Found {len(patient_folders)} patient folders")

# Process each patient
processed_count = 0
error_count = 0

for patient_folder in patient_folders:
    patient_id = patient_folder.name

    # Find the JSON file in the patient folder.
    # glob() does not guarantee an order, so sort to make "first one" deterministic.
    json_files = sorted(patient_folder.glob('*.json'))

    if not json_files:
        # A folder without any JSON file is counted as an error and skipped.
        print(f"Warning: No JSON file found for patient {patient_id}")
        error_count += 1
        continue

    if len(json_files) > 1:
        print(f"Warning: Multiple JSON files found for patient {patient_id}, using first one")

    json_file = json_files[0]

    try:
        # Process the JSON file
        df = process_cgm_json(json_file, patient_id)

        # Save to CSV, one file per patient named after the patient folder
        output_csv = output_dir / f"{patient_id}.csv"
        df.to_csv(output_csv, index=False)

        processed_count += 1

        # Report progress every 100 successfully written patients
        if processed_count % 100 == 0:
            print(f"Processed {processed_count} patients...")

    except Exception as e:
        # Best-effort batch: report the failing patient and continue with the rest.
        print(f"Error processing patient {patient_id}: {e}")
        error_count += 1

print(f"\n{'='*60}")
print("Processing complete!")
print(f"Successfully processed: {processed_count} patients")
print(f"Errors encountered: {error_count} patients")
print(f"CSV files saved to: {output_dir.absolute()}")
print(f"{'='*60}")

Found 1049 patient folders
Processed 100 patients...
Processed 100 patients...
Processed 200 patients...
Processed 200 patients...
Processed 300 patients...
Processed 300 patients...
Processed 400 patients...
Processed 400 patients...
Processed 500 patients...
Processed 500 patients...
Processed 600 patients...
Processed 600 patients...
Processed 700 patients...
Processed 700 patients...
Processed 800 patients...
Processed 800 patients...
Processed 900 patients...
Processed 900 patients...
Processed 1000 patients...
Processed 1000 patients...

Processing complete!
Successfully processed: 1049 patients
Errors encountered: 0 patients
CSV files saved to: /Users/jasonfan/Documents/Emory/coursework/data_mining/project/cgm_csv

Processing complete!
Successfully processed: 1049 patients
Errors encountered: 0 patients
CSV files saved to: /Users/jasonfan/Documents/Emory/coursework/data_mining/project/cgm_csv


# Prep Heart Rate Data

In [17]:
# Define paths for heart rate data (relative to the notebook's working directory).
# garmin_vivosmart5_path holds one sub-folder per patient; hr_output_dir receives one CSV per patient.
garmin_vivosmart5_path = Path('dataset/wearable_activity_monitor/heart_rate/garmin_vivosmart5')
hr_output_dir = Path('heart_rate_csv')

# Create output directory if it doesn't exist.
# parents=True keeps this cell working if hr_output_dir is later changed to a nested path.
hr_output_dir.mkdir(parents=True, exist_ok=True)

print(f"Source directory: {garmin_vivosmart5_path}")
print(f"Output directory: {hr_output_dir}")

Source directory: dataset/wearable_activity_monitor/heart_rate/garmin_vivosmart5
Output directory: heart_rate_csv


In [19]:
def process_heart_rate_json(json_file_path, patient_id):
    """
    Process a single patient's heart rate JSON file and extract time series data.

    Parameters:
    - json_file_path: Path to the JSON file
    - patient_id: Patient ID (folder name). Not written into the output rows;
      kept in the signature so existing callers keep working.

    Returns:
    - DataFrame with one row per entry of body.heart_rate and the columns
      date_time and heart_rate_beats_min. The columns are always present,
      even when the file has no heart rate records; fields missing from a
      record are left empty.
    """
    columns = ['date_time', 'heart_rate_beats_min']

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract heart rate data from the body.
    # The `or` fallbacks cover keys that are present but null, not just missing keys.
    hr_records = (data.get('body') or {}).get('heart_rate') or []

    # Parse each heart rate record
    time_series_data = []
    for record in hr_records:
        hr_data = record.get('heart_rate') or {}
        time_frame = record.get('effective_time_frame') or {}

        time_series_data.append({
            'date_time': time_frame.get('date_time'),
            'heart_rate_beats_min': hr_data.get('value'),
        })

    # Passing `columns` fixes the schema so an empty record list still yields
    # a CSV with a proper header row downstream.
    return pd.DataFrame(time_series_data, columns=columns)

In [20]:
# Get all patient folders for heart rate data (each sub-folder name is used as the patient ID).
# Sorted once so the processing order is the same on every run.
hr_patient_folders = sorted(f for f in garmin_vivosmart5_path.iterdir() if f.is_dir())
print(f"Found {len(hr_patient_folders)} patient folders with heart rate data")

# Process each patient
hr_processed_count = 0
hr_error_count = 0

for patient_folder in hr_patient_folders:
    patient_id = patient_folder.name

    # Find the heart rate JSON file in the patient folder.
    # glob() does not guarantee an order, so sort to make "first one" deterministic.
    json_files = sorted(patient_folder.glob('*_heartrate.json'))

    if not json_files:
        # A folder without a matching JSON file is counted as an error and skipped.
        print(f"Warning: No heart rate JSON file found for patient {patient_id}")
        hr_error_count += 1
        continue

    if len(json_files) > 1:
        print(f"Warning: Multiple JSON files found for patient {patient_id}, using first one")

    json_file = json_files[0]

    try:
        # Process the JSON file
        df = process_heart_rate_json(json_file, patient_id)

        # Save to CSV, one file per patient named after the patient folder
        output_csv = hr_output_dir / f"{patient_id}.csv"
        df.to_csv(output_csv, index=False)

        hr_processed_count += 1

        # Report progress every 100 successfully written patients
        if hr_processed_count % 100 == 0:
            print(f"Processed {hr_processed_count} patients...")

    except Exception as e:
        # Best-effort batch: report the failing patient and continue with the rest.
        print(f"Error processing patient {patient_id}: {e}")
        hr_error_count += 1

print(f"\n{'='*60}")
print("Heart Rate Processing complete!")
print(f"Successfully processed: {hr_processed_count} patients")
print(f"Errors encountered: {hr_error_count} patients")
print(f"CSV files saved to: {hr_output_dir.absolute()}")
print(f"{'='*60}")

Found 903 patient folders with heart rate data
Processed 100 patients...
Processed 100 patients...
Processed 200 patients...
Processed 200 patients...
Processed 300 patients...
Processed 300 patients...
Processed 400 patients...
Processed 400 patients...
Processed 500 patients...
Processed 500 patients...
Processed 600 patients...
Processed 600 patients...
Processed 700 patients...
Processed 700 patients...
Processed 800 patients...
Processed 800 patients...
Processed 900 patients...

Heart Rate Processing complete!
Successfully processed: 903 patients
Errors encountered: 0 patients
CSV files saved to: /Users/jasonfan/Documents/Emory/coursework/data_mining/project/heart_rate_csv
Processed 900 patients...

Heart Rate Processing complete!
Successfully processed: 903 patients
Errors encountered: 0 patients
CSV files saved to: /Users/jasonfan/Documents/Emory/coursework/data_mining/project/heart_rate_csv
