# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Marker used to locate the workspace root: everything in the current working
# directory path before the first occurrence of KEY is taken as the root.
# If KEY does not occur in the path, split() returns the whole path unchanged.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
# Switch to the workspace root so the relative paths in SPACE resolve from there
print(WORKSPACE_PATH); os.chdir(WORKSPACE_PATH)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Locations (relative to the workspace root) of the data and code folders
SPACE = {
    'DATA_RAW': f'./_Data/0-Data_Raw',
    'DATA_RFT': f'./_Data/1-Data_RFT',
    'DATA_CASE': f'./_Data/2-Data_CASE',
    'DATA_AIDATA': f'./_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': f'./code/external',
    'CODE_FN': f'./code/pipeline', 
}
# Fail early, with the offending path in the message, if the pipeline folder is missing
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

print(SPACE['CODE_FN'])
# Add the pipeline folder to the module search path
sys.path.append(SPACE['CODE_FN'])

# Step 1: OneCohort_Args



This step selects the cohort to process and loads its configuration (`OneCohort_Args`). It is foundational for the rest of the notebook, as it ensures that the correct settings and parameters are in place for processing the cohort's data.

In [None]:
# Import the mapping from cohort name to that cohort's arguments
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# List the cohort names that have a configuration entry
cohort_names = list(CohortName_to_OneCohortArgs.keys())
print("Available Cohorts:", cohort_names)

# Look up one cohort's arguments for inspection
# (a KeyError here means the name is not among the available cohorts)
# selected_cohort = 'WellDoc2023CVSDeRx'
selected_cohort ='aireadi-noimage-v2'
cohort_args = CohortName_to_OneCohortArgs[selected_cohort]
print("Selected Cohort Arguments:", cohort_args)

In [None]:
# # %%%%%%%%%%%%%%%%%%%%% user
# Choose the cohort to process: leave exactly one CohortName assignment uncommented.
# CohortName = 'WellDoc2022CGM'
# CohortName = 'WellDoc2023CVSDeRx'
# CohortName = 'WellDoc2023CVSTDC'

# CohortName = 'WellDoc2025CVS'
CohortName = 'aireadi-noimage-v2'
# CohortName = 'WellDoc2025LLY'
# # %%%%%%%%%%%%%%%%%%%%% 
# Arguments of the chosen cohort; OneCohort_Args is the variable the following cells use
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
from recfldtkn.record_base.cohort import Cohort   # Cohort is a class

# Placeholder for a cohort function; none is supplied in this notebook
cohort_fn = None
# Create the Cohort object from the cohort arguments, the SPACE paths and the (empty) cohort function
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)  # cohort is an object of class Cohort, this object  will manage dataset parameters
# Update the cohort arguments using the update_cohort_args method.
# Inputs are OneCohort_Args and SPACE; the returned value replaces OneCohort_Args.
OneCohort_Args = cohort.update_cohort_args(OneCohort_Args, SPACE) # update cohort args
# Pretty-print the updated cohort arguments, keeping their original key order
pprint(OneCohort_Args, sort_dicts=False)

# Step 2: Get Source Files
The purpose of this code segment is to retrieve all files with the specified suffixes (here `json`, `tsv`, `csv`, `dat`, and `hea`) from a folder and list their paths. The suffix list is specified by the user; the folder path is read from `OneCohort_Args['SourcePath']`.

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
# File suffixes to collect from the source folder
SourceFile_SuffixList = ['json', 'tsv', 'csv', 'dat', 'hea'] 
# %%%%%%%%%%%%%%%%%%%%% user

# Folder to search, taken from the cohort arguments
Folder = OneCohort_Args['SourcePath'] 

# Ask the cohort object for the source files in Folder that match the suffix list
SourceFile_List = cohort.get_SourceFile_List(Folder, SourceFile_SuffixList)

# Display the list of source files
SourceFile_List

# Step 3: Get RawName from SourceFile

In [None]:
import inspect

# %%%%%%%%%%%%%%%%%%%%% user
def get_RawName_from_SourceFile(file_path, OneCohort_Args):
    """
    Extract the 'raw name' from a source file path.

    Only the file name is inspected; directory components are ignored so that
    underscores or dots in folder names (e.g. '0-Data_Raw') cannot leak into
    the result. Within the file name, the raw name is the text after the last
    underscore, cut at the first dot that follows it.

    Examples:
        '/data/1001/1001_DEX.json'                -> 'DEX'
        './_Data/0-Data_Raw/c/participants.tsv'   -> 'participants'

    Args:
        file_path (str): Path of the source file.
        OneCohort_Args: Currently unused. Reserved for future functionality.

    Returns:
        str: The extracted raw name.

    """
    # Imported locally so the function does not depend on notebook-level imports
    import os

    file_name = os.path.basename(file_path)
    RawName = file_name.split('_')[-1].split('.')[0]
    return RawName

get_RawName_from_SourceFile.fn_string = inspect.getsource(get_RawName_from_SourceFile)
# %%%%%%%%%%%%%%%%%%%%% 

file_path = SourceFile_List[0]
print(type(file_path))
RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
print(file_path)
print(RawName)

In [None]:
SourceFile_List

# Step 4: Process Source to Raw

## participants & clinical data

In [None]:
from datetime import datetime 
import pytz
# Source files whose path contains 'participants'; the first match is used.
# (Despite the 'xml' in the variable names, the file read below is a TSV.)
participants_xml_file_list  = [i for i in SourceFile_List if 'participants' in i]
participants_xml_file = participants_xml_file_list[0]
print(participants_xml_file)

# Swap a '.json' in the matched path for '.tsv'; a path without '.json' is left unchanged
participants_tsv_file = participants_xml_file.replace('.json', '.tsv')

df = pd.read_csv(participants_tsv_file, sep='\t')
# Year of birth estimated as the year of the study visit minus the recorded age
df['YearOfBirth'] = pd.to_datetime(df['study_visit_date']).dt.year - df['age']
# df
# NOTE: this is not the last expression of the cell, so its result is not displayed
df['study_group'].value_counts()


# Timezone assigned to each clinical site.
# Sites missing from this mapping get NaN in 'UserTimeZone'.
clinical_site_to_timezone = {
    'UAB': 'America/Chicago',
    'UW': 'America/Los_Angeles',
    'UCSD': 'America/Los_Angeles',
}

df['UserTimeZone'] = df['clinical_site'].map(clinical_site_to_timezone)
# df['UserTimeZoneOffset'] = df['UserTimeZone'].apply(lambda tz: int(datetime.now(pytz.timezone(tz)).utcoffset().total_seconds() / 3600))
# Numeric disease code per study group; both medication-controlled groups map to 2.
# Study groups missing from this mapping get NaN.
df['DiseaseType'] = df['study_group'].map({
    'healthy': 0, 
    'pre_diabetes_lifestyle_controlled': 0.5, 
    'oral_medication_and_or_non_insulin_injectable_medication_controlled': 2, 
    'insulin_dependent': 2})

# The study group label is reused as the segment ID
df['MRSegmentID'] = df['study_group']

RawName = 'Patient'
# Target column layout of the Patient raw table
raw_columns = ['PatientID', 
               'MRSegmentID', 'MRSegmentModifiedDateTime', 'DiseaseType',
                             'Gender', 'ActivationDate', 'UserTimeZoneOffset', 'UserTimeZone',
                             'Description', 'YearOfBirth']

df = df.rename(columns = {'participant_id': 'PatientID'})
# reindex keeps only the target columns; target columns not present in df are created as NaN
df = df.reindex(columns = raw_columns)

# df_user is the same object as df (no copy), so the assignment below also changes df
df_user = df# [['PatientID', 'UserTimeZone', 'UserTimeZoneOffset']]

# Prefix the IDs with 'AIREADI-'; the later merges on 'PatientID' rely on this format
df_user['PatientID'] = 'AIREADI-' + df_user['PatientID'].astype(str)
df_user.head()

print(df_user.shape)


In [None]:
df['UserTimeZone'].value_counts(dropna = False)

In [None]:
import pandas as pd 

pd.set_option('display.max_columns', None)

measurement_file = [i for i in SourceFile_List if 'measurement' in i][0]
# measurement_file

df_measurement = pd.read_csv(measurement_file)
# df_measurement

measurement_type_list = list(df_measurement['measurement_source_value'].value_counts().index)
# pprint(measurement_type_list)

df_measurement.head()

In [None]:
person_file = [i for i in SourceFile_List if 'person' in i][0]
# person_file

df_person = pd.read_csv(person_file)
df_person.head()

In [None]:
observation_file = [i for i in SourceFile_List if 'observation' in i][0]
observation_file
df_observation = pd.read_csv(observation_file)
print(df_observation.shape)
df_observation.head()
# data

## wearable_blood_glucose - cgm data

In [None]:
cgm_json_list = [i for i in SourceFile_List if 'wearable_blood_glucose' in i and  'json' in i]

cgm_json_path = cgm_json_list[0]
print(cgm_json_path)


import json 

with open(cgm_json_path, 'r') as f:
    data = json.load(f)

data


In [None]:
# Process all CGM files from the cgm_json_list and combine into one DataFrame
import os
from tqdm.notebook import tqdm


def convert_cgm_json_to_df(json_path):
    """
    Convert one CGM JSON file to a pandas DataFrame (one row per reading).

    The patient ID is read from data['header']['patient_id'], converted to a
    string (so numeric IDs are accepted) and prefixed with 'AIREADI-' when the
    prefix is not already present.

    For every entry of data['body']['cgm'] the row contains:
        - patient_id
        - timestamp: effective_time_frame.time_interval.start_date_time, when present
        - glucose_value / glucose_unit: blood_glucose.value / .unit, when present
        - every other key of the reading; dict values are flattened one level
          into '<key>_<sub_key>' columns

    Args:
        json_path (str): Path to the CGM JSON file

    Returns:
        pd.DataFrame: One row per reading. 'timestamp' (if any reading provided
        one) is parsed with pd.to_datetime(format='mixed'). A file with no
        readings yields an empty DataFrame without columns.

    Raises:
        KeyError: if 'header'/'patient_id' or 'body'/'cgm' is missing, or a
            'blood_glucose' entry lacks 'value' or 'unit'.
    """
    # Imported locally so the function does not depend on notebook-level imports
    import pandas as pd
    import json

    # JSON interchange is UTF-8; do not depend on the platform default encoding
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # str() guards against numeric IDs, for which the substring test would raise TypeError
    patient_id = str(data['header']['patient_id'])
    if 'AIREADI-' not in patient_id:
        patient_id = 'AIREADI-' + patient_id

    records = []

    for reading in data['body']['cgm']:
        record = {'patient_id': patient_id}

        # Timestamp: start of the reading's time interval, if provided
        if 'effective_time_frame' in reading and 'time_interval' in reading['effective_time_frame']:
            time_interval = reading['effective_time_frame']['time_interval']
            if 'start_date_time' in time_interval:
                record['timestamp'] = time_interval['start_date_time']

        # Glucose value and unit
        if 'blood_glucose' in reading:
            record['glucose_value'] = reading['blood_glucose']['value']
            record['glucose_unit'] = reading['blood_glucose']['unit']

        # Remaining fields are kept as-is; nested dicts are flattened one level
        for key, value in reading.items():
            if key not in ['effective_time_frame', 'blood_glucose']:
                if isinstance(value, dict):
                    for sub_key, sub_value in value.items():
                        record[f"{key}_{sub_key}"] = sub_value
                else:
                    record[key] = value

        records.append(record)

    df = pd.DataFrame(records)

    # format='mixed' lets each timestamp string be parsed with its own format
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')

    return df

# Store the function's source text on the function object
convert_cgm_json_to_df.fn_string = inspect.getsource(convert_cgm_json_to_df)

# Smoke-test the converter on the first CGM JSON file
cgm_df = convert_cgm_json_to_df(cgm_json_path)


# print(f"Found {len(cgm_json_list)} CGM JSON files")

# Convert every CGM file and collect the per-file DataFrames
all_cgm_data = []

for file_path in tqdm(cgm_json_list, desc="Processing CGM files"):
    try:
        # Fallback patient ID: name of the directory that contains the file
        # (note: unlike the converter's IDs, it carries no 'AIREADI-' prefix)
        patient_id = os.path.basename(os.path.dirname(file_path))
        
        # Convert the JSON file to DataFrame
        df = convert_cgm_json_to_df(file_path)
        
        # Use the fallback only when the converter produced no 'patient_id' column
        if 'patient_id' not in df.columns:
            df['patient_id'] = patient_id
            
        all_cgm_data.append(df)
    except Exception as e:
        # A file that fails to convert is reported and skipped
        print(f"Error processing {file_path}: {e}")

# NOTE: pd.concat raises ValueError when all_cgm_data is empty (no file converted)
final_cgm_df = pd.concat(all_cgm_data, ignore_index=True)

final_cgm_df.shape 


In [None]:
final_cgm_df['patient_id'].value_counts().sort_index().describe()

In [None]:
df_user['UserTimeZone'].value_counts(dropna = False)

In [None]:
RawName = 'ElogBGEntry'
# Target column layout of the ElogBGEntry raw table
raw_columns = ['BGEntryID', 'PatientID', 'ObservationDateTime', 'BGValue',
                'IsNormalIndicator', 'ObservationEntryDateTime', 'TimezoneOffset',
                'Timezone', 'EntryCreatedDateTime', 'ActualBGValue',
                'ExternalSourceID', 'UserObservationDateTime']

df = final_cgm_df
# Map the CGM column names onto the target names
df = df.rename(columns = {'patient_id': 'PatientID', 'glucose_value': 'BGValue', 'timestamp': 'ObservationDateTime'})
# Keep only the target columns; target columns with no source column are created as NaN
df = df.reindex(columns = raw_columns)
# Attach each patient's timezone; patients not found in df_user get NaN in 'UserTimeZone'
df = pd.merge(df, df_user[['PatientID', 'UserTimeZone']], on = 'PatientID', how = 'left')
# Entry ID taken from the merged frame's index
df['BGEntryID'] = df.index

df['ObservationDateTime']

In [None]:
dt = df['ObservationDateTime'].iloc[0]
dt

In [None]:
timezone_name = df_user['UserTimeZone'].iloc[0] # .value_counts(dropna = False)

In [None]:
def get_timezone_offset_minutes_corrected(dt, timezone_name):
    """
    Get the UTC offset, in minutes, of a timezone at a specific instant.

    The offset is evaluated at `dt`, so daylight-saving time is taken into
    account (e.g. 'America/Los_Angeles' gives -480 in January and -420 in July).

    Args:
        dt: datetime-like object. A naive value is interpreted as UTC; a
            timezone-aware value is used as the instant it represents.
        timezone_name: IANA timezone name such as 'America/Los_Angeles'.

    Returns:
        int: offset in minutes (negative for timezones west of UTC).
        float('nan') when `dt` is missing (None / NaT) or `timezone_name` is
        missing (None, NaN, empty or otherwise not a string), e.g. for rows
        that found no timezone in a left merge.

    Raises:
        pytz.UnknownTimeZoneError: if `timezone_name` is a string that is not a
            known timezone.
    """
    # Missing timestamp: None, or NaT (the only datetime-like value unequal to itself)
    if dt is None or dt != dt:
        return float('nan')
    # Missing timezone: anything that is not a non-empty string (None, NaN, '')
    if not isinstance(timezone_name, str) or not timezone_name:
        return float('nan')

    # Imported locally so the function does not depend on notebook-level imports
    import pytz

    # Naive timestamps are taken to be UTC; aware ones already identify an instant
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=pytz.UTC)

    # View the instant in the target timezone and read its UTC offset
    localized_dt = dt.astimezone(pytz.timezone(timezone_name))
    offset_seconds = localized_dt.utcoffset().total_seconds()
    offset_minutes = int(offset_seconds / 60)

    return offset_minutes

get_timezone_offset_minutes_corrected.fn_string = inspect.getsource(get_timezone_offset_minutes_corrected)

In [None]:
offset_minutes = get_timezone_offset_minutes_corrected(dt, timezone_name)
offset_minutes

In [None]:
# Per-row UTC offset (minutes) of the patient's timezone at the observation time
df['TimezoneOffset'] = df.apply(lambda row: get_timezone_offset_minutes_corrected(
                                                    row['ObservationDateTime'], 
                                                    row['UserTimeZone']), 
                                                    axis = 1)
# Constant source identifier written to every row
# NOTE(review): the meaning of 18 is not defined in this notebook — confirm against the pipeline
df['ExternalSourceID'] = 18
# df['EntryCreatedDateTime'] = pd.to_datetime(df['ObservationDateTime'])

df

In [None]:
# df['ObservationDateTime'].apply()

df['ObservationDateTime'] = df['ObservationDateTime'].dt.tz_localize(None)

In [None]:
# df['BGValue']

print(df.shape)
df['BGValue'] = pd.to_numeric(df['BGValue'], errors='coerce')
df = df[df['BGValue'].notna()].reset_index(drop = True)
print(df.shape)



## wearable activity - heart rate

In [None]:
heart_rate_file = [i for i in SourceFile_List if 'heart_rate' in i][0]
heart_rate_file

In [None]:
import json
with open(heart_rate_file, 'r') as f:
    data = json.load(f)
# data

In [None]:
def convert_heart_rate_json_to_df(json_data):
    """
    Flatten a heart rate JSON document into a time-ordered DataFrame.

    Parameters:
    -----------
    json_data : dict
        Parsed JSON with the patient identifier under ['header']['uuid'] and
        the measurements under ['body']['heart_rate'].

    Returns:
    --------
    pandas.DataFrame
        One row per measurement, sorted by timestamp, with columns:
        - patient_id: value of the header's 'uuid' field
        - heart_rate: measurement's heart_rate.value
        - unit: measurement's heart_rate.unit
        - timestamp: effective_time_frame.date_time parsed with pd.to_datetime
    """
    # Every row carries the same ID, read once from the header
    pid = json_data['header']['uuid']

    rows = [
        {
            'patient_id': pid,
            'heart_rate': item['heart_rate']['value'],
            'unit': item['heart_rate']['unit'],
            'timestamp': item['effective_time_frame']['date_time'],
        }
        for item in json_data['body']['heart_rate']
    ]

    frame = pd.DataFrame(rows)
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    # Rows keep their original index labels; only their order changes
    return frame.sort_values('timestamp')

# Convert the heart rate data to DataFrame
df_heart_rate = convert_heart_rate_json_to_df(data)

# Display information about the DataFrame
print(f"Heart rate data for patient {df_heart_rate['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_heart_rate)}")
print(f"Date range: {df_heart_rate['timestamp'].min()} to {df_heart_rate['timestamp'].max()}")
print("\nSample data:")
df_heart_rate.head(15)

In [None]:
# Store the function's source text on the function object
convert_heart_rate_json_to_df.fn_string = inspect.getsource(convert_heart_rate_json_to_df)
# All source files whose path contains 'heart_rate' (no suffix filter is applied here)
hr_json_list = [i for i in SourceFile_List if 'heart_rate' in i]# [:10]
# Smoke-test the converter on the heart rate JSON already loaded in `data`
hr_df = convert_heart_rate_json_to_df(data)

# Convert every heart rate file and collect the per-file DataFrames
all_hr_data = []

print('The number of heart rate files: ', len(hr_json_list))

for file_path in tqdm(hr_json_list, desc="Processing HR files"):
    try:
        # Fallback patient ID: name of the directory that contains the file
        patient_id = os.path.basename(os.path.dirname(file_path))
        with open(file_path, 'r') as f:
            data = json.load(f)

        
        # Convert the parsed JSON to a DataFrame
        df = convert_heart_rate_json_to_df(data)
        
        # Use the fallback only when the converter produced no 'patient_id' column
        if 'patient_id' not in df.columns:
            df['patient_id'] = patient_id
            
        all_hr_data.append(df)
    except Exception as e:
        # A file that fails to load or convert is reported and skipped
        print(f"Error processing {file_path}: {e}")

# NOTE: pd.concat raises ValueError when all_hr_data is empty (no file converted)
final_hr_df = pd.concat(all_hr_data, ignore_index=True)

final_hr_df.shape 

In [None]:
######### Alina: Rename Columns
df = final_hr_df

RawName = 'HeartRate'
# NOTE(review): the 'originalname' below looks copied from the CGM section;
# the files used for this table are the ones whose path contains 'heart_rate'.
#originalname: 'wearable_blood_glucose'
# Target column layout of the HeartRate raw table
raw_columns =['HREntryID', 'PatientID', 'ObservationDateTime', 'HRValue','HRUnit',
              'ObservationEntryDateTime', 'TimezoneOffset',
              'Timezone', 'EntryCreatedDateTime'
              ]
df = df.rename(columns = {'patient_id': 'PatientID', 
                          'timestamp': 'ObservationDateTime',
                          'heart_rate': 'HRValue',
                          'unit':'HRUnit',
                          })
#TODO: What is transmitter_time_unit, source_device_id VS transmitter_id
# Keep only the target columns; target columns with no source column are created as NaN
df = df.reindex(columns = raw_columns)
df['HREntryID'] = df.index
# Attach each patient's timezone; patients not found in df_user get NaN in 'UserTimeZone'
df = pd.merge(df, df_user[['PatientID', 'UserTimeZone']], on = 'PatientID', how = 'left') 
# Per-row UTC offset (minutes) of the patient's timezone at the observation time
df['TimezoneOffset'] = df.apply(lambda row: get_timezone_offset_minutes_corrected(
                                                    row['ObservationDateTime'], 
                                                    row['UserTimeZone']), 
                                                    axis = 1)
# Drop the timezone information from the timestamps (the wall-clock values are kept as they are)
df['ObservationDateTime'] = df['ObservationDateTime'].dt.tz_localize(None)
# The creation time is set equal to the observation time
df['EntryCreatedDateTime'] = pd.to_datetime(df['ObservationDateTime'])
df.head()

In [None]:
df['PatientID'].value_counts()

In [None]:

# Convert TimezoneOffset to numeric to ensure proper calculation
df['TimezoneOffset'] = pd.to_numeric(df['TimezoneOffset'])
# Create local datetime by adding the timezone offset
df['DT_local'] = df['ObservationDateTime'] + pd.to_timedelta(df['TimezoneOffset'], unit='minutes')
df

In [None]:
df['PatientID'].value_counts()

In [None]:
# Patient whose heart rate series is plotted
pid_value = 'AIREADI-4193'

dfx = df[df['PatientID'] == pid_value]

# Plot heart rate values over local time for the patient selected by pid_value
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

plt.figure(figsize=(20, 6))
plt.plot(dfx['DT_local'], dfx['HRValue'], '-o', markersize=2)
plt.title(f'Heart Rate Over Time for Patient {pid_value}')
plt.xlabel('Date Time (Local)')
plt.ylabel('Heart Rate (beats/min)')
plt.grid(True)

# Format x-axis to show dates nicely
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
plt.gcf().autofmt_xdate()  # Rotate date labels

plt.tight_layout()
plt.show()

    

In [None]:
len(df)

## wearable activity - oxygen saturation

In [None]:
oxygen_saturation_file_list = [i for i in SourceFile_List if 'oxygen_saturation' in i and 'json' in i]
oxygen_saturation_file = oxygen_saturation_file_list[0]
oxygen_saturation_file

with open(oxygen_saturation_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_oxygen_saturation_json_to_df(data):
    """
    Flatten an oxygen saturation JSON document into a time-ordered DataFrame.

    Args:
        data (dict): Parsed JSON with the patient identifier under
            ['header']['uuid'] and the entries under ['body']['breathing'].

    Returns:
        pd.DataFrame: One row per entry, sorted by timestamp, with columns
        patient_id, oxygen_saturation, unit, timestamp (parsed with
        pd.to_datetime) and measurement_method ('' when the entry has none).
    """
    pid = data['header']['uuid']

    def _to_row(item):
        # Flatten one 'breathing' entry into a single-level dict
        saturation = item['oxygen_saturation']
        return {
            'patient_id': pid,
            'oxygen_saturation': saturation['value'],
            'unit': saturation['unit'],
            'timestamp': item['effective_time_frame']['date_time'],
            'measurement_method': item.get('measurement_method', ''),
        }

    frame = pd.DataFrame([_to_row(item) for item in data['body']['breathing']])
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    # Rows keep their original index labels; only their order changes
    return frame.sort_values('timestamp')

# Convert the oxygen saturation data to DataFrame
df_oxygen = convert_oxygen_saturation_json_to_df(data)

# Display information about the DataFrame
print(f"Oxygen saturation data for patient {df_oxygen['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_oxygen)}")
print(f"Date range: {df_oxygen['timestamp'].min()} to {df_oxygen['timestamp'].max()}")
print("\nSample data:")
df_oxygen.head()


In [None]:
df_oxygen

In [None]:
# Find all oxygen saturation files
oxygen_files = [i for i in SourceFile_List if 'oxygen_saturation' in i and 'json' in i]
print(f"Found {len(oxygen_files)} oxygen saturation files")

# Per-file DataFrames, collected here and concatenated once after the loop
all_oxygen_data = []

# Process each oxygen saturation file
for file_path in tqdm(oxygen_files, desc="Processing oxygen saturation files"):
    try:
        # Read the JSON file
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Convert to DataFrame
        df = convert_oxygen_saturation_json_to_df(data)

        all_oxygen_data.append(df)
    except Exception as e:
        # A file that fails to load or convert is reported and skipped
        print(f"Error processing file {file_path}: {e}")

# Stop with an explicit error when nothing was loaded: the code that follows
# needs df_oxygen_all and would otherwise fail with an unrelated NameError.
if not all_oxygen_data:
    raise ValueError("No oxygen saturation data found")

# Combine all DataFrames
df_oxygen_all = pd.concat(all_oxygen_data, ignore_index=True)

# Convert timestamp to datetime if not already
df_oxygen_all['timestamp'] = pd.to_datetime(df_oxygen_all['timestamp'])

# Sort by patient_id and timestamp
df_oxygen_all = df_oxygen_all.sort_values(['patient_id', 'timestamp'])

# Display information about the combined DataFrame
print(df_oxygen_all.shape)

# Show the distribution of records by patient
print(df_oxygen_all['patient_id'].value_counts().head(10))


# Create a new DataFrame with standardized column names for oxygen saturation data
# (a copy, so df_oxygen_all itself is left unchanged)
df_oxygen_all_processed = df_oxygen_all.copy()

# Rename columns to match standardized format
df_oxygen_all_processed = df_oxygen_all_processed.rename(columns={
    'patient_id': 'PatientID',
    'oxygen_saturation': 'OxygenValue',
    'unit': 'OxygenUnit',
    'timestamp': 'ObservationDateTime',
    'measurement_method': 'MeasurementMethod'
})

# Add additional columns.
# 'TimezoneOffset' and 'EntryCreatedDateTime' are placeholders here; both are overwritten below.
df_oxygen_all_processed['OxygenEntryID'] = range(len(df_oxygen_all_processed))
df_oxygen_all_processed['ObservationEntryDateTime'] = None
df_oxygen_all_processed['TimezoneOffset'] = None
df_oxygen_all_processed['Timezone'] = None
df_oxygen_all_processed['EntryCreatedDateTime'] = df_oxygen_all_processed['ObservationDateTime']

# Attach each patient's timezone from df_user.
# Patients not found in df_user get NaN in 'UserTimeZone'.
df_oxygen_all_processed = pd.merge(df_oxygen_all_processed, df_user[['PatientID', 'UserTimeZone']], 
                                  on='PatientID', how='left')

# Per-row UTC offset (minutes) of the patient's timezone at the observation time
df_oxygen_all_processed['TimezoneOffset'] = df_oxygen_all_processed.apply(
    lambda row: get_timezone_offset_minutes_corrected(row['ObservationDateTime'], row['UserTimeZone']), 
    axis=1
)

# Drop the timezone information from the timestamps, then set the creation time equal to the observation time
df_oxygen_all_processed['ObservationDateTime'] = df_oxygen_all_processed['ObservationDateTime'].dt.tz_localize(None)
df_oxygen_all_processed['EntryCreatedDateTime'] = pd.to_datetime(df_oxygen_all_processed['ObservationDateTime'])
# Display the processed DataFrame

In [None]:
df_oxygen_all_processed

In [None]:
# df = df_oxygen_all_processed

# df[df['PatientID'] == 'AIREADI-1080']

In [None]:
# df

In [None]:

# # Convert TimezoneOffset to numeric to ensure proper calculation
# df['TimezoneOffset'] = pd.to_numeric(df['TimezoneOffset'])
# # Create local datetime by adding the timezone offset
# df['DT_local'] = df['ObservationDateTime'] + pd.to_timedelta(df['TimezoneOffset'], unit='minutes')
# df

In [None]:
# dfx = df[df['PatientID'] == 'AIREADI-1080']
# dfx

In [None]:
# Plot oxygen saturation values over local time for one patient
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Build the plotted frame here, from the processed oxygen table, so this cell
# does not depend on a `dfx` left over from another section of the notebook.
oxygen_pid = 'AIREADI-1080'
dfx = df_oxygen_all_processed[df_oxygen_all_processed['PatientID'] == oxygen_pid].copy()
# Local time = stored observation time + the per-row timezone offset (minutes)
dfx['DT_local'] = dfx['ObservationDateTime'] + pd.to_timedelta(
    pd.to_numeric(dfx['TimezoneOffset']), unit='minutes')

plt.figure(figsize=(20, 6))
plt.plot(dfx['DT_local'], dfx['OxygenValue'], 'o-', color='blue', alpha=0.7)
plt.title(f'Oxygen Saturation Values for Patient {oxygen_pid}')
plt.xlabel('Local Date/Time')
plt.ylabel('Oxygen Saturation (%)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.ylim(85, 105)  # Typical range for oxygen saturation

# Format the x-axis to show dates nicely
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=2))
plt.gcf().autofmt_xdate()  # Rotate date labels

# Add a horizontal line at 95% (normal oxygen level threshold)
plt.axhline(y=95, color='r', linestyle='--', alpha=0.5, label='Normal threshold (95%)')

plt.legend()
plt.tight_layout()
plt.show()


In [None]:
df_oxygen

## wearable activity - physical activity

In [None]:
activity_file_list = [i for i in SourceFile_List if 'activity.json' in i and 'json' in i]
activity_file = activity_file_list[0]
activity_file


with open(activity_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_activity_json_to_df(data):
    """
    Flatten an activity JSON document into a DataFrame ordered by start time.

    Args:
        data (dict): Parsed JSON with the patient identifier under
            ['header']['uuid'] and the entries under ['body']['activity'].

    Returns:
        pd.DataFrame: One row per activity entry, sorted by start_time, with
        columns patient_id, activity_name, steps, unit, start_time, end_time
        (both parsed with pd.to_datetime) and duration_minutes
        (end_time - start_time, in minutes).
    """
    pid = data['header']['uuid']

    rows = []
    for item in data['body']['activity']:
        name = item['activity_name']
        quantity = item['base_movement_quantity']
        step_value = quantity['value']
        step_unit = quantity['unit']
        interval = item['effective_time_frame']['time_interval']
        rows.append({
            'patient_id': pid,
            'activity_name': name,
            'steps': step_value,
            'unit': step_unit,
            'start_time': interval['start_date_time'],
            'end_time': interval['end_date_time'],
        })

    frame = pd.DataFrame(rows)

    # Parse both interval bounds before computing the duration
    for column in ('start_time', 'end_time'):
        frame[column] = pd.to_datetime(frame[column])

    elapsed = frame['end_time'] - frame['start_time']
    frame['duration_minutes'] = elapsed.dt.total_seconds() / 60

    # Rows keep their original index labels; only their order changes
    return frame.sort_values('start_time')

# Convert the activity data to DataFrame
df_activity = convert_activity_json_to_df(data)

# Display information about the DataFrame
print(f"Activity data for patient {df_activity['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_activity)}")
print(f"Date range: {df_activity['start_time'].min()} to {df_activity['end_time'].max()}")
print(f"Activity types: {df_activity['activity_name'].unique()}")
print("\nSample data:")
df_activity.head()


In [None]:
# Count the occurrences of each activity type in the single-file sample
activity_counts = df_activity['activity_name'].value_counts()
print(activity_counts)

# Find all activity files
# activity_files = [i for i in SourceFile_List if 'activity' in i and 'json' in i]
# Matching on 'activity.json' already implies a JSON file, so no extra suffix test is needed
activity_files = [i for i in SourceFile_List if 'activity.json' in i]

print(f"Found {len(activity_files)} activity files")

# Per-file DataFrames, collected here and concatenated once after the loop
all_activity_data = []

# Process each activity file
for file_path in tqdm(activity_files, desc="Processing activity files"):
    try:
        # Read the JSON file
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Convert to DataFrame
        df = convert_activity_json_to_df(data)

        all_activity_data.append(df)
    except Exception as e:
        # A file that fails to load or convert is reported and skipped
        print(f"Error processing file {file_path}: {e}")

# Stop with an explicit error when nothing was loaded: the code that follows
# needs df_activity_all and would otherwise fail with an unrelated NameError.
if not all_activity_data:
    raise ValueError("No activity data found")

# Combine all DataFrames
df_activity_all = pd.concat(all_activity_data, ignore_index=True)

# Convert timestamps to datetime if not already
df_activity_all['start_time'] = pd.to_datetime(df_activity_all['start_time'])
df_activity_all['end_time'] = pd.to_datetime(df_activity_all['end_time'])

# Sort by patient_id and start_time
df_activity_all = df_activity_all.sort_values(['patient_id', 'start_time'])

# Display information about the combined DataFrame
print(df_activity_all.shape)

# Show the distribution of records by patient
print(df_activity_all['patient_id'].value_counts().head(10))

# Show the distribution of activity types
print(df_activity_all['activity_name'].value_counts())

# Create a new DataFrame with standardized column names for activity data
# (a copy, so df_activity_all itself is left unchanged)
df_activity_all_processed = df_activity_all.copy()

# Rename columns to match standardized format
df_activity_all_processed = df_activity_all_processed.rename(columns={
    'patient_id': 'PatientID',
    'activity_name': 'ActivityName',
    'steps': 'StepCount',
    'unit': 'StepUnit',
    'start_time': 'StartDateTime',
    'end_time': 'EndDateTime',
    'duration_minutes': 'DurationMinutes'
})

# Add additional columns ('EntryCreatedDateTime' is a placeholder here and is overwritten below)
df_activity_all_processed['ActivityEntryID'] = range(len(df_activity_all_processed))
df_activity_all_processed['EntryCreatedDateTime'] = df_activity_all_processed['StartDateTime']

# Merge with the user timezone dataframe to get the timezone for each patient.
# Patients not found in df_user get NaN in 'UserTimeZone'.
df_activity_all_processed = pd.merge(df_activity_all_processed, df_user[['PatientID', 'UserTimeZone']], 
                                    on='PatientID', how='left')

# Per-row UTC offset (minutes) of the patient's timezone, evaluated at the activity's start time
df_activity_all_processed['TimezoneOffset'] = df_activity_all_processed.apply(
    lambda row: get_timezone_offset_minutes_corrected(row['StartDateTime'], row['UserTimeZone']), 
    axis=1
)

# Convert datetime columns to timezone-naive; the creation time is set equal to the start time
df_activity_all_processed['StartDateTime'] = df_activity_all_processed['StartDateTime'].dt.tz_localize(None)
df_activity_all_processed['EndDateTime'] = df_activity_all_processed['EndDateTime'].dt.tz_localize(None)
df_activity_all_processed['EntryCreatedDateTime'] = pd.to_datetime(df_activity_all_processed['StartDateTime'])

# Display the processed DataFrame
df_activity_all_processed.head()


In [None]:
# df_activity[df_activity['activity_name'] == 'sedentary']



# Add local start/end times to the processed activity table.

# df is the same object as df_activity_all_processed (no copy),
# so the columns added below also appear on df_activity_all_processed.
df = df_activity_all_processed
# The activity's start time serves as its observation time
df['ObservationDateTime'] = pd.to_datetime(df['StartDateTime'])
# Convert TimezoneOffset to numeric to ensure proper calculation
df['TimezoneOffset'] = pd.to_numeric(df['TimezoneOffset'])
# Create local datetimes by adding the timezone offset (minutes); the offset
# computed at the start time is applied to both the start and the end
df['DT_local_start'] = df['ObservationDateTime'] + pd.to_timedelta(df['TimezoneOffset'], unit='minutes')
df['DT_local_end'] = df['EndDateTime'] + pd.to_timedelta(df['TimezoneOffset'], unit='minutes')

# Rows of a single patient, for inspection and plotting
dfx = df[df['PatientID'] == 'AIREADI-1080']
dfx


In [None]:
# Plot activity data for patient AIREADI-1080 (random full day)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import timedelta
import numpy as np
import pandas as pd
import random

# One wide canvas for the single-day activity timeline
plt.figure(figsize=(15, 8))

# Every local calendar date on which this patient has at least one activity record
available_dates = dfx['DT_local_start'].dt.date.unique()

if len(available_dates) == 0:
    print("No data available for plotting")
else:
    # Draw one day at random and keep the intervals that start on it
    random_day = random.choice(available_dates)
    starts_on_day = dfx['DT_local_start'].dt.date == random_day
    dfx_random_day = dfx[starts_on_day]

    # Give each activity type seen on that day its own tab10 colour
    activity_types = dfx_random_day['ActivityName'].unique()
    colors = plt.cm.tab10(np.linspace(0, 1, len(activity_types)))
    color_map = dict(zip(activity_types, colors))

    # Proxy artists for the legend, one per activity type actually drawn
    legend_handles = []

    for _, row in dfx_random_day.iterrows():
        activity = row['ActivityName']
        start, end = row['DT_local_start'], row['DT_local_end']

        # Zero-length intervals are not drawn
        if start != end:
            color = color_map[activity]

            # Horizontal bar on the activity's own row, spanning start..end
            plt.hlines(y=activity, xmin=start, xmax=end,
                       colors=color, linewidth=4, alpha=0.7)

            # Register the activity in the legend the first time it is drawn
            if activity not in [h.get_label() for h in legend_handles]:
                legend_handles.append(plt.Line2D([0], [0], color=color, lw=4, label=activity))

    # Hour:minute ticks every two hours suit a single-day axis
    axis = plt.gca()
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
    axis.xaxis.set_major_locator(mdates.HourLocator(interval=2))

    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Activity Type', fontsize=12)
    plt.title(f'Activity Timeline for Patient AIREADI-1080 on {random_day}', fontsize=14)

    plt.legend(handles=legend_handles, loc='upper right')

    # Slanted tick labels avoid overlap
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


## wearable activity - physical activity calorie

In [None]:
# Source files whose path contains 'calorie.json'; the first one is loaded as a sample
calorie_file_list = [path for path in SourceFile_List if 'calorie.json' in path]
calorie_file = calorie_file_list[0]

with open(calorie_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_calorie_json_to_df(data):
    """
    Convert calorie JSON data to a pandas DataFrame.

    Only entries of ``data['body']['activity']`` whose ``activity_name`` is
    ``'kcal_burned'`` are kept; every other entry is ignored.

    Parameters:
    -----------
    data : dict
        The JSON data containing calorie information. Reads
        ``data['header']['user_id']`` and ``data['body']['activity']``.

    Returns:
    --------
    pandas.DataFrame
        One row per kept entry with columns ``patient_id``, ``activity_name``,
        ``calories``, ``unit`` and ``timestamp`` (converted with
        ``pd.to_datetime``). When no entry matches, an empty frame with the
        same columns is returned instead of raising ``KeyError``.
    """
    columns = ['patient_id', 'activity_name', 'calories', 'unit', 'timestamp']

    # Extract patient ID from the header
    patient_id = data['header']['user_id']

    # The filter runs before the record is built, so non-calorie entries are
    # never indexed for keys they may not have
    records = [
        {
            'patient_id': patient_id,
            'activity_name': activity['activity_name'],
            'calories': activity['calories_value']['value'],
            'unit': activity['calories_value']['unit'],
            'timestamp': activity['effective_time_frame']['date_time'],
        }
        for activity in data['body']['activity']
        if activity['activity_name'] == 'kcal_burned'
    ]

    # Passing `columns` guarantees the schema even when `records` is empty
    df = pd.DataFrame(records, columns=columns)

    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df

# Parse the sample calorie file loaded above
df_calorie = convert_calorie_json_to_df(data)

# Headline facts about the frame, followed by a preview of its first rows
calorie_ts = df_calorie['timestamp']
calorie_vals = df_calorie['calories']
print(f"Calorie data for patient {df_calorie['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_calorie)}")
print(f"Date range: {calorie_ts.min()} to {calorie_ts.max()}")
print(f"Calorie range: {calorie_vals.min()} to {calorie_vals.max()} {df_calorie['unit'].iloc[0]}")
print("\nSample data:")
df_calorie.head()


## wearable activity - respiratory rate

In [None]:
# JSON source files whose path mentions 'respiratory_rate'; the first one is loaded as a sample
respiratory_rate_file_list = [
    path for path in SourceFile_List
    if 'respiratory_rate' in path and 'json' in path
]
respiratory_rate_file = respiratory_rate_file_list[0]

with open(respiratory_rate_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_respiratory_rate_json_to_df(data):
    """
    Convert respiratory rate data from JSON format to a pandas DataFrame.

    Args:
        data (dict): JSON data containing respiratory rate information. Reads
            ``data['header']['user_id']`` and, when present,
            ``data['body']['breathing']``.

    Returns:
        pandas.DataFrame: One row per breathing record with columns
        ``patient_id``, ``respiratory_rate``, ``unit`` and ``timestamp``
        (converted with ``pd.to_datetime``). When the ``'breathing'`` key is
        missing or its list is empty, an empty frame with the same columns is
        returned instead of raising ``KeyError``.
    """
    columns = ['patient_id', 'respiratory_rate', 'unit', 'timestamp']

    # Extract patient ID from the header
    patient_id = data['header']['user_id']

    # A file without a 'breathing' section simply contributes no records
    breathing_records = data['body']['breathing'] if 'breathing' in data['body'] else []

    records = [
        {
            'patient_id': patient_id,
            'respiratory_rate': breathing_record['respiratory_rate']['value'],
            'unit': breathing_record['respiratory_rate']['unit'],
            'timestamp': breathing_record['effective_time_frame']['date_time'],
        }
        for breathing_record in breathing_records
    ]

    # Passing `columns` guarantees the schema even when `records` is empty
    df = pd.DataFrame(records, columns=columns)

    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df

# Parse the sample respiratory-rate file loaded above
df_respiratory_rate = convert_respiratory_rate_json_to_df(data)

# Headline facts about the frame, followed by a preview of its first rows
resp_ts = df_respiratory_rate['timestamp']
resp_vals = df_respiratory_rate['respiratory_rate']
print(f"Respiratory rate data for patient {df_respiratory_rate['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_respiratory_rate)}")
print(f"Date range: {resp_ts.min()} to {resp_ts.max()}")
print(f"Respiratory rate range: {resp_vals.min()} to {resp_vals.max()} {df_respiratory_rate['unit'].iloc[0]}")
print("\nSample data:")
df_respiratory_rate.head()


In [None]:
df_respiratory_rate['respiratory_rate'].value_counts()

In [None]:
# Find all respiratory rate files
respiratory_rate_file_list = [i for i in SourceFile_List if 'respiratory_rate' in i and 'json' in i]
respiratory_rate_files = respiratory_rate_file_list

print(f"Found {len(respiratory_rate_files)} respiratory rate files")

# Parse every file. A file that cannot be read or converted is reported and skipped
# (best-effort load); frames without any record are skipped as well.
all_respiratory_rate_data = []
for file_path in tqdm(respiratory_rate_files, desc="Processing respiratory rate files"):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        df = convert_respiratory_rate_json_to_df(data)
        if not df.empty:
            all_respiratory_rate_data.append(df)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

if all_respiratory_rate_data:
    # Combine all DataFrames
    df_respiratory_rate_all = pd.concat(all_respiratory_rate_data, ignore_index=True)

    # Convert timestamps to datetime if not already
    df_respiratory_rate_all['timestamp'] = pd.to_datetime(df_respiratory_rate_all['timestamp'])

    # Sort by patient_id and timestamp
    df_respiratory_rate_all = df_respiratory_rate_all.sort_values(['patient_id', 'timestamp'])

    # Size of the combined frame, busiest patients, most frequent readings
    print(df_respiratory_rate_all.shape)
    print(df_respiratory_rate_all['patient_id'].value_counts().head(10))
    print(df_respiratory_rate_all['respiratory_rate'].value_counts().head(20))

    # Standardized column names (rename returns a new frame, the combined one is untouched)
    df_respiratory_rate_all_processed = df_respiratory_rate_all.rename(columns={
        'patient_id': 'PatientID',
        'respiratory_rate': 'RespiratoryRate',
        'unit': 'RespiratoryUnit',
        'timestamp': 'ObservationDateTime'
    })

    # Placeholder so the column keeps its position; overwritten once the timezone is stripped
    df_respiratory_rate_all_processed['EntryCreatedDateTime'] = df_respiratory_rate_all_processed['ObservationDateTime']

    # Merge with the user timezone dataframe to get the correct timezone for each patient
    df_respiratory_rate_all_processed = pd.merge(df_respiratory_rate_all_processed, df_user[['PatientID', 'UserTimeZone']],
                                                 on='PatientID', how='left')

    # Per-row offset in minutes, computed while the timestamps still carry their timezone
    df_respiratory_rate_all_processed['TimezoneOffset'] = df_respiratory_rate_all_processed.apply(
        lambda row: get_timezone_offset_minutes_corrected(row['ObservationDateTime'], row['UserTimeZone']),
        axis=1
    )

    # Convert datetime columns to timezone-naive
    df_respiratory_rate_all_processed['ObservationDateTime'] = df_respiratory_rate_all_processed['ObservationDateTime'].dt.tz_localize(None)
    df_respiratory_rate_all_processed['EntryCreatedDateTime'] = pd.to_datetime(df_respiratory_rate_all_processed['ObservationDateTime'])

    # Local datetime = stored timestamp + offset, the same convention the activity cell and
    # the follow-up respiratory cell use ("adding the timezone offset").
    # NOTE(review): confirm the sign returned by get_timezone_offset_minutes_corrected.
    df_respiratory_rate_all_processed['DT_local'] = (
        df_respiratory_rate_all_processed['ObservationDateTime']
        + pd.to_timedelta(pd.to_numeric(df_respiratory_rate_all_processed['TimezoneOffset']), unit='minutes')
    )
else:
    print("No respiratory rate data found")
    # Empty frames with the expected schema keep the names defined for later cells
    df_respiratory_rate_all = pd.DataFrame(
        columns=['patient_id', 'respiratory_rate', 'unit', 'timestamp'])
    df_respiratory_rate_all_processed = pd.DataFrame(
        columns=['PatientID', 'RespiratoryRate', 'RespiratoryUnit', 'ObservationDateTime',
                 'EntryCreatedDateTime', 'UserTimeZone', 'TimezoneOffset', 'DT_local'])

# Display the processed DataFrame
df_respiratory_rate_all_processed.head()



In [None]:
# Convert TimezoneOffset to numeric to ensure proper calculation
# Alias of the processed respiratory frame: the columns below are written to that same object
df = df_respiratory_rate_all_processed
df['ObservationDateTime'] = pd.to_datetime(df['ObservationDateTime'])
# The offset must be numeric before it can be converted to a timedelta
df['TimezoneOffset'] = pd.to_numeric(df['TimezoneOffset'])

# Local datetime = stored timestamp shifted by the per-row offset (minutes)
offset_td = pd.to_timedelta(df['TimezoneOffset'], unit='minutes')
df['DT_local'] = df['ObservationDateTime'] + offset_td

# Single-patient slice for closer inspection
dfx = df[df['PatientID'] == 'AIREADI-1080']
dfx.head()


In [None]:
# Create a figure to visualize respiratory rate data for AIREADI-1080
# Keep only strictly positive readings: zero and negative values are treated as invalid
valid_data = dfx[dfx['RespiratoryRate'] > 0]

if valid_data.empty:
    # Without this guard the title lookup below would raise IndexError
    print("No valid respiratory rate data available for plotting")
else:
    plt.figure(figsize=(15, 8))

    # Plot the respiratory rate over time
    plt.plot(valid_data['DT_local'], valid_data['RespiratoryRate'], 'o-', markersize=4, alpha=0.7)

    # Add labels and title
    plt.xlabel('Date and Time (Local)', fontsize=12)
    plt.ylabel('Respiratory Rate (breaths/min)', fontsize=12)
    plt.title(f'Respiratory Rate Over Time for Patient {valid_data["PatientID"].iloc[0]}', fontsize=14)

    # Add grid for better readability
    plt.grid(True, alpha=0.3)

    # Format x-axis to show dates clearly
    plt.gcf().autofmt_xdate()

    # Reference lines at the bounds of the normal respiratory rate range (12-20 breaths/min)
    plt.axhline(y=12, color='g', linestyle='--', alpha=0.5, label='Lower normal range (12)')
    plt.axhline(y=20, color='r', linestyle='--', alpha=0.5, label='Upper normal range (20)')

    # Add legend
    plt.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()


## wearable activity - stress

In [None]:
# JSON source files whose path mentions 'stress'; the first one is loaded as a sample
stress_file_list = [
    path for path in SourceFile_List
    if 'stress' in path and 'json' in path
]
stress_file = stress_file_list[0]

with open(stress_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_stress_json_to_df(data):
    """
    Convert stress data from JSON format to a pandas DataFrame.

    Args:
        data (dict): JSON data containing stress information. Reads
            ``data['header']['user_id']`` and, when present,
            ``data['body']['stress']``.

    Returns:
        pandas.DataFrame: One row per stress record with columns
        ``patient_id``, ``stress_level``, ``unit`` and ``timestamp``
        (converted with ``pd.to_datetime``). When the ``'stress'`` key is
        missing or its list is empty, an empty frame with the same columns is
        returned instead of raising ``KeyError``.
    """
    columns = ['patient_id', 'stress_level', 'unit', 'timestamp']

    # Extract patient ID from the header
    patient_id = data['header']['user_id']

    # A file without a 'stress' section simply contributes no records
    stress_records = data['body']['stress'] if 'stress' in data['body'] else []

    records = [
        {
            'patient_id': patient_id,
            'stress_level': stress_record['stress']['value'],
            'unit': stress_record['stress']['unit'],
            'timestamp': stress_record['effective_time_frame']['date_time'],
        }
        for stress_record in stress_records
    ]

    # Passing `columns` guarantees the schema even when `records` is empty
    df = pd.DataFrame(records, columns=columns)

    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df

# Parse the sample stress file loaded above
df_stress = convert_stress_json_to_df(data)

# Headline facts about the frame, followed by a preview of its first rows
stress_ts = df_stress['timestamp']
stress_vals = df_stress['stress_level']
print(f"Stress data for patient {df_stress['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_stress)}")
print(f"Date range: {stress_ts.min()} to {stress_ts.max()}")
print(f"Stress level range: {stress_vals.min()} to {stress_vals.max()} {df_stress['unit'].iloc[0]}")
print("\nSample data:")
df_stress.head()


In [None]:
# Find all stress files
stress_file_list = [i for i in SourceFile_List if 'stress' in i and 'json' in i]
stress_files = stress_file_list

print(f"Found {len(stress_files)} stress files")

# Parse every file. A file that cannot be read or converted is reported and skipped
# (best-effort load); frames without any record are skipped as well.
all_stress_data = []
for file_path in tqdm(stress_files, desc="Processing stress files"):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        df = convert_stress_json_to_df(data)
        if not df.empty:
            all_stress_data.append(df)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

if all_stress_data:
    # Combine all DataFrames
    df_stress_all = pd.concat(all_stress_data, ignore_index=True)

    # Convert timestamps to datetime if not already
    df_stress_all['timestamp'] = pd.to_datetime(df_stress_all['timestamp'])

    # Sort by patient_id and timestamp
    df_stress_all = df_stress_all.sort_values(['patient_id', 'timestamp'])

    # Size of the combined frame, busiest patients, most frequent stress levels
    print(df_stress_all.shape)
    print(df_stress_all['patient_id'].value_counts().head(10))
    print(df_stress_all['stress_level'].value_counts().head(20))

    # Standardized column names (rename returns a new frame, the combined one is untouched)
    df_stress_all_processed = df_stress_all.rename(columns={
        'patient_id': 'PatientID',
        'stress_level': 'StressLevel',
        'unit': 'StressUnit',
        'timestamp': 'ObservationDateTime'
    })

    # Placeholder so the column keeps its position; overwritten once the timezone is stripped
    df_stress_all_processed['EntryCreatedDateTime'] = df_stress_all_processed['ObservationDateTime']

    # Merge with the user timezone dataframe to get the correct timezone for each patient
    df_stress_all_processed = pd.merge(df_stress_all_processed, df_user[['PatientID', 'UserTimeZone']],
                                       on='PatientID', how='left')

    # Per-row offset in minutes, computed while the timestamps still carry their timezone
    df_stress_all_processed['TimezoneOffset'] = df_stress_all_processed.apply(
        lambda row: get_timezone_offset_minutes_corrected(row['ObservationDateTime'], row['UserTimeZone']),
        axis=1
    )

    # Convert datetime columns to timezone-naive
    df_stress_all_processed['ObservationDateTime'] = df_stress_all_processed['ObservationDateTime'].dt.tz_localize(None)
    df_stress_all_processed['EntryCreatedDateTime'] = pd.to_datetime(df_stress_all_processed['ObservationDateTime'])

    # Local datetime = stored timestamp + offset, the convention the activity and
    # respiratory-rate cells of this notebook use ("adding the timezone offset").
    # NOTE(review): confirm the sign returned by get_timezone_offset_minutes_corrected.
    df_stress_all_processed['DT_local'] = (
        df_stress_all_processed['ObservationDateTime']
        + pd.to_timedelta(pd.to_numeric(df_stress_all_processed['TimezoneOffset']), unit='minutes')
    )
else:
    print("No stress data found")
    # Empty frames with the expected schema keep the names defined for later cells
    df_stress_all = pd.DataFrame(columns=['patient_id', 'stress_level', 'unit', 'timestamp'])
    df_stress_all_processed = pd.DataFrame(
        columns=['PatientID', 'StressLevel', 'StressUnit', 'ObservationDateTime',
                 'EntryCreatedDateTime', 'UserTimeZone', 'TimezoneOffset', 'DT_local'])

# Display the processed DataFrame
df_stress_all_processed.head()

## wearable activity - sleep

In [None]:
# JSON source files whose path mentions 'sleep'; the first one is loaded as a sample
sleep_file_list = [
    path for path in SourceFile_List
    if 'sleep' in path and 'json' in path
]
sleep_file = sleep_file_list[0]

with open(sleep_file, 'r') as f:
    data = json.load(f)

data

In [None]:
def convert_sleep_json_to_df(data):
    """
    Convert sleep JSON data to a pandas DataFrame.

    Args:
        data (dict): The JSON data containing sleep information. Reads
            ``data['header']['user_id']`` and ``data['body']['sleep']``.

    Returns:
        pd.DataFrame: One row per sleep-stage record with columns
        ``patient_id``, ``sleep_stage``, ``start_time``, ``end_time`` (both
        converted with ``pd.to_datetime``) and ``duration_minutes``
        (``end_time - start_time`` in minutes). When the record list is empty,
        an empty frame with the same columns is returned instead of raising
        ``KeyError``.
    """
    columns = ['patient_id', 'sleep_stage', 'start_time', 'end_time']

    # Extract patient ID from the header
    patient_id = data['header']['user_id']

    # One flat record per sleep-stage interval
    records = [
        {
            'patient_id': patient_id,
            'sleep_stage': sleep_record['sleep_stage_state'],
            'start_time': sleep_record['sleep_stage_time_frame']['time_interval']['start_date_time'],
            'end_time': sleep_record['sleep_stage_time_frame']['time_interval']['end_date_time'],
        }
        for sleep_record in data['body']['sleep']
    ]

    # Passing `columns` guarantees the schema even when `records` is empty
    df = pd.DataFrame(records, columns=columns)

    # Convert timestamps to datetime
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])

    # Calculate duration in minutes
    df['duration_minutes'] = (df['end_time'] - df['start_time']).dt.total_seconds() / 60

    return df

# Parse the sample sleep file loaded above
df_sleep = convert_sleep_json_to_df(data)

# Headline facts about the frame, followed by a preview of its first rows
earliest_start = df_sleep['start_time'].min()
latest_end = df_sleep['end_time'].max()
total_minutes = df_sleep['duration_minutes'].sum()
print(f"Sleep data for patient {df_sleep['patient_id'].iloc[0]}")
print(f"Number of records: {len(df_sleep)}")
print(f"Date range: {earliest_start} to {latest_end}")
print(f"Sleep stages: {df_sleep['sleep_stage'].unique()}")
print(f"Total sleep duration: {total_minutes:.2f} minutes")
print("\nSample data:")
df_sleep.head()

# df_sleep['sleep_stage'].value_counts()

In [None]:
# Find all sleep files
sleep_file_list = [i for i in SourceFile_List if 'sleep' in i and 'json' in i]
sleep_files = sleep_file_list

print(f"Found {len(sleep_files)} sleep files")

# Parse every file. A file that cannot be read or converted is reported and skipped
# (best-effort load); frames without any record are skipped as well.
all_sleep_data = []
for file_path in tqdm(sleep_files, desc="Processing sleep files"):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        df = convert_sleep_json_to_df(data)
        if not df.empty:
            all_sleep_data.append(df)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

if all_sleep_data:
    # Combine all DataFrames
    df_sleep_all = pd.concat(all_sleep_data, ignore_index=True)

    # Convert timestamps to datetime if not already
    df_sleep_all['start_time'] = pd.to_datetime(df_sleep_all['start_time'])
    df_sleep_all['end_time'] = pd.to_datetime(df_sleep_all['end_time'])

    # Sort by patient_id and start time
    df_sleep_all = df_sleep_all.sort_values(['patient_id', 'start_time'])

    # Size of the combined frame, busiest patients, most frequent sleep stages
    print(df_sleep_all.shape)
    print(df_sleep_all['patient_id'].value_counts().head(10))
    print(df_sleep_all['sleep_stage'].value_counts().head(20))

    # Standardized column names (rename returns a new frame, the combined one is untouched)
    df_sleep_all_processed = df_sleep_all.rename(columns={
        'patient_id': 'PatientID',
        'sleep_stage': 'SleepStage',
        'start_time': 'StartDateTime',
        'end_time': 'EndDateTime',
        'duration_minutes': 'DurationMinutes'
    })

    # Placeholder so the column keeps its position; overwritten once the timezone is stripped
    df_sleep_all_processed['EntryCreatedDateTime'] = df_sleep_all_processed['StartDateTime']

    # Merge with the user timezone dataframe to get the correct timezone for each patient
    df_sleep_all_processed = pd.merge(df_sleep_all_processed, df_user[['PatientID', 'UserTimeZone']],
                                      on='PatientID', how='left')

    # Per-row offset in minutes, computed while the timestamps still carry their timezone
    df_sleep_all_processed['TimezoneOffset'] = df_sleep_all_processed.apply(
        lambda row: get_timezone_offset_minutes_corrected(row['StartDateTime'], row['UserTimeZone']),
        axis=1
    )

    # Convert datetime columns to timezone-naive
    df_sleep_all_processed['StartDateTime'] = df_sleep_all_processed['StartDateTime'].dt.tz_localize(None)
    df_sleep_all_processed['EndDateTime'] = df_sleep_all_processed['EndDateTime'].dt.tz_localize(None)
    df_sleep_all_processed['EntryCreatedDateTime'] = pd.to_datetime(df_sleep_all_processed['StartDateTime'])

    # Local start/end = stored timestamps + offset, the convention the activity and
    # respiratory-rate cells of this notebook use ("adding the timezone offset").
    # NOTE(review): confirm the sign returned by get_timezone_offset_minutes_corrected.
    sleep_offset_td = pd.to_timedelta(pd.to_numeric(df_sleep_all_processed['TimezoneOffset']), unit='minutes')
    df_sleep_all_processed['DT_local_start'] = df_sleep_all_processed['StartDateTime'] + sleep_offset_td
    df_sleep_all_processed['DT_local_end'] = df_sleep_all_processed['EndDateTime'] + sleep_offset_td
else:
    print("No sleep data found")
    # Empty frames with the expected schema keep the names defined for later cells
    df_sleep_all = pd.DataFrame(
        columns=['patient_id', 'sleep_stage', 'start_time', 'end_time', 'duration_minutes'])
    df_sleep_all_processed = pd.DataFrame(
        columns=['PatientID', 'SleepStage', 'StartDateTime', 'EndDateTime', 'DurationMinutes',
                 'EntryCreatedDateTime', 'UserTimeZone', 'TimezoneOffset',
                 'DT_local_start', 'DT_local_end'])

# Display the processed DataFrame
df_sleep_all_processed.head()


## cardiac ecg

In [None]:
import numpy as np 

# Keep only signal files that really end in '.dat'. A bare substring test ('dat' in path)
# would also accept any other file whose path happens to contain 'dat'.
ecg_file_list = [i for i in SourceFile_List if 'ecg' in i and i.endswith('.dat')]
ecg_file = ecg_file_list[0]

# The header file shares the signal file's base name; swapping only the extension
# avoids rewriting a '.dat' that appears elsewhere in the path
ecg_file_head = os.path.splitext(ecg_file)[0] + '.hea'

with open(ecg_file_head, 'r') as f:
    ecg_file_head_lines = f.readlines()

# Header line(s) mentioning the validation date
date = [i for i in ecg_file_head_lines if 'validation_date' in i]

ecg_file_head_lines


In [None]:
# Extract meaningful information from ECG header file
def extract_ecg_metadata(header_lines):
    """
    Collect ``key: value`` pairs from the comment lines of an ECG header file.

    Only lines that start with ``'#'`` are considered. Leading/trailing
    ``'#'``, spaces and newlines are stripped, and the remainder is split on
    its first ``':'``; comment lines without a colon are ignored. When a key
    occurs more than once, the last occurrence wins.

    Args:
        header_lines (list[str]): Lines of the header file.

    Returns:
        dict: Mapping of stripped key to stripped value.
    """
    metadata = {}

    comment_lines = (raw for raw in header_lines if raw.startswith('#'))
    for raw in comment_lines:
        text = raw.strip('# \n')
        # partition splits on the first ':' only, so values may contain colons
        key, colon, value = text.partition(':')
        if colon:
            metadata[key.strip()] = value.strip()

    return metadata

# Extract metadata from the ECG header file
ecg_metadata = extract_ecg_metadata(ecg_file_head_lines)
ecg_metadata


In [None]:

# Function to load ECG data from .dat file
def load_ecg_data(file_path):
    """
    Read an ECG record with ``wfdb`` and return its signal and metadata.

    Args:
        file_path (str): Path of the record's file; the extension is dropped
            before the path is handed to ``wfdb.rdrecord``.

    Returns:
        dict | None: Dictionary with keys ``signal``, ``sample_rate``,
        ``channels``, ``units``, ``patient_id``, ``n_samples``, ``base_time``
        and ``base_date`` taken from the wfdb record. ``None`` when the file
        does not exist or when anything fails while loading (the problem is
        printed, not raised).
    """
    try:
        if not os.path.exists(file_path):
            print(f"ECG file not found: {file_path}")
            return None

        # Deferred import: a missing wfdb package is reported like any other load error
        import wfdb

        # wfdb expects the record path without its extension
        record = wfdb.rdrecord(os.path.splitext(file_path)[0])

        ecg_data = dict(
            signal=record.p_signal,
            sample_rate=record.fs,
            channels=record.sig_name,
            units=record.units,
            patient_id=record.record_name,
            n_samples=record.sig_len,
            base_time=record.base_time,
            base_date=record.base_date,
        )

        print(f"Successfully loaded ECG data with {ecg_data['n_samples']} samples")
        print(f"Sample rate: {ecg_data['sample_rate']} Hz")
        print(f"Channels: {ecg_data['channels']}")

        return ecg_data

    except Exception as e:
        print(f"Error loading ECG data: {str(e)}")
        return None

# Load the ECG recording selected above (None when loading fails)
ecg_data = load_ecg_data(ecg_file)

# Preview: the first 1000 samples (or fewer) of the first channel
if ecg_data is not None:
    import matplotlib.pyplot as plt

    n_preview = min(1000, ecg_data['n_samples'])
    time_in_seconds = np.arange(n_preview) / ecg_data['sample_rate']
    first_channel = ecg_data['signal'][:n_preview, 0]

    plt.figure(figsize=(12, 4))
    plt.plot(time_in_seconds, first_channel)
    plt.title(f"ECG Data - Channel: {ecg_data['channels'][0]}")
    plt.xlabel("Time (seconds)")
    plt.ylabel(f"Amplitude ({ecg_data['units'][0]})")
    plt.grid(True)
    plt.show()


In [None]:
# Create a function to display all ECG data in a more comprehensive way
def display_ecg_data(ecg_data):
    """
    Display all channels of ECG data in a multi-panel plot, then print metadata.

    Channels are laid out two per row. The grid has at least six rows (the
    12-channel layout) and grows when more than twelve channels are present,
    so every channel gets its own panel; unused panels are hidden.

    Args:
        ecg_data (dict): Dictionary containing ECG data and metadata. Reads the
            keys ``signal`` (indexed as ``signal[:, i]`` per channel),
            ``channels``, ``units``, ``sample_rate``, ``n_samples``,
            ``patient_id``, ``base_time`` and ``base_date``. ``None`` prints a
            notice and returns without plotting.

    Returns:
        None
    """
    if ecg_data is None:
        print("No ECG data to display")
        return

    # Get the number of channels
    n_channels = len(ecg_data['channels'])

    # Two columns; enough rows for every channel, never fewer than the 6x2 default
    n_cols = 2
    n_rows = max(6, -(-n_channels // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    # Flatten the axes array for easier indexing
    axes = axes.flatten()

    # The time axis is shared by all channels, so compute it once
    time_in_seconds = np.arange(ecg_data['n_samples']) / ecg_data['sample_rate']

    # Plot each channel in its own panel
    for i, channel in enumerate(ecg_data['channels']):
        axes[i].plot(time_in_seconds, ecg_data['signal'][:, i])
        axes[i].set_title(f"Channel: {channel}")
        axes[i].set_xlabel("Time (seconds)")
        axes[i].set_ylabel(f"Amplitude ({ecg_data['units'][i]})")
        axes[i].grid(True)

    # Hide any unused subplots
    for i in range(n_channels, len(axes)):
        axes[i].set_visible(False)

    plt.tight_layout()
    plt.show()

    # Display metadata
    print("\nECG Metadata:")
    print(f"Patient ID: {ecg_data['patient_id']}")
    print(f"Sample Rate: {ecg_data['sample_rate']} Hz")
    print(f"Number of Samples: {ecg_data['n_samples']}")
    print(f"Duration: {ecg_data['n_samples']/ecg_data['sample_rate']:.2f} seconds")
    print(f"Base Time: {ecg_data['base_time']}")
    print(f"Base Date: {ecg_data['base_date']}")
    print(f"Channels: {', '.join(ecg_data['channels'])}")

# Display all ECG data
display_ecg_data(ecg_data)


In [None]:
ecg_data['signal'].shape

In [None]:
ecg_data#['signal']

## process_Source_to_Raw

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
def process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE):
    """
    Build the raw tables of the cohort from its source files and save each one as CSV.

    Two tables are produced:

    * ``Patient``   - from the participants TSV (located through the first
      source path containing 'participants'), with derived ``YearOfBirth``,
      ``UserTimeZone`` (mapped from ``clinical_site``), ``DiseaseType`` and
      ``MRSegmentID`` (both derived from ``study_group``); ``PatientID`` is
      prefixed with ``'AIREADI-'``.
    * ``HeartRate`` - from every source path containing both 'heart_rate' and
      'json', converted with ``convert_heart_rate_json_to_df``; rows whose
      ``HRValue`` is not numeric are dropped.

    Relies on ``convert_heart_rate_json_to_df`` and
    ``get_timezone_offset_minutes_corrected`` being defined in the same
    namespace as this function.

    Args:
        OneCohort_Args (dict): Cohort settings; ``OneCohort_Args['FolderPath']``
            is the directory the CSV files are written to.
        SourceFile_List (list[str]): Paths of all source files of the cohort.
        get_RawName_from_SourceFile: Not used by this implementation; kept for
            the shared call signature.
        SPACE (dict): Not used by this implementation; kept for the shared
            call signature.

    Returns:
        dict: Raw table name -> path of the CSV file written for it.

    Raises:
        IndexError: If no source path contains 'participants'.
        ValueError: If no heart-rate file could be converted.
    """
    # Imported locally so the function does not depend on a module-level `json`
    # import where its source text is reused (it is stored in `fn_string`)
    import json

    RawName_to_dfRaw = {}

    # ---------------------- Patient ----------------------
    participants_file = [i for i in SourceFile_List if 'participants' in i][0]
    # The table is read from the TSV file; a '.json' path is redirected to its '.tsv' twin
    participants_tsv_file = participants_file.replace('.json', '.tsv')

    df = pd.read_csv(participants_tsv_file, sep='\t')
    df['YearOfBirth'] = pd.to_datetime(df['study_visit_date']).dt.year - df['age']

    clinical_site_to_timezone = {
        'UAB': 'America/Chicago',
        'UW': 'America/Los_Angeles',
        'UCSD': 'America/Los_Angeles',
    }
    # Sites missing from the mapping end up with a NaN timezone
    df['UserTimeZone'] = df['clinical_site'].map(clinical_site_to_timezone)

    df['DiseaseType'] = df['study_group'].map({
        'healthy': 0,
        'pre_diabetes_lifestyle_controlled': 0.5,
        'oral_medication_and_or_non_insulin_injectable_medication_controlled': 2,
        'insulin_dependent': 2})

    df['MRSegmentID'] = df['study_group']

    RawName = 'Patient'
    raw_columns = ['PatientID',
                   'MRSegmentID', 'MRSegmentModifiedDateTime', 'DiseaseType',
                   'Gender', 'ActivationDate', 'UserTimeZoneOffset', 'UserTimeZone',
                   'Description', 'YearOfBirth']
    df = df.rename(columns = {'participant_id': 'PatientID'})
    # reindex keeps only the raw columns; those absent from the source become NaN
    df = df.reindex(columns = raw_columns)
    df['UserTimeZoneOffset'] = 0

    df_user = df
    df_user['PatientID'] = 'AIREADI-' + df_user['PatientID'].astype(str)
    RawName_to_dfRaw[RawName] = df_user
    print(df_user.shape, df_user['PatientID'].nunique())

    # ---------------------- CGM ----------------------
    # CGM (ElogBGEntry) extraction is currently disabled for this cohort.

    # ---------------------- hr ----------------------
    all_hr_data = []
    hr_json_list = [i for i in SourceFile_List if 'heart_rate' in i and  'json' in i]
    print(len(hr_json_list))

    for file_path in hr_json_list:
        try:
            # Fallback patient ID: name of the directory that contains the file
            patient_id = os.path.basename(os.path.dirname(file_path))

            # Convert the JSON file to DataFrame
            with open(file_path, 'r') as f:
                data = json.load(f)
            df = convert_heart_rate_json_to_df(data)

            # Add patient ID if not already in the DataFrame
            if 'patient_id' not in df.columns:
                df['patient_id'] = patient_id

            all_hr_data.append(df)
        except Exception as e:
            # Best-effort: report the file and continue with the others
            print(f"Error processing {file_path}: {e}")

    if not all_hr_data:
        # pd.concat would fail here with an unhelpful "No objects to concatenate"
        raise ValueError(
            f'No heart rate data could be loaded from {len(hr_json_list)} candidate file(s)')
    df = pd.concat(all_hr_data, ignore_index=True)

    RawName = 'HeartRate'
    raw_columns =['HREntryID', 'PatientID', 'ObservationDateTime', 'HRValue','HRUnit',
                  'ObservationEntryDateTime', 'TimezoneOffset',
                  'Timezone', 'EntryCreatedDateTime'
                  ]
    df = df.rename(columns = {'patient_id': 'PatientID',
                              'timestamp': 'ObservationDateTime',
                              'heart_rate': 'HRValue',
                              'unit':'HRUnit',
                              })
    df = df.reindex(columns = raw_columns)
    # Entry IDs are assigned before invalid rows are dropped, so they may contain gaps
    df['HREntryID'] = df.index
    df = pd.merge(df, df_user[['PatientID', 'UserTimeZone']], on = 'PatientID', how = 'left')
    # The offset is computed while ObservationDateTime still carries its timezone
    df['TimezoneOffset'] = df.apply(lambda row: get_timezone_offset_minutes_corrected(
                                                        row['ObservationDateTime'],
                                                        row['UserTimeZone']),
                                                        axis = 1)
    df['ObservationDateTime'] = df['ObservationDateTime'].dt.tz_localize(None)
    df['EntryCreatedDateTime'] = pd.to_datetime(df['ObservationDateTime'])

    # Non-numeric heart-rate values become NaN and are dropped
    df['HRValue'] = pd.to_numeric(df['HRValue'], errors='coerce')
    df = df[df['HRValue'].notna()].reset_index(drop = True)

    df_hr = df
    RawName_to_dfRaw[RawName] = df_hr
    print('hr patients',df_hr.shape, df_hr['PatientID'].nunique())

    # ---------------------- Save to CSV ----------------------
    RawName_to_path = {}
    for RawName, df in RawName_to_dfRaw.items():
        print(RawName, df.shape)
        print(df.columns)

        path = os.path.join(OneCohort_Args['FolderPath'], f'processed_RawFile_{RawName}.csv')
        print(path)
        df.to_csv(path, index=False)
        # Callers receive the CSV path of each table, not the frame itself
        RawName_to_path[RawName] = path

    return RawName_to_path

process_Source_to_Raw.fn_string = inspect.getsource(process_Source_to_Raw)
# %%%%%%%%%%%%%%%%%%%%% user


In [None]:
RawName_to_dfRaw = process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE)    
RawName_to_dfRaw

# Step 5: Save Cohort Fn

In [None]:
# Get the python file path from the cohort object 
pypath = cohort.pypath
pypath

In [None]:
prefix = [
    'import os',
    'import pandas as pd', 
    'import numpy as np'
    ]
prefix

In [None]:
# List of variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
iterative_variables

In [None]:
# from recfldtkn.base import Base
# fn_variables = [
#     convert_cgm_json_to_df,
#     get_RawName_from_SourceFile, 
#     process_Source_to_Raw,
    
#     ]
# pycode= Base.convert_variables_to_pystirng(iterative_variables = iterative_variables, 
#                                            fn_variables = fn_variables, 
#                                            prefix = prefix)
# # print(pycode)


In [None]:
# with open(pypath, 'w') as file: file.write(pycode)

In [None]:
from recfldtkn.base import Base
# Path of the Python file associated with this cohort, read from the cohort object
pypath = cohort.pypath

# Import statements to be included at the beginning of the generated script
prefix = [
    'import os',
    'import pandas as pd',
    'import numpy as np',
    ]
# List of variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
# List of the functions to be included in the generated script
fn_variables = [
    convert_cgm_json_to_df,
    get_timezone_offset_minutes_corrected,
    get_RawName_from_SourceFile,
    process_Source_to_Raw]
# NOTE: `convert_variables_to_pystirng` is the method name as exposed by Base;
# the spelling is intentional here and must match the library.
pycode = Base.convert_variables_to_pystirng(iterative_variables = iterative_variables,
                                           fn_variables = fn_variables,
                                           prefix = prefix)

# Create the directory for the Python file if it doesn't exist.
# exist_ok=True removes the check-then-create race, and the guard skips the call
# when pypath has no directory component (os.makedirs('') raises FileNotFoundError).
pypath_dir = os.path.dirname(pypath)
if pypath_dir:
    os.makedirs(pypath_dir, exist_ok=True)

# Python reads source files as UTF-8 by default, so write the generated module
# as UTF-8 explicitly instead of relying on the platform's locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Create an HTML link to the generated file and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Step 6: Test 

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# Names of every cohort registered in the configuration dictionary
CohortNames = list(CohortName_to_OneCohortArgs.keys())

# # %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2022CGM'
# CohortName = 'WellDoc2023CVSDeRx'
CohortName = 'aireadi-noimage-v2'
# # %%%%%%%%%%%%%%%%%%%%% 

# Look up the argument set of the chosen cohort and display it
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
# Build the CohortFn identified by this cohort's 'Source2CohortName' entry.
# NOTE(review): presumably this picks up the script saved in Step 5 — confirm
# against CohortFn's constructor.
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort_fn

In [None]:
# Construct the Cohort from its arguments, the workspace path mapping (SPACE),
# and the cohort function built above, then display the object.
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort

In [None]:
# Hand the cohort function to the cohort object.
# NOTE(review): the exact effect of setup_fn is not visible here — presumably it
# attaches the functions from cohort_fn to the cohort; confirm against Cohort.setup_fn.
cohort.setup_fn(cohort_fn)

In [None]:
# Single-cell version of the preceding steps: build the CohortFn, construct the
# Cohort, register the function on it, then initialize the cohort.
from recfldtkn.record_base.cohort import CohortFn, Cohort

Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)

# NOTE(review): load_data=False presumably makes the cohort process the source
# files rather than load previously saved output — confirm against
# Cohort.initialize_cohort.
cohort.initialize_cohort(load_data=False)

In [None]:
# Inspect the cohort's RawName_to_dfRaw attribute after initialization.
cohort.RawName_to_dfRaw

In [None]:
# Inspect the cohort's SourceFile_List attribute after initialization.
cohort.SourceFile_List

In [None]:
# Display the cohort's RawName_to_dfRaw attribute again (same expression as two cells above).
cohort.RawName_to_dfRaw