In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [16]:
def extract_demographics(df, n, a):
    # Initialize dictionary to hold demographic data with placeholders for easier mapping
    demographic_data = {
        'ID': n,
        'Expert' : a,
        'Date': None,
        'Height_in': None,
        'Height_cm': None,
        'Weight_lbs': None,
        'Weight_kg': None,
        'Age': None,
        'Sex': None
    }
    if a == 2:
        return demographic_data

    empty = False

    for col in df.columns[-15:]:
        for index in range(20):
            cell_value = str(df.at[index, col]).strip()
            if 'DATE:' in cell_value:
                    col_index = df.columns.get_loc(col)
                    
                    # Extract and convert Date to datetime
                    date_str = df.iat[index, col_index + 1]
                    demographic_data['Date'] = pd.to_datetime(date_str, errors='coerce')
                    
                    height_in_str = df.iat[index + 1, col_index + 1]
                    weight_lbs_str = df.iat[index + 3, col_index + 1]


                    if (len(height_in_str) != 5):
                        empty = True
                        print(f"Failed to extract demographic data from Subgroup_I_{n}_{a}.xlsx -- Height = {height_in_str} - Weight = {weight_lbs_str}")
                    
                    if not empty:
                        try:
                            demographic_data['Height_in'] = float(height_in_str.split()[0]) if height_in_str else None
                            
                            # Extract Height in cm, remove 'cm', and convert to float
                            height_cm_str = df.iat[index + 2, col_index + 1]
                            demographic_data['Height_cm'] = float(height_cm_str.split()[0]) if height_cm_str else None
                                                    
                            # Extract Weight in lbs, remove 'lbs', and convert to float
                            weight_lbs_str = df.iat[index + 3, col_index + 1]
                            demographic_data['Weight_lbs'] = float(weight_lbs_str.split()[0]) if weight_lbs_str else None
                        
                            # Extract Weight in kg, remove 'kg', and convert to float
                            weight_kg_str = df.iat[index + 4, col_index + 1]
                            demographic_data['Weight_kg'] = float(weight_kg_str.split()[0]) if (weight_kg_str and len(weight_kg_str) > 2) else None
                            
                            # Extract Age and convert to integer
                            age_str = df.iat[index + 5, col_index + 1]
                            demographic_data['Age'] = int(age_str)
                            
                            # Extract Sex
                            demographic_data['Sex'] = df.iat[index + 6, col_index + 1].strip()
                        
                        except Exception as e:
                            print(f"Failed to extract demographic data from Subgroup_I_{n}_{a}.xlsx - {e}")

                    
                    break  # Break the loop once all demographic data is extracted
        if demographic_data['Date'] is not None:
            break



    return demographic_data


# Initialize a list to store all records
all_records = []

for n in range(1, 101):  
    for a in range(1, 3):  
        ##################### ADJUST PATH ###############################
        file_name = f"Data/Subgroup_I_{n}_{a}.xlsx"
        try:
            df = pd.read_excel(file_name)
            
            # Convert 'SpO2' and 'HR' columns to numeric, coercing errors to NaN
            df['SpO2'] = pd.to_numeric(df['SpO2'], errors='coerce')
            df['HR'] = pd.to_numeric(df['HR'], errors='coerce')

            # Convert "Events" to a categorical type
            df['Events'] = df['Events'].astype('category')
            
            # Extract demographic information
            demographics = extract_demographics(df, n, a)  # Assuming this function is defined as needed
            
            # Filter the DataFrame based on 'SpO2' and 'HR'
            filtered_df = df[(df['SpO2'] > 0) & (df['HR'] > 0)]
            
            # Calculate total epochs, average SpO2, and average HR
            tot_Epoch = filtered_df['Epoch'].count()
            avg_SpO2 = filtered_df['SpO2'].mean()
            avg_HR = filtered_df['HR'].mean()
            min_SpO2 = filtered_df['SpO2'].min()
            min_HR = filtered_df['HR'].min()
            max_SpO2 = filtered_df['SpO2'].max()
            max_HR = filtered_df['HR'].max()
            
            # Summarize 'BPOS', 'Stage', and now 'Events' as well
            bpos_counts = filtered_df['BPOS'].value_counts().to_dict()
            stage_counts = filtered_df['Stage'].value_counts().to_dict()
            events_summary = filtered_df['Events'].value_counts().to_dict()  
            
            # Combine all extracted data into a record
            record = {
                **demographics,
                
                'Total_Epoch': tot_Epoch,
                'Min_SpO2': min_SpO2,
                'Min_HR': min_HR,
                'Max_SpO2': max_SpO2,
                'Max_HR': max_HR,
                'Average_SpO2': avg_SpO2,
                'Average_HR': avg_HR,
                'BPOS_Summary': bpos_counts,
                'Stage_Summary': stage_counts,
                'Events_Summary': events_summary  
            }
            all_records.append(record)
        except Exception as e:
            print(f"Failed to process {file_name}: {e}")

# Convert the list of dictionaries to a DataFrame
final_df = pd.DataFrame(all_records)
print(final_df.head())

# Specify the desired output CSV file path
# output_csv_path = 'summary_data_subgroup_3.csv'
# final_df.to_csv(output_csv_path, index=False)

# print(f"Data successfully written to {output_csv_path}")

Failed to extract demographic data from Subgroup_I_7_1.xlsx -- Height = 0 in - Weight = nan
Failed to extract demographic data from Subgroup_I_10_1.xlsx -- Height = in - Weight = lbs
Failed to extract demographic data from Subgroup_I_17_1.xlsx -- Height = 180 in - Weight = 90 lbs
Failed to extract demographic data from Subgroup_I_21_1.xlsx -- Height = in - Weight = lbs
Failed to extract demographic data from Subgroup_I_23_1.xlsx -- Height = 152 in - Weight = 116 lbs
Failed to extract demographic data from Subgroup_I_27_1.xlsx -- Height = in - Weight = lbs
Failed to extract demographic data from Subgroup_I_32_1.xlsx -- Height = in - Weight = lbs
Failed to extract demographic data from Subgroup_I_33_1.xlsx -- Height = 1 in - Weight = 311 lbs
Failed to extract demographic data from Subgroup_I_41_1.xlsx - 'float' object has no attribute 'strip'
Failed to extract demographic data from Subgroup_I_48_1.xlsx -- Height = in - Weight = lbs
Failed to extract demographic data from Subgroup_I_49_1.

In [17]:
pd.DataFrame.from_dict(final_df["Events_Summary"])

Unnamed: 0,Events_Summary
0,"{'AR': 40, 'CA': 40, 'OH': 37, 'AR,CA': 26, 'M..."
1,"{'AR': 80, 'Awake': 19, 'AR,MChg': 1, 'MChg': ..."
2,"{'AR': 54, 'PLM': 27, 'Awake': 21, 'MH': 10, '..."
3,"{'AR': 150, 'Awake': 13, 'AR,AR': 6, 'AR,Awake..."
4,"{'AR': 78, 'Awake': 27, 'PLM': 7, 'PLM,AR': 6,..."
...,...
195,"{'AR': 65, 'MChg': 5, 'Awake': 4, 'AR,AR': 1, ..."
196,"{'AR': 26, 'PLM': 15, 'Awake': 7, 'MH,AR': 6, ..."
197,"{'AR': 57, 'Awake': 12, 'AR,AR': 3, 'L Out': 1..."
198,"{'PLM': 87, 'AR': 65, 'MH': 40, 'MH,AR': 24, '..."


In [19]:
# Normalize the JSON data into a flat table
df_normalized = pd.json_normalize(final_df['Events_Summary'])

# If you want to add an index column
df_normalized.reset_index(inplace=True)
df_normalized.rename(columns={'index': 'Row Index'}, inplace=True)

# Export the normalized DataFrame to a CSV file
df_normalized.to_csv('event_stage_output.csv', index=False)