In [1]:
import xml.etree.ElementTree as ET
from collections import defaultdict

import pandas as pd

from constants import FileDirectory
from utility.file_manager import FileManager

In [2]:
from constants import Apple

file_manager = FileManager()

tree = file_manager.load_file(FileDirectory.SOURCE_DATA_PATH, Apple.XML_DATA, "xml")
root = tree.getroot()

[2024-07-11 13:37:02] [INFO]  Successfully loaded file from /Users/hadid/Library/Mobile Documents/com~apple~CloudDocs/Shared/ETL/apple_health.xml


In [3]:
unique_elements = {elem.tag for elem in root.iter()}
print("Unique elements in the XML:", unique_elements)
print("Number of unique elements in the XML:", len(unique_elements))

Unique elements in the XML: {'WorkoutStatistics', 'InstantaneousBeatsPerMinute', 'MetadataEntry', 'WorkoutEvent', 'ActivitySummary', 'Workout', 'HeartRateVariabilityMetadataList', 'HealthData', 'ExportDate', 'FileReference', 'Me', 'WorkoutActivity', 'WorkoutRoute', 'Record'}
Number of unique elements in the XML: 14


In [5]:
records = [record.attrib for record in root.iter("Record")]  # Creates a list
records_df = pd.DataFrame(records)  # Creates a dataframe

# records_df[Apple.TYPE_FIELD] = (
#     records_df[Apple.TYPE_FIELD].str.split("Identifier").str[-1]
# )
print(
    "Unique values in the 'type' column after splitting:", records_df["type"].unique()
)

Unique values in the 'type' column after splitting: ['HKQuantityTypeIdentifierBloodGlucose'
 'HKQuantityTypeIdentifierDietaryWater'
 'HKQuantityTypeIdentifierBodyMassIndex'
 'HKQuantityTypeIdentifierBodyMass' 'HKQuantityTypeIdentifierHeartRate'
 'HKQuantityTypeIdentifierOxygenSaturation'
 'HKQuantityTypeIdentifierRespiratoryRate'
 'HKQuantityTypeIdentifierStepCount'
 'HKQuantityTypeIdentifierDistanceWalkingRunning'
 'HKQuantityTypeIdentifierBasalEnergyBurned'
 'HKQuantityTypeIdentifierActiveEnergyBurned'
 'HKQuantityTypeIdentifierFlightsClimbed'
 'HKQuantityTypeIdentifierDietaryFatTotal'
 'HKQuantityTypeIdentifierDietaryFatSaturated'
 'HKQuantityTypeIdentifierDietaryCholesterol'
 'HKQuantityTypeIdentifierDietarySodium'
 'HKQuantityTypeIdentifierDietaryCarbohydrates'
 'HKQuantityTypeIdentifierDietaryFiber'
 'HKQuantityTypeIdentifierDietarySugar'
 'HKQuantityTypeIdentifierDietaryEnergyConsumed'
 'HKQuantityTypeIdentifierDietaryProtein'
 'HKQuantityTypeIdentifierDietaryPotassium'
 'HKQuan

In [4]:
# Step 1: Find unique elements, their attributes, and nested elements
def analyse_xml_structure(element, structure=None, path=""):
    if structure is None:
        structure = defaultdict(lambda: {"attributes": set(), "nested_elements": set()})

    current_path = f"{path}/{element.tag}" if path else element.tag
    structure[current_path]["attributes"].update(element.attrib.keys())

    for child in element:
        structure[current_path]["nested_elements"].add(child.tag)
        analyse_xml_structure(child, structure, current_path)

    return structure


xml_structure = analyse_xml_structure(root)

# Print the structure
for element, info in xml_structure.items():
    print(f"Element: {element}")
    print(f"Attributes: {', '.join(info['attributes'])}")
    print(f"Nested Elements: {', '.join(info['nested_elements'])}")
    print()

Element: HealthData
Attributes: locale
Nested Elements: ActivitySummary, Me, Record, ExportDate, Workout

Element: HealthData/ExportDate
Attributes: value
Nested Elements: 

Element: HealthData/Me
Attributes: HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse, HKCharacteristicTypeIdentifierDateOfBirth, HKCharacteristicTypeIdentifierBloodType, HKCharacteristicTypeIdentifierFitzpatrickSkinType, HKCharacteristicTypeIdentifierBiologicalSex
Nested Elements: 

Element: HealthData/Record
Attributes: startDate, value, type, creationDate, endDate, sourceName, unit, sourceVersion, device
Nested Elements: HeartRateVariabilityMetadataList, MetadataEntry

Element: HealthData/Record/MetadataEntry
Attributes: value, key
Nested Elements: 

Element: HealthData/Workout
Attributes: startDate, workoutActivityType, duration, creationDate, endDate, sourceName, durationUnit, sourceVersion, device
Nested Elements: WorkoutActivity, WorkoutStatistics, MetadataEntry, WorkoutRoute, WorkoutEvent

Element: H

In [None]:
# Step 2: Function to extract elements into DataFrames
def extract_element_to_df(root, element_name):
    data = []
    for elem in root.iter(element_name):
        row = elem.attrib.copy()
        for child in elem:
            if list(child):  # If the child has nested elements
                row[child.tag] = ET.tostring(child, encoding="unicode")
            else:
                row[child.tag] = child.text
        data.append(row)
    return pd.DataFrame(data)


# Extract specific elements (example)
record_df = extract_element_to_df(root, "Record")
workout_df = extract_element_to_df(root, "Workout")
activity_summary_df = extract_element_to_df(root, "ActivitySummary")

In [None]:
# Now you can inspect these DataFrames in the variable explorer

# Step 3: Function to save DataFrames to Excel (to be used later)
def save_dfs_to_excel(dfs_dict, file_name):
    file_manager = FileManager()
    with pd.ExcelWriter(file_name) as writer:
        for sheet_name, df in dfs_dict.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    file_manager.save_file(FileDirectory.RAW_DATA_PATH, file_name, writer)


# Example usage (commented out for now)
# dfs_to_save = {
#     'Record': record_df,
#     'Workout': workout_df,
#     'ActivitySummary': activity_summary_df
# }
# save_dfs_to_excel(dfs_to_save, 'health_data.xlsx')