In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

In [2]:
# Parse the Apple Health export file into an ElementTree.
tree = ET.parse(source='apple_health_export/export.xml')

In [3]:
# Collect the attribute mapping of every <Record> element in the export.
# Each entry is the element's own `attrib` dict (not a copy).
root = tree.getroot()
record_list = [x.attrib for x in root.iter('Record')]

# Preview only the first few records: echoing the whole list would write
# every record's attributes into the cell output.
record_list[:3]

[{'type': 'HKQuantityTypeIdentifierHeight',
  'sourceName': 'Health',
  'sourceVersion': '17.2.1',
  'unit': 'ft',
  'creationDate': '2024-01-19 01:11:52 -0500',
  'startDate': '2024-01-19 01:11:52 -0500',
  'endDate': '2024-01-19 01:11:52 -0500',
  'value': '5.75'},
 {'type': 'HKQuantityTypeIdentifierHeartRate',
  'sourceName': 'Chloe’s Apple\xa0Watch',
  'sourceVersion': '8.0.1',
  'device': '<<HKDevice: 0x3015803c0>, name:Apple Watch, manufacturer:Apple Inc., model:Watch, hardware:Watch3,3, software:8.0.1>',
  'unit': 'count/min',
  'creationDate': '2021-10-21 01:58:27 -0500',
  'startDate': '2021-10-21 01:58:02 -0500',
  'endDate': '2021-10-21 01:58:02 -0500',
  'value': '80'},
 {'type': 'HKQuantityTypeIdentifierHeartRate',
  'sourceName': 'Chloe’s Apple\xa0Watch',
  'sourceVersion': '8.0.1',
  'device': '<<HKDevice: 0x3015803c0>, name:Apple Watch, manufacturer:Apple Inc., model:Watch, hardware:Watch3,3, software:8.0.1>',
  'unit': 'count/min',
  'creationDate': '2021-10-21 02:00:1

In [4]:
# Build a DataFrame with one row per record and one column per attribute.
record_data = pd.DataFrame(record_list)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" after the summary).
record_data.info()

# Show the record_data DataFrame (rich display).
display(record_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1545669 entries, 0 to 1545668
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   type           1545669 non-null  object
 1   sourceName     1545669 non-null  object
 2   sourceVersion  1545668 non-null  object
 3   unit           1523172 non-null  object
 4   creationDate   1545669 non-null  object
 5   startDate      1545669 non-null  object
 6   endDate        1545669 non-null  object
 7   value          1545669 non-null  object
 8   device         1502383 non-null  object
dtypes: object(9)
memory usage: 106.1+ MB
None


Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
0,HKQuantityTypeIdentifierHeight,Health,17.2.1,ft,2024-01-19 01:11:52 -0500,2024-01-19 01:11:52 -0500,2024-01-19 01:11:52 -0500,5.75,
1,HKQuantityTypeIdentifierHeartRate,Chloe’s Apple Watch,8.0.1,count/min,2021-10-21 01:58:27 -0500,2021-10-21 01:58:02 -0500,2021-10-21 01:58:02 -0500,80,"<<HKDevice: 0x3015803c0>, name:Apple Watch, ma..."
2,HKQuantityTypeIdentifierHeartRate,Chloe’s Apple Watch,8.0.1,count/min,2021-10-21 02:00:18 -0500,2021-10-21 01:59:03 -0500,2021-10-21 01:59:03 -0500,76.5744,"<<HKDevice: 0x3015803c0>, name:Apple Watch, ma..."
3,HKQuantityTypeIdentifierHeartRate,Chloe’s Apple Watch,8.0.1,count/min,2021-10-21 02:04:04 -0500,2021-10-21 02:01:39 -0500,2021-10-21 02:01:39 -0500,76,"<<HKDevice: 0x3015803c0>, name:Apple Watch, ma..."
4,HKQuantityTypeIdentifierHeartRate,Chloe’s Apple Watch,8.0.1,count/min,2021-10-21 02:09:22 -0500,2021-10-21 02:04:49 -0500,2021-10-21 02:04:49 -0500,74,"<<HKDevice: 0x3015803c0>, name:Apple Watch, ma..."
...,...,...,...,...,...,...,...,...,...
1545664,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Chloe’s Apple Watch,8.8.1,ms,2024-04-12 11:48:47 -0500,2024-04-12 11:47:37 -0500,2024-04-12 11:48:36 -0500,18.2037,"<<HKDevice: 0x301583750>, name:Apple Watch, ma..."
1545665,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Chloe’s Apple Watch,8.8.1,ms,2024-04-12 13:21:09 -0500,2024-04-12 13:19:54 -0500,2024-04-12 13:20:53 -0500,22.1422,"<<HKDevice: 0x301583750>, name:Apple Watch, ma..."
1545666,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Chloe’s Apple Watch,8.8.1,ms,2024-04-12 15:20:57 -0500,2024-04-12 15:19:42 -0500,2024-04-12 15:20:42 -0500,73.5801,"<<HKDevice: 0x301583750>, name:Apple Watch, ma..."
1545667,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Chloe’s Apple Watch,8.8.1,ms,2024-04-12 17:54:48 -0500,2024-04-12 17:53:36 -0500,2024-04-12 17:54:17 -0500,27.2249,"<<HKDevice: 0x301583750>, name:Apple Watch, ma..."


In [5]:
# List the distinct record types present in the 'type' column.
record_data['type'].unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierDietaryFatTotal',
       'HKQuantityTypeIdentifierDietaryFatSaturated',
       'HKQuantityTypeIdentifierDietarySodium',
       'HKQuantityTypeIdentifierDietarySugar',
       'HKQuantityTypeIdentifierDietaryProtein',
       'HKQuantityTypeIdentifierDietaryVitaminD',
       'HKQuantityTypeIdentifierDietaryIron',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierDietaryCaffeine',
       'HKQuantityTypeIdentifierDistanceSwimming',
       'HKQuantityTypeIdentifierSwimmingStrokeCount',
       'HKQuantityTypeIdentifierRestingHeartRate',
       '

In [13]:
# Drop the columns the analysis does not use: 'sourceName', 'sourceVersion',
# 'device', 'creationDate' and 'endDate'.
record_data_cleaned = record_data.drop(
    columns=['sourceName', 'sourceVersion', 'device', 'creationDate', 'endDate']
)

# Parse 'startDate' once and derive every calendar field from that single
# result, instead of re-parsing the full column for each derived field.
start_dates = pd.to_datetime(record_data['startDate'])
record_data_cleaned['Day'] = start_dates.dt.strftime('%A')         # weekday name
record_data_cleaned['Date'] = start_dates.dt.strftime('%Y-%m-%d')  # calendar date
record_data_cleaned['Month'] = start_dates.dt.strftime('%B')       # month name

# 'value' becomes numeric; entries that cannot be parsed become NaN.
record_data_cleaned['value'] = pd.to_numeric(record_data['value'], errors='coerce')

# Shorter observation names: strip the HealthKit identifier prefixes.
# regex=False makes the replacement a literal substring match regardless of
# the pandas version's default for `regex`.
record_data_cleaned['type'] = (
    record_data_cleaned['type']
    .str.replace('HKQuantityTypeIdentifier', '', regex=False)
    .str.replace('HKCategoryTypeIdentifier', '', regex=False)
)

# Keep only the analysis columns, in a fixed order.
record_data_cleaned = record_data_cleaned[['type', 'Date', 'Day', 'Month', 'value', 'unit']]

In [14]:
# Dictionary of DataFrames, one per record type of interest, keyed by the
# shortened type name.
record_data_df_dict = {}

# Record types of interest (names as they appear in record_data_cleaned['type']
# after the HealthKit prefixes have been stripped).
record_types = [
   'ActiveEnergyBurned',
   'BasalEnergyBurned',
   'DistanceWalkingRunning',
   'StepCount',
   'AppleStandTime',
   'WalkingSpeed',
   'RunningSpeed',
   'HeartRate',
   'RestingHeartRate',
   'WalkingHeartRateAverage',
   'FlightsClimbed',
   ]


# Create one DataFrame per record type: select that type's rows, rename the
# generic 'value' column to the type name, and order the rows by date.
# The comparison is an exact match on purpose: a substring test such as
# str.contains('HeartRate') would also pull in 'RestingHeartRate',
# 'WalkingHeartRateAverage' and 'HeartRateVariabilitySDNN' rows.
for record_type in record_types:
   record_data_df_dict[record_type] = (
       record_data_cleaned
       .loc[record_data_cleaned['type'] == record_type]
       .rename(columns={"value": record_type})
       .sort_values(by='Date')
   )

In [15]:
# Record types whose samples are added up to obtain a per-day total.
key_get_sum = [
    'BasalEnergyBurned',
    'ActiveEnergyBurned',
    'DistanceWalkingRunning',
    'StepCount',
    'AppleStandTime',
    'FlightsClimbed',
]

# For each of those types, build a frame with one row per 'Date' holding the
# summed value and the most frequent weekday name seen on that date.
record_data_df_dict_daily = {}
for key in key_get_sum:
    frame = record_data_df_dict[key]
    daily_totals = frame.groupby('Date').agg(
        {key: 'sum', 'Day': lambda days: days.mode().iat[0]}
    )
    record_data_df_dict_daily[key] = daily_totals.reset_index()

In [16]:
# Monthly totals for the summed record types: one row per year-month with the
# summed value and the most frequent month name in that group.
record_data_df_dict_monthly = {}
for key in key_get_sum:
    frame = record_data_df_dict[key]
    # Dropping the last three characters of 'YYYY-MM-DD' leaves 'YYYY-MM';
    # the grouping Series keeps the name 'Date', so the output column does too.
    year_month = frame['Date'].str[:-3]
    monthly_totals = frame.groupby(year_month).agg(
        {key: 'sum', 'Month': lambda months: months.mode().iat[0]}
    )
    record_data_df_dict_monthly[key] = monthly_totals.reset_index()

In [26]:
# Inspect the per-day aggregates: a dict mapping each summed record type to
# its daily DataFrame (columns: 'Date', the summed value, 'Day').
record_data_df_dict_daily

{'BasalEnergyBurned':            Date  BasalEnergyBurned        Day
 0    2021-10-21           1609.118   Thursday
 1    2021-10-22           1603.657     Friday
 2    2021-10-23           1558.498   Saturday
 3    2021-10-24           1581.017     Sunday
 4    2021-10-25           1529.068     Monday
 ..          ...                ...        ...
 893  2024-04-08           1568.968     Monday
 894  2024-04-09           1746.560    Tuesday
 895  2024-04-10           1705.533  Wednesday
 896  2024-04-11           1593.286   Thursday
 897  2024-04-12           1312.554     Friday
 
 [898 rows x 3 columns],
 'ActiveEnergyBurned':            Date  ActiveEnergyBurned        Day
 0    2021-10-21             654.440   Thursday
 1    2021-10-22             364.501     Friday
 2    2021-10-23             185.123   Saturday
 3    2021-10-24             193.342     Sunday
 4    2021-10-25             140.348     Monday
 ..          ...                 ...        ...
 890  2024-04-08             2

In [36]:
# Build one row per <Record> element: the record's attributes plus, when the
# record has <Goal> descendants, selected statistics read from them.
# NOTE(review): confirm that <Record> elements actually contain <Goal>
# descendants in this export; if they do not, the inner loop adds nothing and
# the resulting frame holds just the record attributes.
daily_data = []
goals = list(root.iter('Record'))

for record in goals:
    # Copy the attributes so the keys added below do not write into the
    # element's own attrib dict (which lives in the parsed tree and is shared
    # with any other structure holding a reference to it).
    goal_dict = dict(record.attrib)

    for goal_statistics in record.iter("Goal"):
        goal_type = goal_statistics.attrib['type']
        if "StepCount" in goal_type:
            goal_dict['StepCount'] = goal_statistics.attrib['sum']
        if "HeartRate" in goal_type:
            goal_dict['HeartRate'] = goal_statistics.attrib['avg']
        if "StandTime" in goal_type:
            goal_dict['StandTime'] = goal_statistics.attrib['sum']

    daily_data.append(goal_dict)

# Create the final_goal_df DataFrame from the collected rows.
final_goal_df = pd.DataFrame(daily_data)

# List the distinct record types present in the frame.
final_goal_df['type'].unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierDietaryFatTotal',
       'HKQuantityTypeIdentifierDietaryFatSaturated',
       'HKQuantityTypeIdentifierDietarySodium',
       'HKQuantityTypeIdentifierDietarySugar',
       'HKQuantityTypeIdentifierDietaryProtein',
       'HKQuantityTypeIdentifierDietaryVitaminD',
       'HKQuantityTypeIdentifierDietaryIron',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierDietaryCaffeine',
       'HKQuantityTypeIdentifierDistanceSwimming',
       'HKQuantityTypeIdentifierSwimmingStrokeCount',
       'HKQuantityTypeIdentifierRestingHeartRate',
       '

In [39]:
# # selecting certain types
# final_heart_df = final_goal_df[final_goal_df['type'] == "HKQuantityTypeIdentifierHeartRate"]
# final_step_df = final_goal_df[final_goal_df['type'] == "HKQuantityTypeIdentifierStepCount"]
# final_stand_df = final_goal_df[final_goal_df['type'] == "HKQuantityTypeIdentifierStandTime"]
# final_2_df = pd.merge(final_heart_df, final_step_df, on='creationDate')
# final_3_df = pd.merge(final_2_df, final_stand_df, on = 'creationDate')

In [40]:
# final_3_df

Unnamed: 0,type_x,sourceName_x,sourceVersion_x,unit_x,creationDate,startDate_x,endDate_x,value_x,device_x,type_y,...,value_y,device_y,type,sourceName,sourceVersion,unit,startDate,endDate,value,device


In [42]:
# Drop the columns that are not needed: 'sourceName', 'sourceVersion',
# 'device', 'creationDate' and 'endDate'.
final_df_cleaned = final_goal_df.drop(
    columns=['sourceName', 'sourceVersion', 'device', 'creationDate', 'endDate']
)

# Parse 'startDate' once and derive the calendar date and weekday name from
# that single result instead of re-parsing the column for each field.
goal_start_dates = pd.to_datetime(final_goal_df['startDate'])
final_df_cleaned['Date'] = goal_start_dates.dt.strftime('%Y-%m-%d')
final_df_cleaned['Day'] = goal_start_dates.dt.strftime('%A')

# Shortened activity type: strip the HealthKit quantity prefix.
# regex=False makes this a literal substring replacement.
final_df_cleaned['ActivityType'] = final_goal_df['type'].str.replace(
    'HKQuantityTypeIdentifier', '', regex=False
)

# Reordered view of the selected columns.
# NOTE(review): this reordered frame is stored under a different name and is
# not the one displayed below; confirm which of the two frames is intended
# for display and downstream use.
final_workout_df_cleaned = final_df_cleaned[['Day', 'Date', 'startDate', 'value', 'unit', 'ActivityType']]

display(final_df_cleaned)

Unnamed: 0,type,unit,startDate,value,Date,Day,ActivityType
0,HKQuantityTypeIdentifierHeight,ft,2024-01-19 01:11:52 -0500,5.75,2024-01-19,Friday,Height
1,HKQuantityTypeIdentifierHeartRate,count/min,2021-10-21 01:58:02 -0500,80,2021-10-21,Thursday,HeartRate
2,HKQuantityTypeIdentifierHeartRate,count/min,2021-10-21 01:59:03 -0500,76.5744,2021-10-21,Thursday,HeartRate
3,HKQuantityTypeIdentifierHeartRate,count/min,2021-10-21 02:01:39 -0500,76,2021-10-21,Thursday,HeartRate
4,HKQuantityTypeIdentifierHeartRate,count/min,2021-10-21 02:04:49 -0500,74,2021-10-21,Thursday,HeartRate
...,...,...,...,...,...,...,...
1545664,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,ms,2024-04-12 11:47:37 -0500,18.2037,2024-04-12,Friday,HeartRateVariabilitySDNN
1545665,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,ms,2024-04-12 13:19:54 -0500,22.1422,2024-04-12,Friday,HeartRateVariabilitySDNN
1545666,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,ms,2024-04-12 15:19:42 -0500,73.5801,2024-04-12,Friday,HeartRateVariabilitySDNN
1545667,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,ms,2024-04-12 17:53:36 -0500,27.2249,2024-04-12,Friday,HeartRateVariabilitySDNN


In [43]:
# Write the cleaned frame (row index included) to a CSV file in the working directory.
final_df_cleaned.to_csv(path_or_buf="Chloe_Full_Health.csv")

In [18]:
# #drop 'creationDate' and 'endDate' column
# final_workout_df_cleaned = final_goal_df.drop(['sourceName','sourceVersion', 'device', 'creationDate','endDate'], axis=1)

# # transform creationDate into date format 
# final_workout_df_cleaned['Date'] = pd.to_datetime(final_goal_df['startDate']).dt.strftime('%Y-%m-%d')
# final_workout_df_cleaned['Day'] = pd.to_datetime(final_goal_df['startDate']).dt.strftime('%A')

# # rename Activity Type
# final_workout_df_cleaned['ActivityType'] = final_goal_df['type'].str.replace('HKQuantityTypeIdentifier','')

# # reorder column
# final_workout_df_cleaned = final_workout_df_cleaned[['Day', 'Date', 'ActivityType', 'value', 'durationUnit', 'activeEnergyBurned', 'basalEnergyBurned']]

# # transform data type of 'duration' from object into float
# final_workout_df_cleaned['duration'] = final_goal_df['duration'].astype(float)

# # transform data type of 'activeEnergyBurned' and 'basalEnergyBurned' from object into float
# final_workout_df_cleaned['activeEnergyBurned'] = final_goal_df['activeEnergyBurned'].astype(float)
# final_workout_df_cleaned['basalEnergyBurned'] = final_goal_df['basalEnergyBurned'].astype(float)

# display(final_workout_df_cleaned)

KeyboardInterrupt: 

In [6]:
# return recorded Steps Counted
# record_data.loc[(record_data['type'].str.contains("StepCount"))]

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
261341,HKQuantityTypeIdentifierStepCount,Chloe’s iPhone (2),14.7.1,count,2021-10-11 13:58:31 -0500,2021-10-11 13:47:27 -0500,2021-10-11 13:51:20 -0500,296,"<<HKDevice: 0x3015807d0>, name:iPhone, manufac..."
261342,HKQuantityTypeIdentifierStepCount,Chloe’s iPhone (2),14.7.1,count,2021-10-11 15:02:20 -0500,2021-10-11 14:51:18 -0500,2021-10-11 15:01:17 -0500,990,"<<HKDevice: 0x3015807d0>, name:iPhone, manufac..."
261343,HKQuantityTypeIdentifierStepCount,Chloe’s iPhone (2),14.7.1,count,2021-10-11 15:12:37 -0500,2021-10-11 15:01:17 -0500,2021-10-11 15:06:19 -0500,305,"<<HKDevice: 0x3015807d0>, name:iPhone, manufac..."
261344,HKQuantityTypeIdentifierStepCount,Chloe’s iPhone (2),14.7.1,count,2021-10-11 18:45:51 -0500,2021-10-11 18:34:18 -0500,2021-10-11 18:43:41 -0500,594,"<<HKDevice: 0x3015807d0>, name:iPhone, manufac..."
261345,HKQuantityTypeIdentifierStepCount,Chloe’s iPhone (2),14.7.1,count,2021-10-11 18:55:35 -0500,2021-10-11 18:44:33 -0500,2021-10-11 18:44:56 -0500,46,"<<HKDevice: 0x3015807d0>, name:iPhone, manufac..."
...,...,...,...,...,...,...,...,...,...
370078,HKQuantityTypeIdentifierStepCount,Chloe’s Apple Watch,8.8.1,count,2024-04-12 19:42:30 -0500,2024-04-12 19:37:31 -0500,2024-04-12 19:38:29 -0500,74,"<<HKDevice: 0x301581590>, name:Apple Watch, ma..."
370079,HKQuantityTypeIdentifierStepCount,Chloe’s Apple Watch,8.8.1,count,2024-04-12 19:42:30 -0500,2024-04-12 19:38:29 -0500,2024-04-12 19:39:03 -0500,20,"<<HKDevice: 0x301581590>, name:Apple Watch, ma..."
370080,HKQuantityTypeIdentifierStepCount,Chloe’s Apple Watch,8.8.1,count,2024-04-12 19:42:30 -0500,2024-04-12 19:39:28 -0500,2024-04-12 19:40:27 -0500,59,"<<HKDevice: 0x301581590>, name:Apple Watch, ma..."
370081,HKQuantityTypeIdentifierStepCount,Chloe’s Apple Watch,8.8.1,count,2024-04-12 19:42:30 -0500,2024-04-12 19:40:30 -0500,2024-04-12 19:40:43 -0500,21,"<<HKDevice: 0x301581590>, name:Apple Watch, ma..."
