In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

# Load the health export XML and grab the document root; both `tree` and
# `root` are kept as notebook-level names for later cells.
tree = ET.parse('data/export.xml')
root = tree.getroot()

# One attribute dict per <Record> element found anywhere under the root.
record_list = [record.attrib for record in root.iter('Record')]

In [13]:
# Build the record table from the list of attribute dicts collected from the
# <Record> elements, then normalise column types and shorten the type names.
record_data = pd.DataFrame(record_list)

# proper type to dates
for col in ['creationDate', 'startDate', 'endDate']:
    record_data[col] = pd.to_datetime(record_data[col])

# value is numeric; anything that cannot be parsed becomes NaN
record_data['value'] = pd.to_numeric(record_data['value'], errors='coerce')

# some records do not measure anything, just count occurrences;
# filling with 1.0 (= one time) makes it easier to aggregate
record_data['value'] = record_data['value'].fillna(1.0)

# shorter observation names: strip the identifier prefixes.
# regex=False forces a literal substring replacement, so the result does not
# depend on the pandas version's default for `regex`.
# Stripping the generic quantity prefix already covers every
# 'HKQuantityTypeIdentifier...' name (e.g. BasalEnergyBurned), so no
# per-type replacement is needed afterwards.
for prefix in ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier'):
    record_data['type'] = record_data['type'].str.replace(prefix, '', regex=False)
record_data.head()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
0,DietaryWater,HabitMinder,442,mL,2022-09-12 12:00:43-06:00,2022-09-12 12:00:43-06:00,2022-09-12 12:00:43-06:00,250.0,
1,DietaryWater,HabitMinder,442,mL,2022-09-12 12:51:53-06:00,2022-09-12 12:51:53-06:00,2022-09-12 12:51:53-06:00,250.0,
2,DietaryWater,HabitMinder,442,mL,2022-09-12 13:18:14-06:00,2022-09-12 13:18:14-06:00,2022-09-12 13:18:14-06:00,250.0,
3,DietaryWater,HabitMinder,442,mL,2022-09-12 14:35:20-06:00,2022-09-12 14:35:20-06:00,2022-09-12 14:35:20-06:00,250.0,
4,DietaryWater,HabitMinder,442,mL,2022-09-12 15:31:26-06:00,2022-09-12 15:31:26-06:00,2022-09-12 15:31:26-06:00,250.0,


In [4]:
# One attribute dict per <Workout> element in the export.
workout_list = [x.attrib for x in root.iter('Workout')]
# Preview only the first few entries: echoing the whole list prints one dict
# per workout and floods the notebook output.
workout_list[:3]

[{'workoutActivityType': 'HKWorkoutActivityTypeWalking',
  'duration': '57.32529756426811',
  'durationUnit': 'min',
  'sourceName': 'Harikrishna’s Apple\xa0Watch',
  'sourceVersion': '6.2.5',
  'device': '<<HKDevice: 0x283bac5f0>, name:Apple Watch, manufacturer:Apple Inc., model:Watch, hardware:Watch3,4, software:6.2.5>',
  'creationDate': '2020-06-23 12:06:38 -0600',
  'startDate': '2020-06-23 11:09:14 -0600',
  'endDate': '2020-06-23 12:06:34 -0600'},
 {'workoutActivityType': 'HKWorkoutActivityTypeWalking',
  'duration': '72.03287528355916',
  'durationUnit': 'min',
  'sourceName': 'Harikrishna’s Apple\xa0Watch',
  'sourceVersion': '6.2.5',
  'device': '<<HKDevice: 0x283bac5f0>, name:Apple Watch, manufacturer:Apple Inc., model:Watch, hardware:Watch3,4, software:6.2.5>',
  'creationDate': '2020-06-24 12:19:52 -0600',
  'startDate': '2020-06-24 11:07:43 -0600',
  'endDate': '2020-06-24 12:19:45 -0600'},
 {'workoutActivityType': 'HKWorkoutActivityTypeWalking',
  'duration': '57.1677604

In [5]:
# Attribute dicts of every <Workout> element, turned into a table.
workout_list = [workout.attrib for workout in root.iter('Workout')]
workout_data = pd.DataFrame(workout_list)

# Convert the three timestamp columns to datetimes and the duration string
# to a number in a single assign; existing columns keep their positions.
workout_data = workout_data.assign(
    **{
        date_col: pd.to_datetime(workout_data[date_col])
        for date_col in ('creationDate', 'startDate', 'endDate')
    },
    duration=pd.to_numeric(workout_data['duration']),
)
# NOTE(review): the two conversions below are left disabled, as in the
# original cell - presumably because these attributes are absent from the
# workouts in this export; confirm before enabling.
# workout_data['totalEnergyBurned'] = pd.to_numeric(workout_data['totalEnergyBurned'])
# workout_data['totalDistance'] = pd.to_numeric(workout_data['totalDistance'])
workout_data.head()

Unnamed: 0,workoutActivityType,duration,durationUnit,sourceName,sourceVersion,device,creationDate,startDate,endDate
0,HKWorkoutActivityTypeWalking,57.325298,min,Harikrishna’s Apple Watch,6.2.5,"<<HKDevice: 0x283bac5f0>, name:Apple Watch, ma...",2020-06-23 12:06:38-06:00,2020-06-23 11:09:14-06:00,2020-06-23 12:06:34-06:00
1,HKWorkoutActivityTypeWalking,72.032875,min,Harikrishna’s Apple Watch,6.2.5,"<<HKDevice: 0x283bac5f0>, name:Apple Watch, ma...",2020-06-24 12:19:52-06:00,2020-06-24 11:07:43-06:00,2020-06-24 12:19:45-06:00
2,HKWorkoutActivityTypeWalking,57.16776,min,Harikrishna’s Apple Watch,6.2.8,"<<HKDevice: 0x283bac500>, name:Apple Watch, ma...",2020-08-24 11:25:00-06:00,2020-08-24 10:26:58-06:00,2020-08-24 11:24:53-06:00
3,HKWorkoutActivityTypeWalking,59.581077,min,Harikrishna’s Apple Watch,6.2.8,"<<HKDevice: 0x283bac500>, name:Apple Watch, ma...",2020-08-25 10:58:57-06:00,2020-08-25 09:57:03-06:00,2020-08-25 10:58:51-06:00
4,HKWorkoutActivityTypeWalking,82.133357,min,Harikrishna’s Apple Watch,6.2.8,"<<HKDevice: 0x283bac500>, name:Apple Watch, ma...",2020-08-26 11:07:41-06:00,2020-08-26 09:45:22-06:00,2020-08-26 11:07:30-06:00


In [6]:
# Total number of workouts = number of rows in the workout table.
num_workouts = len(workout_data)

In [7]:
# Display the workout count (row count of workout_data) as the cell output.
num_workouts

481

In [11]:
# Count how many workouts each recording source (sourceName) contributed.
workout_data['sourceName'].value_counts()

sourceName
Harikrishna’s Apple Watch    402
Nike Training                 48
Nike Run Club                 27
Strava                         3
SmartGym                       1
Name: count, dtype: int64

In [12]:
# Count workouts per activity type; value_counts lists the most frequent first.
workout_data['workoutActivityType'].value_counts()

workoutActivityType
HKWorkoutActivityTypeWalking                          251
HKWorkoutActivityTypeTraditionalStrengthTraining       89
HKWorkoutActivityTypeRunning                           57
HKWorkoutActivityTypeHighIntensityIntervalTraining     29
HKWorkoutActivityTypeFunctionalStrengthTraining        22
HKWorkoutActivityTypeElliptical                        17
HKWorkoutActivityTypeCycling                            9
HKWorkoutActivityTypeCrossTraining                      3
HKWorkoutActivityTypeHiking                             3
HKWorkoutActivityTypeStairClimbing                      1
Name: count, dtype: int64