# Load Data from apple_health_export/ 
The export is originally an XML file; we want to convert it to CSV.

In [3]:
# Load needed packages
from pathlib import Path

import pandas as pd
import xmltodict

In [4]:
# use your own directory, this is where I exported my data
data_path = 'apple_health_export/export.xml'

# read XML file, convert to a dictionary
# The encoding is passed explicitly so the read does not depend on the platform's
# default locale encoding. NOTE(review): assumes the export is UTF-8 -- confirm
# against the XML declaration on the first line of export.xml.
with open(data_path, 'r', encoding='utf-8') as xml_file:
    all_data = xmltodict.parse(xml_file.read())

In [5]:
# inspect which top-level sections the export contains (the parsed XML is a nested dictionary)
health_data = all_data['HealthData']
health_data.keys()

odict_keys(['@locale', 'ExportDate', 'Me', 'Record', 'ActivitySummary'])

In [6]:
# pull the 'Record' entries (overall health data) out of the parsed export
records_list = all_data['HealthData']['Record']

# load those entries into a pandas dataframe
all_records = pd.DataFrame(data=records_list)

In [7]:
# activity summary (acts): pull the 'ActivitySummary' entries out of the parsed export
acts_list = all_data['HealthData']['ActivitySummary']

# load those entries into a pandas dataframe
acts_df = pd.DataFrame(data=acts_list)

In [8]:
# preview the first five rows of the activity summary
acts_df.head(n=5)

Unnamed: 0,@dateComponents,@activeEnergyBurned,@activeEnergyBurnedGoal,@activeEnergyBurnedUnit,@appleMoveTime,@appleMoveTimeGoal,@appleExerciseTime,@appleExerciseTimeGoal,@appleStandHours,@appleStandHoursGoal
0,2022-09-13,313.166,300,kcal,0,0,0,0,0,0
1,2022-09-14,391.495,300,kcal,0,0,0,0,0,0
2,2022-09-15,510.161,300,kcal,0,0,0,0,0,0
3,2022-09-16,360.928,300,kcal,0,0,0,0,0,0
4,2022-09-17,362.137,300,kcal,0,0,0,0,0,0


# Now that the data is imported, we need to clean it.

In [9]:
## Cleaning

# analysis window: rows strictly after START_DATE, up to and including END_DATE
START_DATE = '2021-07-30'
END_DATE = '2023-03-25'

# remove special characters from column names
# ('@' is matched literally; regex=False makes that explicit across pandas versions)
acts_df.columns = acts_df.columns.str.replace('@', '', regex=False)

# convert date column to date format
acts_df['dateComponents'] = pd.to_datetime(acts_df['dateComponents'])

# keep only the rows inside the analysis window.
# .copy() makes the result an independent dataframe, so assigning to its columns
# in later cells does not raise SettingWithCopyWarning.
in_window = (acts_df['dateComponents'] > START_DATE) & (acts_df['dateComponents'] <= END_DATE)
acts_df = acts_df.loc[in_window].copy()

In [10]:
# check the cleaned column names and parsed dates on the first five rows
acts_df.head(n=5)

Unnamed: 0,dateComponents,activeEnergyBurned,activeEnergyBurnedGoal,activeEnergyBurnedUnit,appleMoveTime,appleMoveTimeGoal,appleExerciseTime,appleExerciseTimeGoal,appleStandHours,appleStandHoursGoal
0,2022-09-13,313.166,300,kcal,0,0,0,0,0,0
1,2022-09-14,391.495,300,kcal,0,0,0,0,0,0
2,2022-09-15,510.161,300,kcal,0,0,0,0,0,0
3,2022-09-16,360.928,300,kcal,0,0,0,0,0,0
4,2022-09-17,362.137,300,kcal,0,0,0,0,0,0


Now that the dates and column names are cleaned, let's look at the data types.

In [11]:
# show the dtype of every column; any column still listed as `object` has not been converted yet
acts_df.dtypes

dateComponents            datetime64[ns]
activeEnergyBurned                object
activeEnergyBurnedGoal            object
activeEnergyBurnedUnit            object
appleMoveTime                     object
appleMoveTimeGoal                 object
appleExerciseTime                 object
appleExerciseTimeGoal             object
appleStandHours                   object
appleStandHoursGoal               object
dtype: object

In [12]:
# convert the rest of the columns to numeric dtypes (everything except the date and the Unit label).
# Columns are picked by name rather than by position, so a different column order
# in the export cannot push the wrong columns into the conversion.
cols = acts_df.columns
non_numeric_cols = ['dateComponents', 'activeEnergyBurnedUnit']
numeric_cols = [col for col in cols if col not in non_numeric_cols]

# errors='coerce' turns values that cannot be parsed as numbers into NaN instead of raising
acts_df[numeric_cols] = acts_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# round active energy so it is consistent
# (round() with no arguments rounds every numeric column to 0 decimal places)
acts_df = acts_df.round()

# For further analyses and visualizations, I will also clean the health records dataframe (`all_records`)

In [13]:
# strip the '@' character out of every column name
all_records.columns = [label.replace('@', '') for label in all_records.columns]

In [14]:
# distinct values found in the `type` column of the records
record_types = all_records['type']
record_types.unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierHeadphoneAudioExposure',
       'HKQuantityTypeIdentifierWalkingDoubleSupportPercentage',
       'HKQuantityTypeIdentifierWalkingSpeed',
       'HKQuantityTypeIdentifierWalkingStepLength',
       'HKQuantityTypeIdentifierWalkingAsymmetryPercentage',
       'HKDataTypeSleepDurationGoal',
       'HKQuantityTypeIdentifierAppleWalkingSteadiness',
       'HKCategoryTypeIdentifierSleepAnalysis',
       'HKCategoryTypeIdentifierHeadphoneAudioExposureEvent'],
      dtype=object)

# Now that the data is consistent, write it to CSV and use those files from here on out

In [15]:
# activity summary
activity_csv = Path("data/activitysummary.csv")

# create the output folder first: to_csv does not create missing directories,
# so a fresh checkout without data/ would otherwise fail here
activity_csv.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the dataframe's row index out of the file
acts_df.to_csv(activity_csv, index=False)

In [16]:
# all records: written next to the original export
# (no index argument is passed, so the dataframe's row index is written as the first column)
records_csv_path = "apple_health_export/all_records.csv"
all_records.to_csv(records_csv_path)