## ***Analysis Of My Apple Watch Data***


In [135]:
import pandas as pd
import xmltodict

- ### Step 1. Data Collection

In [136]:
input_path = 'apple_health_export\\export.xml'
with open(input_path,'r') as xml_file:
    input_data = xmltodict.parse(xml_file.read())

In [137]:
records_list = input_data['HealthData']['Record']
df_healthData = pd.DataFrame(records_list)
df_healthData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228585 entries, 0 to 228584
Data columns (total 11 columns):
@type                               228585 non-null object
@sourceName                         228585 non-null object
@sourceVersion                      228581 non-null object
@unit                               226988 non-null object
@creationDate                       228585 non-null object
@startDate                          228585 non-null object
@endDate                            228585 non-null object
@value                              228578 non-null object
@device                             228238 non-null object
MetadataEntry                       18282 non-null object
HeartRateVariabilityMetadataList    341 non-null object
dtypes: object(11)
memory usage: 19.2+ MB


**Removing the @ from columns**

In [138]:
df_healthData.columns = df_records.columns.str.replace('@','')
df_healthData.columns

Index(['type', 'sourceName', 'sourceVersion', 'unit', 'creationDate',
       'startDate', 'endDate', 'value', 'device', 'MetadataEntry',
       'HeartRateVariabilityMetadataList'],
      dtype='object')

In [139]:
df_healthData.describe()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,MetadataEntry,HeartRateVariabilityMetadataList
count,228585,228585,228581,226988,228585,228585,228585,228578,228238,18282,341
unique,17,4,22,9,69480,174803,174668,38166,85863,6,341
top,HKQuantityTypeIdentifierActiveEnergyBurned,Devâ€™s AppleÂ Watch,5.3.1,kcal,2019-11-19 18:15:29 +0530,2019-08-09 13:34:00 +0530,2019-07-22 13:29:05 +0530,1,"<<HKDevice: 0x2839e11d0>, name:Apple Watch, ma...",{'@key': 'HKMetadataKeyHeartRateMotionContext'...,{'InstantaneousBeatsPerMinute': [{'@bpm': '73'...
freq,85145,151998,84411,105682,289,5,6,9109,15,12207,1


- ### Step 2. Data Cleaning 

Data Cleaning is very important part because its *Garbage In Garbage Out* sceniro. Currently the data is disturbed and messed. We will now clean the data and make it more sensible for analysis purpose.

In [140]:
df_healthData

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,MetadataEntry,HeartRateVariabilityMetadataList
0,HKQuantityTypeIdentifierHeight,Health,10.0.2,cm,2016-10-23 00:01:20 +0530,2016-10-23 00:01:20 +0530,2016-10-23 00:01:20 +0530,168,,,
1,HKQuantityTypeIdentifierBodyMass,Health,10.0.2,kg,2016-10-23 00:01:20 +0530,2016-10-23 00:01:20 +0530,2016-10-23 00:01:20 +0530,56,,,
2,HKQuantityTypeIdentifierBodyMass,iPhone,12.3.1,kg,2019-06-21 07:38:54 +0530,2019-06-21 07:38:53 +0530,2019-06-21 07:38:53 +0530,62,,,
3,HKQuantityTypeIdentifierHeartRate,Devâ€™s AppleÂ Watch,5.1.1,count/min,2019-06-21 07:45:34 +0530,2019-06-21 07:44:06 +0530,2019-06-21 07:44:06 +0530,65,"<<HKDevice: 0x28395caa0>, name:Apple Watch, ma...",{'@key': 'HKMetadataKeyHeartRateMotionContext'...,
4,HKQuantityTypeIdentifierHeartRate,Devâ€™s AppleÂ Watch,5.1.1,count/min,2019-06-21 07:47:08 +0530,2019-06-21 07:46:05 +0530,2019-06-21 07:46:05 +0530,62.439,"<<HKDevice: 0x28395ce60>, name:Apple Watch, ma...",{'@key': 'HKMetadataKeyHeartRateMotionContext'...,
...,...,...,...,...,...,...,...,...,...,...,...
228580,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Devâ€™s AppleÂ Watch,5.3.1,ms,2020-02-17 14:04:52 +0530,2020-02-17 14:03:47 +0530,2020-02-17 14:04:52 +0530,47.8259,"<<HKDevice: 0x283953d90>, name:Apple Watch, ma...",,{'InstantaneousBeatsPerMinute': [{'@bpm': '80'...
228581,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Devâ€™s AppleÂ Watch,5.3.1,ms,2020-02-17 17:22:42 +0530,2020-02-17 17:21:39 +0530,2020-02-17 17:22:42 +0530,36.2858,"<<HKDevice: 0x283950550>, name:Apple Watch, ma...",,{'InstantaneousBeatsPerMinute': [{'@bpm': '90'...
228582,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Devâ€™s AppleÂ Watch,5.3.1,ms,2020-02-17 20:56:16 +0530,2020-02-17 20:55:11 +0530,2020-02-17 20:56:16 +0530,18.2534,"<<HKDevice: 0x283953020>, name:Apple Watch, ma...",,{'InstantaneousBeatsPerMinute': [{'@bpm': '78'...
228583,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Devâ€™s AppleÂ Watch,5.3.1,ms,2020-02-18 14:19:47 +0530,2020-02-18 14:18:43 +0530,2020-02-18 14:19:47 +0530,59.6529,"<<HKDevice: 0x2839537a0>, name:Apple Watch, ma...",,{'InstantaneousBeatsPerMinute': [{'@bpm': '96'...


**Read** : As we observe, the data is not clean. For example - *type* column has lengthy string values with some prefix, sourceName having special charaters in its values etc.

We will now cleanup the data by taking all the columns one by one. We will fill the null/NA values and make data more uniform and sensible for analysis.

In [141]:
# Getting the unique values
df_healthData.type.unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierRestingHeartRate',
       'HKQuantityTypeIdentifierVO2Max',
       'HKQuantityTypeIdentifierWalkingHeartRateAverage',
       'HKCategoryTypeIdentifierSleepAnalysis',
       'HKCategoryTypeIdentifierAppleStandHour',
       'HKCategoryTypeIdentifierMindfulSession',
       'HKCategoryTypeIdentifierHighHeartRateEvent',
       'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'], dtype=object)

In [142]:
df_healthData['type'].replace(to_replace=['HKQuantityTypeIdentifier',
                                                     'HKCategoryTypeIdentifier',
                                                     'HeartRateVariability'],value='',inplace=True,regex=True)
df_healthData.type.unique()

array(['Height', 'BodyMass', 'HeartRate', 'StepCount',
       'DistanceWalkingRunning', 'BasalEnergyBurned',
       'ActiveEnergyBurned', 'FlightsClimbed', 'AppleExerciseTime',
       'RestingHeartRate', 'VO2Max', 'WalkingHeartRateAverage',
       'SleepAnalysis', 'AppleStandHour', 'MindfulSession',
       'HighHeartRateEvent', 'SDNN'], dtype=object)

**Read** : As the first column *type* is now clean as it lengthy string values converted in short and more readable values and not having null/NA values, now we will move on to the second column *sourceName*.

In [143]:
# Observing the unqiure values
df_healthData.sourceName.unique()

array(['Health', 'iPhone', 'Devâ€™s AppleÂ\xa0Watch', 'Clock'],
      dtype=object)

**Read** : As we see we are having the garbage value in this columns, lets replace it with iWatch.


In [144]:
df_healthData.sourceName.replace('Devâ€™s AppleÂ\xa0Watch','iWatch',inplace=True)
df_healthData.sourceName.unique()

array(['Health', 'iPhone', 'iWatch', 'Clock'], dtype=object)

In [145]:
df_healthData.sourceVersion.unique()

array(['10.0.2', '12.3.1', '5.1.1', '5.3.1', '10.1', '10.1.1', '10.2',
       '10.2.1', '11.0.2', '11.0.3', '11.1', '11.1.1', '11.1.2', '11.2',
       '11.2.1', '11.2.2', '11.2.5', '11.2.6', '11.3', '11.4.1', '12.4.1',
       nan, '50'], dtype=object)

In [146]:
df_healthData = df_healthData[df_healthData.sourceVersion != '50']

In [152]:
df_healthData.sourceVersion.dropna(inplace=True)

In [156]:
df_healthData.sourceVersion.unique()

array(['10.0.2', '12.3.1', '5.1.1', '5.3.1', '10.1', '10.1.1', '10.2',
       '10.2.1', '11.0.2', '11.0.3', '11.1', '11.1.1', '11.1.2', '11.2',
       '11.2.1', '11.2.2', '11.2.5', '11.2.6', '11.3', '11.4.1', '12.4.1'],
      dtype=object)

In [155]:
df_healthData.isnull().sum()

type                                     0
sourceName                               0
sourceVersion                            4
unit                                  1594
creationDate                             0
startDate                                0
endDate                                  0
value                                    7
device                                 347
MetadataEntry                       210303
HeartRateVariabilityMetadataList    228241
dtype: int64

In [157]:
df_healthData.unit.unique()

array(['cm', 'kg', 'count/min', 'count', 'km', 'kcal', 'min',
       'mL/minÂ·kg', nan, 'ms'], dtype=object)

In [158]:
df_healthData.unit.dropna(inplace=True)

In [161]:
df_healthData.unit.isnull().sum()

0