<a href="https://colab.research.google.com/github/hankedwards/AppleWatch/blob/master/AppleHealth2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyze Apple Health Data

## Import the libraries

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime
from datetime import datetime, timedelta
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the export file parsed below is read
# from a path under this mount point.
from google.colab import drive
# Plain mount kept for reference: drive.mount('/content/drive')
# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already mounted in this session — confirm against the Colab documentation.
drive.mount("/content/drive", force_remount=True)

## Create element tree object

In [None]:
# Location of the Apple Health export inside the mounted Google Drive.
EXPORT_XML_PATH = '/content/drive/MyDrive/Colab Notebooks/Apple Health Data/export.xml'

# Parse the whole export into an in-memory element tree.
tree = ET.parse(EXPORT_XML_PATH)

In [None]:
# Walk the tree from its root and keep the attribute dict of every
# <Record> element, one dict per health record.
root = tree.getroot()
record_list = [record.attrib for record in root.iter('Record')]

In [None]:
# Build one row per health record from the list of attribute dicts;
# each attribute name becomes a column.
data = pd.DataFrame(record_list)

## Enable Google Colab Dataframe formatter

In [None]:
# Switch on Colab's data_table formatter for DataFrame output.
from google.colab import data_table
# NOTE(review): if the import below were re-enabled it would rebind the name
# `data`, which this notebook uses for the health-records DataFrame.
# from vega_datasets import data

data_table.enable_dataframe_formatter()

In [None]:
# The three timestamp columns are converted one at a time with
# pd.to_datetime so that date arithmetic and filtering work later on.
date_columns = ('creationDate', 'startDate', 'endDate')
for date_column in date_columns:
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Parse the readings as numbers; anything that cannot be parsed becomes NaN.
numeric_value = pd.to_numeric(data['value'], errors='coerce')

# Some records do not measure anything, they only mark an occurrence.
# Counting each of them as 1.0 (= one time) makes aggregation easier.
data['value'] = numeric_value.fillna(1.0)

In [None]:
# Shorten the record type names by removing the two HealthKit identifier
# markers wherever they occur in the 'type' strings.
hk_markers = ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier')
short_type = data['type']
for hk_marker in hk_markers:
    short_type = short_type.str.replace(hk_marker, '')
data['type'] = short_type

In [None]:
# Count how many records exist for each (shortened) record type.
data['type'].value_counts()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
data.info()

In [None]:
# Keep records created on or after 2020-01-01 and drop the source/device
# metadata columns. drop() returns a new frame here instead of mutating a
# filtered slice of `data` in place, which triggers SettingWithCopyWarning.
swim_df = (
    data[data['creationDate'] >= '2020-01-01']
    .drop(columns=['sourceName', 'sourceVersion', 'device'])
)

# Explicit copy: later cells add columns to swim_df1, so it must be an
# independent frame rather than a slice of swim_df.
swim_df1 = swim_df[swim_df['type'] == 'DistanceSwimming'].copy()

# info() prints its own report and returns None, so it is not wrapped in print().
swim_df1.info()
print(swim_df1.head(5))

## Now get the seconds for each lap

In [None]:
# Lap duration = end timestamp minus start timestamp (a timedelta).
# Dividing a timedelta by a one-second np.timedelta64 yields the duration
# as a plain float number of seconds.
lap_duration = swim_df1['endDate'] - swim_df1['startDate']
swim_df1['secperlap'] = lap_duration / np.timedelta64(1, 's')

# Calendar date (datetime.date objects) on which each lap started.
swim_df1['start'] = pd.to_datetime(swim_df1['startDate']).dt.date


In [None]:
# Let's look at why August 22 2021 has bad data: show every lap whose start
# date falls between 2021-08-21 and 2021-08-23 (inclusive).
#
# 'start' holds datetime.date objects, so the bounds must be dates as well
# (comparing a date with a plain string raises TypeError). The two conditions
# are combined as boolean masks first and used to index the frame once;
# AND-ing two already-filtered DataFrames does not select rows.
window_start = pd.Timestamp('2021-08-21').date()
window_end = pd.Timestamp('2021-08-23').date()
in_window = (swim_df1['start'] >= window_start) & (swim_df1['start'] <= window_end)
bad_df = swim_df1[in_window]
bad_df

### Now get statistics on seconds per lap

In [None]:
# Summary statistics for the per-lap duration column (seconds).
swim_df1[["secperlap"]].describe()

In [None]:
# Mean lap duration for each 'start' date; the result is a Series indexed by date.
swim_by_date = swim_df1.groupby("start")["secperlap"].mean()

In [None]:
# Display the per-date mean lap durations.
swim_by_date

In [None]:
# Turn the per-date Series into a frame with 'start' and 'secperlap' columns.
swim_by_datedf = swim_by_date.to_frame().reset_index()

# info() prints its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
swim_by_datedf.info()

In [None]:
# Display the per-date averages as a table.
swim_by_datedf

In [None]:
# Let's look at why August 22 2021 has bad data: show the daily averages from
# that date onwards.
# 'start' holds datetime.date objects, so the bound must be a date as well;
# comparing a date with a plain string raises TypeError.
bad_since = pd.Timestamp('2021-08-22').date()
bad_df = swim_by_datedf[swim_by_datedf['start'] >= bad_since]
bad_df

In [None]:
import plotly.graph_objects as go

# Plot the per-date mean lap duration against the date.
fig = go.Figure(
    data=go.Scatter(x=swim_by_datedf['start'], y=swim_by_datedf['secperlap'])
)
# Centered bold title plus axis titles, so the figure can be read on its own.
fig.update_layout(
    title_text='<b>Average Time per Lap</b>',
    title_x=0.5,
    xaxis_title='Date',
    yaxis_title='Seconds per lap',
)
fig.show()

In [None]:
import plotly.express as px

# Histogram of the individual lap durations with a violin plot in the margin
# ("rug" is another option for the marginal plot).
fig = px.histogram(swim_df1, x="secperlap", marginal="violin")
fig.show()

In [None]:
# Scatter plot of the per-date mean lap duration ('secperlap') against 'start'.
# `px` is not imported in this cell; it must already be in scope from the
# earlier `import plotly.express as px`.
fig = px.scatter(swim_by_datedf, x="start", y='secperlap')
fig.show()

In [None]:
# Calendar date (datetime.date objects) on which each lap started.
# NOTE(review): this repeats the 'start' assignment made when 'secperlap' was
# computed; it looks redundant — confirm before removing.
swim_df1['start'] = pd.to_datetime(swim_df1['startDate']).dt.date

In [None]:
import seaborn as sns

# Histogram of the per-lap durations.
# The previous call could not run: it referenced an `axes` array that no cell
# creates, passed the whole DataFrame instead of one numeric column, and used
# the deprecated sns.distplot. histplot is the replacement for the
# histogram-only case (kde=False); it draws on the current axes.
sns.histplot(data=swim_df1, x='secperlap', kde=False, color="b")

In [None]:
# Use a dedicated name for the lap-duration Series. Rebinding `data` here would
# replace the health-records DataFrame that the later pivot/resample cell reads.
lap_seconds = swim_df1['secperlap']

# Distribution of lap durations with a KDE overlay.
sns.displot(lap_seconds, discrete=True, kde=True)

In [None]:
# swim_df1.groupby(start)["secperlap"].mean()

In [None]:
# Per-date totals of lap time (seconds) and recorded value.
# Only the two numeric columns that are used below are aggregated; summing
# every column would also hit the datetime and string columns.
# aggfunc is given as the string 'sum' rather than the Python builtin.
table = pd.pivot_table(
    data=swim_df1,
    index=['start'],
    values=['secperlap', 'value'],
    aggfunc='sum',
)

# NOTE(review): the original read a 'timedelta' column that no visible cell
# creates (KeyError). 'secperlap' (lap duration in seconds) is assumed to be
# the intended column — confirm.
# Seconds of swimming per unit of recorded value for each date.
table['avgtimeper'] = (table['secperlap'] / table['value']).round(2)
# Total swimming time for each date, in minutes.
table['avgtime'] = (table['secperlap'] / 60).round(2)
table

In [None]:
import numpy as np

# Pivot and resample.
# `data` must be the full health-records DataFrame here (it needs the
# 'endDate', 'type' and 'value' columns).
# One column per record type, indexed by end timestamp. pivot_table's default
# aggregation is the mean, so records of one type sharing an end timestamp
# are averaged.
# NOTE(review): if such duplicates should add up instead, pass aggfunc='sum' — confirm intent.
pivot_df = data.pivot_table(index='endDate', columns='type', values='value')

# Daily total swimming distance. The aggregation is named by the string 'sum';
# passing the Python builtin `sum` to agg() is deprecated in recent pandas.
df = pivot_df.resample('D').agg({'DistanceSwimming': 'sum'})