In [1]:
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import seaborn as sns
import xmltodict
import datetime as dt
import pandas as pd

In [19]:
# Stroke-style codes (kept as strings, the form in which they are looked up)
# mapped to readable names. The position of each label in the tuple is its code.
STROKE_STYLE_MAP = {
    str(code): f'{label}StrokeStyle'
    for code, label in enumerate(
        ('Unknown', 'Mixed', 'Freestyle', 'Backstroke',
         'Breaststroke', 'Butterfly', 'Kickboard')
    )
}


def parse_stroke_style(value):
    """Translate a stroke style code into its readable name.

    Any code that is not a key of STROKE_STYLE_MAP resolves to
    'UnknownStrokeStyle'.
    """
    try:
        return STROKE_STYLE_MAP[value]
    except KeyError:
        return 'UnknownStrokeStyle'

In [3]:
# Load the export into a nested dict with xmltodict.
# The file is opened in binary mode and handed to the parser as a file object:
# the XML parser then decodes it itself instead of relying on the platform's
# default text encoding, and the file is not first copied into one big string.
filepath = 'exportar.xml'
with open(filepath, 'rb') as f:
    doc = xmltodict.parse(f)

In [22]:
def fill_na(record):
    """Give `record` a value for the temperature and every stroke-style key.

    'WeatherTemperature' and each name in STROKE_STYLE_MAP.values() that is
    absent from `record` is added with the value 0; keys that are already
    present keep their value. The dict is modified in place and returned.
    """
    for column in ('WeatherTemperature', *STROKE_STYLE_MAP.values()):
        record.setdefault(column, 0)

    return record

In [24]:
def _as_list(node):
    """Return an xmltodict child node as a list.

    xmltodict produces a dict for a child element that occurs once and a list
    only when the element repeats; an absent child is passed in here as None.
    Normalizing lets the loops below treat all three cases the same way.
    """
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _first_token(text):
    """Return the part of `text` before the first space."""
    return text.split(' ')[0]


# Build one flat dict per swimming workout found in the export.
workouts = []
for record in _as_list(doc['HealthData'].get('Workout')):

    if record['@workoutActivityType'] != 'HKWorkoutActivityTypeSwimming':
        continue

    result_dict = {
        'Duration': record['@duration'],
        'CreationDate': record['@creationDate'],
        'StartDate': record['@startDate'],
        'EndDate': record['@endDate']
    }

    # Workout-level metadata: keep only the leading token of the value
    # (presumably "<number> <unit>" — the unit part is discarded).
    for entry in _as_list(record.get('MetadataEntry')):
        key = entry['@key']
        if key == 'HKAverageMETs':
            result_dict['AverageMETs'] = _first_token(entry['@value'])
        elif key == 'HKWeatherTemperature':
            result_dict['WeatherTemperature'] = _first_token(entry['@value'])

    # Aggregated statistics: the '@sum' attribute holds the workout total.
    for stat in _as_list(record.get('WorkoutStatistics')):
        stat_type = stat['@type']  # not named `type`, which would shadow the builtin
        if stat_type == 'HKQuantityTypeIdentifierDistanceSwimming':
            result_dict['DistanceSwimming'] = stat['@sum']
        elif stat_type == 'HKQuantityTypeIdentifierActiveEnergyBurned':
            result_dict['EnergyBurned'] = stat['@sum']

    # Count laps per stroke style. A lap may carry zero, one or several
    # metadata entries, so pick the stroke-style entry by key; a lap without
    # one is counted as 'UnknownStrokeStyle' (parse_stroke_style's fallback).
    for event in _as_list(record.get('WorkoutEvent')):
        if event['@type'] != 'HKWorkoutEventTypeLap':
            continue

        stroke_value = None
        for lap_entry in _as_list(event.get('MetadataEntry')):
            if lap_entry['@key'] == 'HKSwimmingStrokeStyle':
                stroke_value = lap_entry['@value']
                break

        stroke_style = parse_stroke_style(stroke_value)
        result_dict[stroke_style] = result_dict.get(stroke_style, 0) + 1

    # Make sure every workout dict exposes the same set of keys.
    workouts.append(fill_na(result_dict))

In [25]:
# One row per parsed swimming workout. The values taken from XML attributes are
# still strings at this point; the stroke-style columns hold integer lap counts.
df = pd.DataFrame(workouts)

In [26]:
# Show the last rows of the table to sanity-check the parsed columns.
df.tail()

Unnamed: 0,Duration,CreationDate,StartDate,EndDate,AverageMETs,WeatherTemperature,DistanceSwimming,EnergyBurned,FreestyleStrokeStyle,BreaststrokeStrokeStyle,BackstrokeStrokeStyle,ButterflyStrokeStyle,UnknownStrokeStyle,MixedStrokeStyle,KickboardStrokeStyle
21,43.64309015075366,2020-12-12 14:46:00 -0300,2020-12-12 14:02:20 -0300,2020-12-12 14:45:58 -0300,7.31296,82,1650,292.068,36,4,14,12,0,0,0
22,46.60624763170878,2020-12-16 09:49:01 -0300,2020-12-16 09:02:24 -0300,2020-12-16 09:49:00 -0300,7.35324,0,1875,313.597,47,1,12,13,0,2,0
23,44.53501873413722,2020-12-17 16:02:22 -0300,2020-12-17 15:17:49 -0300,2020-12-17 16:02:21 -0300,7.87204,85,2125,325.529,77,3,3,2,0,0,0
24,43.09363808234533,2020-12-18 18:45:45 -0300,2020-12-18 18:02:38 -0300,2020-12-18 18:45:44 -0300,8.25582,77,2100,338.922,49,10,8,17,0,0,0
25,43.83639253377915,2020-12-22 16:01:11 -0300,2020-12-22 15:17:20 -0300,2020-12-22 16:01:10 -0300,7.60109,77,2000,315.915,60,17,2,1,0,0,0


In [None]:
# Column dtypes before the datetime and numeric conversions in the next cells.
df.dtypes

In [None]:
# Parse the three timestamp columns from strings into pandas datetimes.
df = df.assign(**{
    name: pd.to_datetime(df[name])
    for name in ('CreationDate', 'StartDate', 'EndDate')
})

In [None]:
# Turn the measurement columns, which were read as strings, into numbers.
numeric_columns = (
    'Duration',
    'AverageMETs',
    'WeatherTemperature',
    'DistanceSwimming',
    'EnergyBurned',
)
df = df.assign(**{name: pd.to_numeric(df[name]) for name in numeric_columns})

In [None]:
# Re-check the dtypes after the datetime and numeric conversions above.
df.dtypes

In [None]:
# Summary statistics of the workout table.
df.describe()

In [None]:
# Count missing values per column. Note that fill_na() has already put 0 into
# WeatherTemperature and the stroke-style columns where a workout had no value,
# so gaps in those columns do not show up here.
df.isna().sum()

In [None]:
# Total swimming distance (meters) per calendar month.
# Workouts are grouped by the 'YYYY-MM' label of their start date. Plotting
# against StartDate.dt.day instead would merge the same day number from
# different months into a single x position.
month_label = df['StartDate'].dt.strftime('%Y-%m').rename('Month')
monthly_distance = df.groupby(month_label)['DistanceSwimming'].sum()

ax = sns.lineplot(x=monthly_distance.index, y=monthly_distance.to_numpy(), marker='o')
ax.set(title='Swimming distance per month', xlabel='Month', ylabel='Distance (m)')
plt.show()

In [None]:
# Distribution of the energy burned per workout, split into 20 bins.
sns.histplot(data=df, x='EnergyBurned', bins=20)
plt.show()

In [None]:
# Summary over all parsed workouts: total distance, total time, total energy
# burned and mean weather temperature.
total_distance = df['DistanceSwimming'].sum()
total_time = df['Duration'].sum()
total_calories = df['EnergyBurned'].sum()

# fill_na() stores 0 when a workout has no HKWeatherTemperature entry, so a 0
# in this column means "not recorded" rather than a real reading. Leave those
# placeholders out, otherwise they drag the mean down.
recorded_temperature = df.loc[df['WeatherTemperature'] != 0, 'WeatherTemperature']
weather_mean = recorded_temperature.mean()
# Convert the mean from degrees Fahrenheit to degrees Celsius.
weather_mean_degrees = (weather_mean - 32) * 5/9

print(f"Total distance: {total_distance} meters")
print(f"Total time: {total_time} minutes")
print(f"Total Calories: {total_calories} calories")
print(f"Weather mean: {weather_mean_degrees} degrees")