# FIT file exploration
__Keith Cheveralls__<br>
__March 2019__

This notebook parses all of the FIT files in a single Strava data dump and explores the message types and fields present in each file and across all files.

This is necessary, and messy, because the message types and fields are device-dependent, firmware-version-dependent, and activity-type-dependent. 

As of March 2019, there are FIT files from the Wahoo Elemnt and from the Garmin Forerunner 220, Fenix 3, and Edge 520.

In [None]:
import os
import re
import sys
import gzip
import time
import pickle
import datetime
import numpy as np
from scipy import stats
import pandas as pd

from lxml import etree
import fitparse
from fitparse import FitFile
from matplotlib import pyplot as plt

%matplotlib 
%load_ext autoreload
%autoreload 2

In [None]:
def timer(method):
    """Wrap `method` so that each call prints its elapsed wall-clock time.

    Parameters
    ----------
    method : callable
        The function or bound method to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        `method`, prints 'Elapsed time: <seconds>' (two decimal places) once
        the call returns, and then returns the call's result unchanged.
        Nothing is printed if `method` raises.
    """
    # Imported locally because functools is not among the notebook's imports.
    import functools

    @functools.wraps(method)  # keep the wrapped callable's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        result = method(*args, **kwargs)
        stop = time.perf_counter()
        print('Elapsed time: %0.2f' % (stop - start))
        return result
    return wrapper

In [None]:
# Make the cypy2 package in the parent directory importable. The guard keeps
# re-runs of this cell from appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')
import cypy2

# Root directory of the Strava export. Set the STRAVA_EXPORT_ROOT environment
# variable to use a different export without editing the notebook; the original
# hard-coded path remains the default.
root = os.environ.get('STRAVA_EXPORT_ROOT', '/home/keith/Downloads/export_7989839-1')

# Example files, looked up in the export's 'activities' directory further down.
# The variable names indicate the device/activity each one came from.
wahoo_example = '2326365683.fit.gz'
garmin_example = '2122584483.fit.gz'
garmin_indoor_example = '2324139976.fit.gz'

In [None]:
# Create the manager for the export directory `root`.
# NOTE(review): from_cache=True presumably loads data previously saved with
# m.to_cache() instead of re-parsing -- confirm against cypy2.
m = cypy2.StravaExportManager(root, from_cache=True)

In [None]:
# Shape of the export metadata and the distinct values of its `type` attribute
# (presumably the activity types -- confirm against the export).
m.metadata.shape, m.metadata.type.unique()

### Parse and cache data from all FIT files

In [None]:
# Slow cell: for the Strava export from 2019-03-01, parsing all 943 FIT files
# takes ~30 minutes. The call is wrapped with `timer` so its duration is printed.
timed_parse_all = timer(m.parse_all)
timed_parse_all()

In [None]:
# Number of entries in m.parsed_data (presumably one per parsed FIT file -- confirm).
len(m.parsed_data)

In [None]:
# NOTE(review): presumably writes the parsed data to the cache that
# from_cache=True reads back -- confirm against cypy2.
m.to_cache()

In [None]:
# Check for errors: display whatever the manager recorded in `parsing_errors`
# (presumably populated by m.parse_all -- confirm against cypy2).
m.parsing_errors

### Determine message types and fields present in all FIT files

In [None]:
# Unpack the messages of one example activity into individual variables.
# `d` was never defined before this cell, so a fresh Restart & Run All raised a
# NameError here. The first parsed activity is used as the example.
# NOTE(review): confirm that the first activity is the intended example and
# that it contains every message type listed in `names`.
d = next(iter(m.parsed_data))

names = ['strava_metadata', 'file_id', 'device_info', 'event', 'session', 'record']
strava_metadata, file_id, device_info, event, session, record = [d[name] for name in names]

In [None]:
# Check that there is only one file_id and one session message per file:
# collect the distinct row counts of each message type across all files.
file_id_counts = {d['file_id'].shape[0] for d in m.parsed_data}
session_counts = {d['session'].shape[0] for d in m.parsed_data}
file_id_counts, session_counts

In [None]:
# All unique combinations of message types. Names are sorted before
# deduplicating so that two files holding the same message types in a
# different key order count as a single combination.
message_name_lists = list({tuple(sorted(d.keys())) for d in m.parsed_data})

# All combinations of message types. Only a cell's last expression is shown
# automatically, so this intermediate result is displayed explicitly.
display(sorted(', '.join(message_names) for message_names in message_name_lists))

# Message types common to every file.
', '.join(set(message_name_lists[0]).intersection(*message_name_lists))

In [None]:
# Column renaming: give the product column the same name ('product_name') in
# the device_info and file_id messages of every file. The frames are modified
# in place.
device_info_renames = {'garmin_product': 'product_name'}
file_id_renames = {'garmin_product': 'product_name', 'product': 'product_name'}

for d in m.parsed_data:
    d['device_info'].rename(columns=device_info_renames, inplace=True)
    d['file_id'].rename(columns=file_id_renames, inplace=True)

In [None]:
# Concatenate the file_id messages of all files, then count the rows for each
# (manufacturer, product_name) pair.
file_id_frames = [d['file_id'] for d in m.parsed_data]
dcat = pd.concat(file_id_frames)
dcat.groupby(['manufacturer', 'product_name']).count()

In [None]:
# Concatenate the device_info messages of all files and list the distinct
# product names that occur in them.
device_info_frames = [d['device_info'] for d in m.parsed_data]
dcat = pd.concat(device_info_frames)
dcat.product_name.unique()

In [None]:
# Concatenate the sport messages (only present in some files) and count the
# rows for each (sport, sub_sport) pair.
sport_frames = [
    d['sport']
    for d in m.parsed_data
    if d.get('sport') is not None
]
dcat = pd.concat(sport_frames)
dcat.groupby(['sport', 'sub_sport']).count()

In [None]:
# Positions (within m.parsed_data) of the activities that have no 'sport'
# message, followed by how many there are.
no_sport_inds = [
    ind
    for ind, d in enumerate(m.parsed_data)
    if d.get('sport') is None
]
len(no_sport_inds)

In [None]:
# Activities from the Edge520 without a 'sport' message.
# The result gets its own name instead of rebinding `d`: `d` denotes a single
# activity's dict elsewhere in the notebook, and overwriting it with a list
# breaks any cell that is re-run afterwards expecting that dict.
no_sport_ind_set = set(no_sport_inds)  # constant-time membership tests
edge520_no_sport = [
    parsed
    for ind, parsed in enumerate(m.parsed_data)
    if ind in no_sport_ind_set and 'edge520' in parsed['file_id'].product_name.values
]
len(edge520_no_sport)

In [None]:
# For the activities without a sport message, concatenate the 'sport' column of
# their session messages.
session_sport_frames = [
    d['session'][['sport']]
    for ind, d in enumerate(m.parsed_data)
    if ind in no_sport_inds
]
dcat = pd.concat(session_sport_frames)

In [None]:
# Number of rows in the current `dcat` whose sport is 'running' and 'cycling', respectively.
(dcat.sport=='running').sum(), (dcat.sport=='cycling').sum()

In [None]:
# Concatenate the 'sport' column of the session message from every file.
all_session_sport_frames = [d['session'][['sport']] for d in m.parsed_data]
dcat = pd.concat(all_session_sport_frames)

In [None]:
# Distinct sport values in the current `dcat`.
dcat.sport.unique()

In [None]:
# All activities whose device_info message lists 'fr220' as a product name.
fr220 = [
    d
    for d in m.parsed_data
    if 'fr220' in d['device_info']['product_name'].unique()
]

In [None]:
# All sets of fields for a given message type. Column names are sorted before
# deduplicating so that the same fields in a different column order count as
# one set.
message_name = 'device_info'
column_lists = list({tuple(sorted(d[message_name].columns)) for d in m.parsed_data})

# Only a cell's last expression is shown automatically, so the list of field
# sets is displayed explicitly.
display(sorted(', '.join(columns) for columns in column_lists))

# Fields common to all activities.
set(column_lists[0]).intersection(*column_lists)

In [None]:
# All sets of record fields, excluding indoor rides (files whose record message
# has no 'position_lat' column). Column names are sorted before deduplicating
# so that the same fields in a different column order count as one set.
column_lists = list({
    tuple(sorted(d['record'].columns))
    for d in m.parsed_data
    if 'position_lat' in d['record'].columns
})

# Only a cell's last expression is shown automatically, so the list of field
# sets is displayed explicitly.
display(sorted(', '.join(columns) for columns in column_lists))

# Fields common to all of these activities.
set(column_lists[0]).intersection(*column_lists)

### Parse a Garmin and Wahoo example

In [None]:
# Both example files are in the export's 'activities' directory.
activities_dir = os.path.join(root, 'activities')

# Garmin example
dg = cypy2.file_utils.parse_fit(os.path.join(activities_dir, garmin_example))

# Wahoo example (`filepath` ends up pointing at this file)
filepath = os.path.join(activities_dir, wahoo_example)
dw = cypy2.file_utils.parse_fit(filepath)

In [None]:
# Print one line per message type in the Wahoo example: the key, followed by
# the list of that message's columns.
for key, data in dw.items():
    columns = list(data.columns)
    print('%s: %s' % (key, columns))

### FIT file debugging

In [None]:
# Open the Wahoo example with cypy2's open_fit helper; the returned object is
# queried with get_messages() in the next cell (presumably a fitparse FitFile -- confirm).
fitfile = cypy2.file_utils.open_fit(os.path.join(root, 'activities', wahoo_example))

In [None]:
# Take the first 'file_id' message yielded by the file.
file_id = next(fitfile.get_messages('file_id'))

In [None]:
# Field definitions of the file_id message fetched in the previous cell.
# `m` is the StravaExportManager and is not a FIT message; `def_mesg` is an
# attribute of the message object returned by get_messages().
file_id.def_mesg.field_defs