In [1]:
from datetime import datetime

import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

### ========================================================

# Merge SugarWOD and Strava into one Dataframe

### ========================================================

In [2]:
# Load the raw exports: Strava activities (JSON) and SugarWOD workouts (CSV).
# Both paths are relative to the notebook's working directory.
strava = pd.read_json('../observations/strava/activities.json')
sugarwod = pd.read_csv('../observations/sugarwod/workouts.csv')

Create new Strava Dataframe columns 'date' and 'start_time' from 'start_date_local.'<br/>
This makes it easier to merge SugarWOD and Strava data on a common column.<br/>
We can drop the 'start_date_local' column afterward because we won't need it anymore.

In [3]:
# Split the local start timestamp into a 'start_time' (HH:MM:SS) string and a
# 'date' (YYYY-MM-DD) string, then discard the original timestamp column.
start_local = pd.to_datetime(strava['start_date_local'])
strava = strava.assign(
    start_time=start_local.dt.strftime('%H:%M:%S'),
    date=start_local.dt.strftime('%Y-%m-%d'),
).drop(columns=['start_date_local'])

In [4]:
# Normalise SugarWOD dates to 'YYYY-MM-DD' strings, the same format used for
# the Strava 'date' column.
sugarwod['date'] = pd.to_datetime(sugarwod['date']).dt.strftime('%Y-%m-%d')

In [5]:
strava.columns

Index(['resource_state', 'athlete', 'name', 'distance', 'moving_time',
       'elapsed_time', 'total_elevation_gain', 'type', 'sport_type', 'id',
       'start_date', 'timezone', 'utc_offset', 'location_city',
       'location_state', 'location_country', 'achievement_count',
       'kudos_count', 'comment_count', 'athlete_count', 'photo_count', 'map',
       'trainer', 'commute', 'manual', 'private', 'visibility', 'flagged',
       'gear_id', 'start_latlng', 'end_latlng', 'average_speed', 'max_speed',
       'has_heartrate', 'average_heartrate', 'max_heartrate',
       'heartrate_opt_out', 'display_hide_heartrate_option', 'upload_id',
       'upload_id_str', 'external_id', 'from_accepted_tag', 'pr_count',
       'total_photo_count', 'has_kudoed', 'workout_type', 'average_watts',
       'kilojoules', 'device_watts', 'elev_high', 'elev_low', 'start_time',
       'date'],
      dtype='object')

In [6]:
sugarwod.columns

Index(['date', 'title', 'description', 'best_result_raw',
       'best_result_display', 'score_type', 'barbell_lift', 'set_details',
       'notes', 'rx_or_scaled', 'pr'],
      dtype='object')

In [7]:
# Trial merge of Strava and SugarWOD on 'date', using slimmed-down copies of
# each frame so the result is easy to inspect.
strav_test = strava.loc[:, ['date', 'average_heartrate', 'max_heartrate']].copy()
sugar_test = sugarwod.loc[:, ['date', 'score_type', 'pr']].copy()

In [8]:
# Stack the two slimmed frames row-wise. Each row keeps its original index label,
# and columns that exist in only one source are NaN for rows from the other.
test = pd.concat([strav_test, sugar_test])

In [9]:
# verify there are some rows where there is data for all fields
test.loc[test['date'] == '2022-07-20']

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
57,2022-07-20,158.1,171.0,,
60,2022-07-20,,,Time,PR


In [10]:
# Collapse all rows that share a date into a single row, taking the first
# available value in each column. as_index=False keeps 'date' as a regular column.
df_new = test.groupby(test['date'], as_index=False).agg('first')

# Spot-check: the separate Strava and SugarWOD rows for this date should now be one row
df_new.loc[df_new['date'].eq('2022-07-20')]

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
154,2022-07-20,158.1,171.0,Time,PR


### ========================================================

# Merge all Wodify files together into one Dataframe

### ========================================================

In [11]:
# Load the three Wodify exports: lift results, metcon results and personal records.
# Paths are relative to the notebook's working directory.
wodify_lifts = pd.read_excel('../observations/wodify/PerformanceResults.xlsx')
wodify_metcons = pd.read_excel('../observations/wodify/PerformanceResultsMetcons.xlsx')
wodify_prs = pd.read_excel('../observations/wodify/PerformanceResultsPRs.xlsx')

In [12]:
# Give every Wodify frame a 'date' string column ('YYYY-MM-DD') to merge on.
# Dates are parsed explicitly with pd.to_datetime so the .dt accessor works even
# if read_excel leaves a date column as text.
wodify_lifts['date'] = pd.to_datetime(wodify_lifts['Date']).dt.strftime('%Y-%m-%d')
wodify_lifts = wodify_lifts.drop(columns=['Date'])

wodify_metcons['date'] = pd.to_datetime(wodify_metcons['Date']).dt.strftime('%Y-%m-%d')
wodify_metcons = wodify_metcons.drop(columns=['Date'])

pr_dates = pd.to_datetime(wodify_prs['Performance Result Date'])
# NOTE: this column name is misspelled ('Peformance'). Later cells select and drop
# it by this exact spelling, so it is kept as-is here; rename it everywhere at once.
wodify_prs['Peformance Result Date'] = pr_dates
# Derive 'date' from the parsed values rather than from the raw, unconverted column.
wodify_prs['date'] = pr_dates.dt.strftime('%Y-%m-%d')
wodify_prs = wodify_prs.drop(columns=['Performance Result Date'])

### Problem: how to merge Dataframes with the same Column names and preserve the data in each?

In [13]:
wodify_lifts.columns

Index(['Component', 'Affiliate Name', 'Class Name', 'Result',
       'Is Personal Record', 'Personal Record Description',
       'Performance Result Type', 'Comment', 'From Weightlifting Total',
       'date'],
      dtype='object')

In [14]:
wodify_metcons.columns

Index(['Component', 'Component Description', 'Component(2)', 'Affiliate Name',
       'Class Name', 'Fully Formatted Result', 'Is Personal Record',
       'Personal Record Description', 'Is Rx', 'Is Rx Plus',
       'Result Type Label', 'Full Comment', 'date'],
      dtype='object')

In [15]:
wodify_prs.columns

Index(['Component Name', 'Result', 'Rep Scheme', 'Performance Result Comment',
       'Class Name', 'Personal Record Text', 'Peformance Result Date', 'date'],
      dtype='object')

### 6 Overlapping Columns May Pose Issues 

Date, Component, Affiliate Name, Class Name, Is Personal Record, Personal Record Description.

Look at each of these columns to infer how they may be merged.

In [16]:
# Keep only the columns whose names appear in more than one Wodify export so the
# overlap can be inspected side by side.
lift_overlap = wodify_lifts.loc[:, [
    'date', 'Component', 'Affiliate Name', 'Class Name',
    'Is Personal Record', 'Personal Record Description',
]].copy()
metcon_overlap = wodify_metcons.loc[:, [
    'date', 'Component', 'Affiliate Name', 'Class Name', 'Is Rx',
    'Is Personal Record', 'Personal Record Description',
]].copy()
prs_overlap = wodify_prs.loc[:, ['date', 'Peformance Result Date', 'Class Name']].copy()

In [17]:
lift_overlap.head()

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description
0,2023-02-08,Back Squat,CrossFit Cove,Cove Fitness: 5:45 PM,False,
1,2023-02-07,Deadlift,CrossFit Cove,Cove Fitness: 5:45 PM,False,
2,2023-01-18,Deadlift,CrossFit Cove,Cove Fitness: Noon,False,
3,2023-01-16,Hang Power Snatch,CrossFit Cove,Cove Fitness: 5:45 PM,False,
4,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022


In [18]:
metcon_overlap.head()

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Rx,Is Personal Record,Personal Record Description
0,2023-02-10,Rear View,CrossFit Cove,Cove Fitness: 5:45 PM,True,False,
1,2023-02-09,Wait a Minute,CrossFit Cove,Cove Fitness: 5:45 PM,False,False,
2,2023-02-08,Tenacious,CrossFit Cove,Cove Fitness: 5:45 PM,True,False,
3,2023-02-07,Individual Quarter Finals 21.1,CrossFit Cove,Cove Fitness: 5:45 PM,False,False,
4,2023-02-06,When Randy Met Nancy,CrossFit Cove,Cove Fitness: 5:45 PM,False,False,


In [19]:
prs_overlap.head()

Unnamed: 0,date,Peformance Result Date,Class Name
0,2023-01-10,2023-01-10,Cove Fitness: Noon


From this view:

'Performance Result Date' of the PRs table can be dropped because it is a duplicate.

'Component' columns of Lift and Metcon tables must be renamed to preserve data. Modify 'Component Name' in PRs too.
    
    rename to 'lift_component', 'metcon_component', 'pr_component'

'Affiliate Name' columns of Lift and Metcon will merge without any loss.

'Class Name' columns of Lift and Metcon will merge, assuming I do just one WOD each day, which is a safe assumption.

'Is Personal Record' and 'Personal Record Description' of Lift and Metcon must be renamed.

    rename to 'is_lift_pr', 'is_metcon_pr', 'lift_pr_description', 'metcon_pr_description'

In [20]:
# The (misspelled) 'Peformance Result Date' column duplicates 'date', so remove it.
prs_overlap = prs_overlap.drop(columns='Peformance Result Date')
# Stack the three overlap frames row-wise.
wodify_test = pd.concat([lift_overlap, metcon_overlap, prs_overlap], axis=0)

In [21]:
# verify there are some rows where there is data for all fields
#
# Use date '2023-01-10' because it is the only date for which 
# there is an entry in all three Dataframes

wodify_test.loc[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description,Is Rx
4,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022,
14,2023-01-10,Marston,CrossFit Cove,Cove Fitness: Noon,False,,True
0,2023-01-10,,,Cove Fitness: Noon,,,


#### Notice that the overlapping column Component holds both lift and metcon values, so collapsing to one row per date will lose data

It has the potential to lose data in Is Personal Record and Personal Record Description as well because
I may have a PR for both a lift and metcon on the same day.

In [22]:
# One row per date, keeping the first available value in each column.
df_wfy = wodify_test.groupby(wodify_test['date'], as_index=False).agg('first')
df_wfy.loc[df_wfy['date'].eq('2023-01-10')]

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description,Is Rx
15,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022,True


#### Whereas if I selected non-overlapping columns...

In [23]:
# Take 'date' plus one column that is unique to each export, then stack row-wise.
lift_test = wodify_lifts.loc[:, ['date', 'From Weightlifting Total']].copy()
metcon_test = wodify_metcons.loc[:, ['date', 'Is Rx']].copy()
prs_test = wodify_prs.loc[:, ['date', 'Rep Scheme']].copy()
wodify_test = pd.concat([lift_test, metcon_test, prs_test], axis=0)

In [24]:
wodify_test.loc[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
4,2023-01-10,False,,
14,2023-01-10,,True,
0,2023-01-10,,,Build to a heavy set of 5


In [25]:
# One row per date, keeping the first available value in each column.
df_wfy = wodify_test.groupby(wodify_test['date'], as_index=False).agg('first')
df_wfy.loc[df_wfy['date'].eq('2023-01-10')]

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
15,2023-01-10,False,True,Build to a heavy set of 5


### First attempt to rectify: rename overlapping columns to specify their origin

In [26]:
# Give each overlapping column a source-specific name so that values from the
# lift and metcon exports land in separate columns after concatenation.
# New columns are appended and the originals dropped, which keeps the renamed
# columns at the end of each frame.

# Lifts: Component -> lift_component, PR flag/description -> is_lift_pr / lift_pr_description
lift_overlap = lift_overlap.assign(
    lift_component=lift_overlap['Component'],
    is_lift_pr=lift_overlap['Is Personal Record'],
    lift_pr_description=lift_overlap['Personal Record Description'],
).drop(columns=['Component', 'Is Personal Record', 'Personal Record Description'])

# Metcons: Component -> metcon_component, PR flag/description -> is_metcon_pr / metcon_pr_description
metcon_overlap = metcon_overlap.assign(
    metcon_component=metcon_overlap['Component'],
    is_metcon_pr=metcon_overlap['Is Personal Record'],
    metcon_pr_description=metcon_overlap['Personal Record Description'],
).drop(columns=['Component', 'Is Personal Record', 'Personal Record Description'])

# PRs: prs_overlap does not carry a 'Component Name' column in this notebook,
# so there is nothing to rename to 'pr_component' here.

In [27]:
# Stack the three overlap frames row-wise again, now that the lift and metcon
# columns carry source-specific names.
wodify_test = pd.concat([lift_overlap, metcon_overlap, prs_overlap])

In [28]:
wodify_test.loc[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,Affiliate Name,Class Name,lift_component,is_lift_pr,lift_pr_description,Is Rx,metcon_component,is_metcon_pr,metcon_pr_description
4,2023-01-10,CrossFit Cove,Cove Fitness: Noon,Deadlift,True,PR by 20 lbs vs. 255 on 11/02/2022,,,,
14,2023-01-10,CrossFit Cove,Cove Fitness: Noon,,,,True,Marston,False,
0,2023-01-10,,Cove Fitness: Noon,,,,,,,


In [29]:
# One row per date, keeping the first available value in each column.
df_wfy2 = wodify_test.groupby(wodify_test['date'], as_index=False).agg('first')
df_wfy2.loc[df_wfy2['date'].eq('2023-01-10')]

Unnamed: 0,date,Affiliate Name,Class Name,lift_component,is_lift_pr,lift_pr_description,Is Rx,metcon_component,is_metcon_pr,metcon_pr_description
15,2023-01-10,CrossFit Cove,Cove Fitness: Noon,Deadlift,True,PR by 20 lbs vs. 255 on 11/02/2022,True,Marston,False,


### ========================================================

# Merge Wodify, SugarWOD and Strava 

### ========================================================

#### Fill NaNs with the Mode of each column

In [100]:
# Select the Strava columns used for modelling.
strava_index = strava.loc[:, ['date', 'average_heartrate', 'max_heartrate', 'start_time', 'moving_time']].copy()
# Express the 'HH:MM:SS' start time as seconds since midnight so it is numeric.
strava_index = strava_index.assign(
    start_time=pd.to_timedelta(strava_index['start_time']).dt.total_seconds()
)

# Fill every missing value with the most frequent value of its column.
modes = strava_index.mode().iloc[0]
strava_index = strava_index.fillna(modes)
strava_index.isna().sum()

# strava_index.head()

date                 0
average_heartrate    0
max_heartrate        0
start_time           0
moving_time          0
dtype: int64

In [101]:
# Build a numeric SugarWOD frame with columns: date, pr, is_rx.
sugar_index = sugarwod[['date', 'rx_or_scaled', 'pr']].copy()

# Rename 'rx_or_scaled' to 'is_rx' (append + drop keeps 'is_rx' as the last column).
sugar_index['is_rx'] = sugar_index['rx_or_scaled']
sugar_index = sugar_index.drop(columns=['rx_or_scaled'])

# Encode the text labels as floats: SCALED -> 0.0, RX -> 1.0, PR -> 1.0.
sugar_index = sugar_index.replace(to_replace="SCALED", value=0.0)
sugar_index = sugar_index.replace(to_replace="RX", value=1.0)
sugar_index = sugar_index.replace(to_replace="PR", value=1.0)

# A missing 'pr' means "no PR that day". Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form acts on
# a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
sugar_index['pr'] = sugar_index['pr'].fillna(0.0)

sugar_index.isna().sum()
# sugar_index['pr'].dtype
# sugar_index.head()

date     0
pr       0
is_rx    0
dtype: int64

In [102]:
# Build a numeric Wodify frame with columns: date, is_rx, pr.
wodify_index = df_wfy2[['date', 'Is Rx', 'is_metcon_pr']].copy()

# Rename 'Is Rx' -> 'is_rx' and 'is_metcon_pr' -> 'pr' so the column names match
# the SugarWOD frame (append + drop keeps the new columns at the end).
wodify_index['is_rx'] = wodify_index['Is Rx']
wodify_index = wodify_index.drop(columns=['Is Rx'])

wodify_index['pr'] = wodify_index['is_metcon_pr']
wodify_index = wodify_index.drop(columns=['is_metcon_pr'])

# Encode booleans as floats. True now maps to 1.0 (was the int 1) so both
# replacements produce the same numeric type.
wodify_index = wodify_index.replace(to_replace=False, value=0.0)
wodify_index = wodify_index.replace(to_replace=True, value=1.0)

wodify_index = wodify_index.replace(to_replace="PR", value=1.0)

# A missing 'pr' means "no PR that day". Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form acts on
# a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
wodify_index['pr'] = wodify_index['pr'].fillna(0.0)

wodify_index.isna().sum()
# wodify_index['pr'].dtype
# wodify_index.head()

date     0
is_rx    0
pr       0
dtype: int64

#### To start as simple as possible, then build up, every column should be numeric

In [106]:
# Stack the three numeric frames and collapse them to one row per date, keeping
# the first available value in each column.
test = pd.concat([strava_index, sugar_index, wodify_index])

df = test.groupby(test['date'], as_index=False).aggregate('first')

# Dates with no SugarWOD/Wodify entry have no flags: treat them as "not Rx" and
# "no PR". Assign back rather than using fillna(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
df['is_rx'] = df['is_rx'].fillna(0.0)
df['pr'] = df['pr'].fillna(0.0)

# Dates with no Strava activity still have NaNs in the Strava columns after the
# merge; fill those with each column's most frequent value.
modes = df.mode().iloc[0]
df = df.fillna(modes)
# df.isna().sum()

df.head()

Unnamed: 0,date,average_heartrate,max_heartrate,start_time,moving_time,pr,is_rx
0,2021-07-26,151.5,169.0,61282.0,453.0,0.0,0.0
1,2021-08-04,159.0,173.0,63351.0,1034.0,0.0,0.0
2,2021-08-10,157.2,166.0,61713.0,543.0,0.0,0.0
3,2021-08-14,142.0,167.0,27416.0,6506.0,0.0,0.0
4,2021-08-18,156.1,173.0,61004.0,994.0,0.0,0.0


### ========================================================

# First Model: Predict AM or PM workout with Linear Model

### ========================================================

#### Drop date column for this model because it is not numeric
#### Derive binary AM column from start_time: if start_time < 43200.0 value is 1, else 0
#### Drop start_time column

In [107]:
# Label a workout as AM (1.0) when it started before noon (43200 s after
# midnight), otherwise 0.0; then remove the non-numeric 'date' column and the
# 'start_time' column the label was derived from.
df = (
    df.assign(am=np.where(df['start_time'] < 43200.0, 1.0, 0.0))
      .drop(columns=['date', 'start_time'])
)
df.head()

Unnamed: 0,average_heartrate,max_heartrate,moving_time,pr,is_rx,am
0,151.5,169.0,453.0,0.0,0.0,0.0
1,159.0,173.0,1034.0,0.0,0.0,0.0
2,157.2,166.0,543.0,0.0,0.0,0.0
3,142.0,167.0,6506.0,0.0,0.0,1.0
4,156.1,173.0,994.0,0.0,0.0,0.0


#### Create PyTorch Tensors for Dependent (Target) and Independent (Predictors) variables

In [108]:
from torch import tensor

# Predictor columns, in the order their coefficients will be stored.
indep_cols = ['average_heartrate', 'max_heartrate', 'moving_time', 'pr', 'is_rx']

# Target: the binary 'am' label.
t_dep = tensor(df['am'])

# Predictors as a float64 matrix with one row per date and one column per predictor.
t_indep = tensor(df[indep_cols].to_numpy(), dtype=torch.float64)

t_indep

tensor([[1.5150e+02, 1.6900e+02, 4.5300e+02, 0.0000e+00, 0.0000e+00],
        [1.5900e+02, 1.7300e+02, 1.0340e+03, 0.0000e+00, 0.0000e+00],
        [1.5720e+02, 1.6600e+02, 5.4300e+02, 0.0000e+00, 0.0000e+00],
        ...,
        [1.2830e+02, 1.7100e+02, 6.0100e+02, 0.0000e+00, 1.0000e+00],
        [1.2830e+02, 1.7100e+02, 6.0100e+02, 0.0000e+00, 0.0000e+00],
        [1.2830e+02, 1.7100e+02, 6.0100e+02, 0.0000e+00, 1.0000e+00]],
       dtype=torch.float64)

#### Pick random weights (coefficients) for each column of the independent variable matrix

In [109]:
# Fix the RNG seed so the starting coefficients are reproducible.
torch.manual_seed(442)

# One coefficient per predictor column, drawn uniformly from [-0.5, 0.5).
n_coeff = t_indep.size(1)
coeffs = torch.rand(n_coeff).sub(0.5)
coeffs

tensor([-0.4629,  0.1386,  0.2409, -0.2262, -0.2632])

#### Make all values in all columns between 0 and 1 by dividing each column by its maximum value.
#### This removes the problem of one column dominating all the others.

In [112]:
# Column-wise maxima of the predictor matrix (and the row index of each maximum).
vals,indices = t_indep.max(dim=0)

# A column whose maximum is 0 (e.g. a binary flag that is never set) would produce
# 0/0 = NaN and poison every prediction, so divide such columns by 1 instead.
scale = torch.where(vals == 0, torch.ones_like(vals), vals)

# Broadcasting divides each column of the matrix by the matching entry of the vector.
t_indep = t_indep / scale

#### Calculate predictions by multiplying each row by the coefficients and adding them up.

In [113]:
# Weighted sum of each row's predictors: one raw prediction per row.
preds = torch.sum(t_indep * coeffs, dim=1)
preds[:4]

tensor([-0.2757, -0.2859, -0.2922, -0.1788], dtype=torch.float64)

#### We need a loss function in order to use gradient descent: Mean Absolute Error

In [115]:
# Mean absolute error between the raw predictions and the AM/PM targets.
loss = (preds - t_dep).abs().mean()
loss

tensor(1.1266, dtype=torch.float64)

In [133]:
def calc_preds(coeffs, indeps):
    """Return sigmoid(row-wise weighted sum of `indeps` by `coeffs`), one value per row."""
    weighted = indeps * coeffs
    return weighted.sum(dim=1).sigmoid()


def calc_loss(coeffs, indeps, deps):
    """Return the mean absolute error between calc_preds(coeffs, indeps) and `deps`."""
    errors = calc_preds(coeffs, indeps) - deps
    return errors.abs().mean()

In [134]:
# Turn on gradient tracking for the coefficient tensor (in place) so that
# loss.backward() can populate coeffs.grad.
# NOTE(review): the values echoed in this cell's output differ from those shown
# when coeffs was first initialised, which suggests cells were run out of order —
# confirm with Restart & Run All.
coeffs.requires_grad_()

tensor([ 0.2607,  0.9456,  0.2593, -0.0465,  0.0384], requires_grad=True)

In [135]:
# One manual gradient-descent step on the full data set.
loss = calc_loss(coeffs, t_indep, t_dep)
loss.backward()
with torch.no_grad():
    # Move against the gradient with a step size of 0.1, then reset the
    # accumulated gradient before reporting the new loss.
    step = coeffs.grad * 0.1
    coeffs.sub_(step)
    coeffs.grad.zero_()
    print(calc_loss(coeffs, t_indep, t_dep))

tensor(0.3263, dtype=torch.float64)


#### Create a Training and Validation split for the data

In [136]:
from fastai.data.transforms import RandomSplitter

# Seeded splitter, applied to the frame to get the training and validation row indices.
splitter = RandomSplitter(seed=42)
trn_split, val_split = splitter(df)

In [137]:
# Index the predictor and target tensors with the training / validation row indices.
trn_indep, trn_dep = t_indep[trn_split], t_dep[trn_split]
val_indep, val_dep = t_indep[val_split], t_dep[val_split]
len(trn_indep), len(val_indep)

(187, 46)

In [138]:
def update_coeffs(coeffs, lr):
    """Subtract `lr` times the gradient from `coeffs` in place, then zero the gradient."""
    step = coeffs.grad * lr
    coeffs.sub_(step)
    coeffs.grad.zero_()


def one_epoch(coeffs, lr):
    """Run one gradient-descent step on the training split and print the loss.

    The printed loss is the one computed before the coefficients are updated.
    """
    loss = calc_loss(coeffs, trn_indep, trn_dep)
    loss.backward()
    # The update itself must not be recorded for autograd.
    with torch.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")


def init_coeffs():
    """Return `n_coeff` random coefficients in [-0.5, 0.5) with gradient tracking on."""
    start = torch.rand(n_coeff) - 0.5
    return start.requires_grad_()

In [139]:
def train_model(epochs=30, lr=0.01):
    """Train fresh coefficients for `epochs` steps at learning rate `lr` and return them.

    The RNG is re-seeded on every call, so each run starts from the same
    initial coefficients.
    """
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(epochs):
        one_epoch(coeffs, lr=lr)
    return coeffs

In [140]:
# Train for 18 epochs at learning rate 0.2; the per-epoch training loss is printed
# as it runs. This rebinds the global `coeffs` to the newly trained tensor.
coeffs = train_model(18, lr=0.2)

0.551; 0.543; 0.534; 0.525; 0.516; 0.507; 0.498; 0.490; 0.481; 0.472; 0.463; 0.454; 0.446; 0.437; 0.429; 0.421; 0.413; 0.405; 

In [141]:
def show_coeffs():
    """Return a dict mapping each predictor column name to its learned coefficient.

    Reads the notebook-level `indep_cols` and `coeffs`. The coefficients are
    viewed through `detach()` so that displaying them does not switch off
    gradient tracking on `coeffs` itself (the previous `requires_grad_(False)`
    call mutated the global tensor and prevented any further training with it).
    """
    return dict(zip(indep_cols, coeffs.detach()))
show_coeffs()

{'average_heartrate': tensor(0.0312),
 'max_heartrate': tensor(0.6777),
 'moving_time': tensor(0.2984),
 'pr': tensor(-0.1743),
 'is_rx': tensor(-0.1502)}

#### Measure Accuracy

In [142]:
def acc(coeffs):
    """Return the fraction of validation rows whose thresholded prediction matches the target.

    A prediction above 0.5 counts as AM; uses the notebook-level `val_indep` / `val_dep`.
    """
    predicted_am = calc_preds(coeffs, val_indep) > 0.5
    return (val_dep.bool() == predicted_am).float().mean()
acc(coeffs)

tensor(0.8478)