In [41]:
import numpy as np
import pandas as pd
import glob
from datetime import datetime
import matplotlib.pyplot as plt
#Seaborn is a data visualization library.
#import seaborn as sns

In [42]:
# Raw exports: Strava activities (JSON) and SugarWOD workouts (CSV).
STRAVA_ACTIVITIES_PATH = '../observations/strava/activities.json'
SUGARWOD_WORKOUTS_PATH = '../observations/sugarwod/workouts.csv'

strava = pd.read_json(STRAVA_ACTIVITIES_PATH)
sugarwod = pd.read_csv(SUGARWOD_WORKOUTS_PATH)

Create new Strava DataFrame columns 'date' and 'start_time' from 'start_date_local'.<br/>
This makes it easier to merge SugarWOD and Strava data on a common column.<br/>
We can drop the 'start_date_local' column afterward because we won't need it anymore.

In [43]:
# Parse the local start timestamp, derive string 'start_time' (HH:MM:SS) and
# 'date' (YYYY-MM-DD) columns from it, then discard the source column.
strava = (
    strava
    .assign(start_date_local=lambda df: pd.to_datetime(df['start_date_local']))
    .assign(
        start_time=lambda df: df['start_date_local'].dt.strftime('%H:%M:%S'),
        date=lambda df: df['start_date_local'].dt.strftime('%Y-%m-%d'),
    )
    .drop(columns=['start_date_local'])
)

In [44]:
# Normalize SugarWOD dates to 'YYYY-MM-DD' strings: parse to datetime first,
# then format, so the column uses the same representation as strava['date'].
sugarwod['date'] = pd.to_datetime(sugarwod['date']).dt.strftime('%Y-%m-%d')

In [45]:
# Test: build slimmed-down copies of the Strava and SugarWOD frames (date plus
# two columns each) to experiment with merging the two sources by date.
strav_test = strava.loc[:, ['date', 'average_heartrate', 'max_heartrate']].copy()
sugar_test = sugarwod.loc[:, ['date', 'score_type', 'pr']].copy()

In [46]:
# Stack the two slim frames row-wise. Columns missing from one source are
# filled with NaN, and each frame's original row labels are kept.
test = pd.concat([strav_test, sugar_test], axis=0, ignore_index=False)

In [47]:
# Sanity check: this date should appear once per source, so that between the
# rows every column has a value.
test.loc[test['date'].eq('2022-07-20')]

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
57,2022-07-20,158.1,171.0,,
60,2022-07-20,,,Time,PR


In [48]:
# Collapse all rows that share a date into one row, taking the first non-null
# value of each column within the group.
# as_index=False keeps 'date' as an ordinary column instead of the index.
df_new = test.groupby(test['date'], as_index=False).first()

# Verify the two source rows for this date were combined into a single row.
df_new.loc[df_new['date'].eq('2022-07-20')]

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
154,2022-07-20,158.1,171.0,Time,PR


Merge all Wodify files together into one Dataframe

In [49]:
# Wodify exports: lifts, metcons, and personal records (one workbook each).
WODIFY_LIFTS_PATH = '../observations/wodify/PerformanceResults.xlsx'
WODIFY_METCONS_PATH = '../observations/wodify/PerformanceResultsMetcons.xlsx'
WODIFY_PRS_PATH = '../observations/wodify/PerformanceResultsPRs.xlsx'

wodify_lifts = pd.read_excel(WODIFY_LIFTS_PATH)
wodify_metcons = pd.read_excel(WODIFY_METCONS_PATH)
wodify_prs = pd.read_excel(WODIFY_PRS_PATH)

In [50]:
# Give every Wodify frame a 'YYYY-MM-DD' string 'date' column (the same
# representation used for the Strava and SugarWOD frames) and drop the
# source date column once it has been converted.
#
# Each source column is passed through pd.to_datetime before formatting so the
# .dt accessor works even if read_excel did not parse the column as datetimes.
# The converted values are used directly rather than written back to an
# intermediate column, so no helper column is left behind in the frame.
wodify_lifts['date'] = pd.to_datetime(wodify_lifts['Date']).dt.strftime('%Y-%m-%d')
wodify_lifts = wodify_lifts.drop(columns=['Date'])

wodify_metcons['date'] = pd.to_datetime(wodify_metcons['Date']).dt.strftime('%Y-%m-%d')
wodify_metcons = wodify_metcons.drop(columns=['Date'])

# The PRs export names its date column 'Performance Result Date'.
wodify_prs['date'] = (
    pd.to_datetime(wodify_prs['Performance Result Date']).dt.strftime('%Y-%m-%d')
)
wodify_prs = wodify_prs.drop(columns=['Performance Result Date'])

### TODO: how to merge Dataframes with the same Column names and preserve the data in each?

In [51]:
# Test: build slimmed-down copies of the three Wodify frames. The lifts and
# metcons frames share the 'Component' and 'Affiliate Name' column names; the
# PRs frame uses 'Component Name' and 'Result'.
lift_test = wodify_lifts.loc[:, ['date', 'Component', 'Affiliate Name']].copy()
metcon_test = wodify_metcons.loc[:, ['date', 'Component', 'Affiliate Name']].copy()
prs_test = wodify_prs.loc[:, ['date', 'Component Name', 'Result']].copy()

# Stack all three row-wise; columns absent from a source become NaN.
wodify_test = pd.concat([lift_test, metcon_test, prs_test], axis=0)
prs_test

Unnamed: 0,date,Component Name,Result
0,2023-01-10,Deadlift,1 x 5 @ 275 lbs


In [52]:
# Verify that some rows together cover every field.
#
# '2023-01-10' is used because it is the only date that has an entry in all
# three Wodify DataFrames.
wodify_test.loc[wodify_test['date'].eq('2023-01-10')]

Unnamed: 0,date,Component,Affiliate Name,Component Name,Result
4,2023-01-10,Deadlift,CrossFit Cove,,
14,2023-01-10,Marston,CrossFit Cove,,
0,2023-01-10,,,Deadlift,1 x 5 @ 275 lbs


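#### Notice how merging DataFrames with overlapping columns results in data loss: only the first value per column is kept for each date, so the metcon 'Marston' entry in 'Component' disappears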

In [53]:
# Collapse the stacked Wodify rows to one row per date, keeping the first
# non-null value of each column. Because the lifts and metcons frames share the
# 'Component' column, only one of their values survives for a given date.
df_wfy = wodify_test.groupby(wodify_test['date'], as_index=False).aggregate('first')

# Build the filter from df_wfy itself so the cell does not depend on a frame
# created in a later cell and runs on a fresh kernel.
df_wfy.loc[df_wfy['date'] == '2023-01-10']

Unnamed: 0,date,Component,Affiliate Name,Component Name,Result
15,2023-01-10,Deadlift,CrossFit Cove,Deadlift,1 x 5 @ 275 lbs


#### Whereas if I selected non-overlapping columns...

In [54]:
# Second experiment: pick one column per source whose name does not occur in
# the other two frames, then stack the slim frames row-wise.
lift_test2 = wodify_lifts.loc[:, ['date', 'From Weightlifting Total']].copy()
metcon_test2 = wodify_metcons.loc[:, ['date', 'Is Rx']].copy()
prs_test2 = wodify_prs.loc[:, ['date', 'Rep Scheme']].copy()
wodify_test2 = pd.concat([lift_test2, metcon_test2, prs_test2], axis=0)

In [55]:
# Show the stacked rows for the date present in all three sources.
wodify_test2.loc[wodify_test2['date'].eq('2023-01-10')]

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
4,2023-01-10,False,,
14,2023-01-10,,True,
0,2023-01-10,,,Build to a heavy set of 5


In [56]:
# One row per date, first non-null value per column. With no shared column
# names, each source contributes its own column and nothing is overwritten.
df_wfy2 = wodify_test2.groupby(wodify_test2['date'], as_index=False).first()
df_wfy2.loc[df_wfy2['date'].eq('2023-01-10')]

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
15,2023-01-10,False,True,Build to a heavy set of 5


### First attempt to rectify: rename overlapping columns to specify their origin

In [57]:
# Give each source's component column an origin-specific name so the columns
# no longer collide after concatenation. assign() appends the new column at the
# end of the frame before the old one is dropped.
lift_test = (
    lift_test
    .assign(lift_component=lambda df: df['Component'])
    .drop(columns=['Component'])
)

metcon_test = (
    metcon_test
    .assign(metcon_component=lambda df: df['Component'])
    .drop(columns=['Component'])
)

prs_test = (
    prs_test
    .assign(pr_component=lambda df: df['Component Name'])
    .drop(columns=['Component Name'])
)

wodify_test3 = pd.concat([lift_test, metcon_test, prs_test], axis=0)

In [58]:
# Show the stacked rows for the date present in all three sources.
wodify_test3.loc[wodify_test3['date'].eq('2023-01-10')]

Unnamed: 0,date,Affiliate Name,lift_component,metcon_component,Result,pr_component
4,2023-01-10,CrossFit Cove,Deadlift,,,
14,2023-01-10,CrossFit Cove,,Marston,,
0,2023-01-10,,,,1 x 5 @ 275 lbs,Deadlift


In [59]:
# One row per date, first non-null value per column. The origin-specific
# component columns let the lift, metcon, and PR values all survive.
df_wfy3 = wodify_test3.groupby(wodify_test3['date'], as_index=False).first()
df_wfy3.loc[df_wfy3['date'].eq('2023-01-10')]

Unnamed: 0,date,Affiliate Name,lift_component,metcon_component,Result,pr_component
15,2023-01-10,CrossFit Cove,Deadlift,Marston,1 x 5 @ 275 lbs,Deadlift
