In [53]:
import numpy as np
import pandas as pd
import glob
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [54]:
# Load the raw exports: Strava activities (JSON) and SugarWOD workouts (CSV).
# Both paths are relative, so they resolve against the current working directory.
strava = pd.read_json('../observations/strava/activities.json')
sugarwod = pd.read_csv('../observations/sugarwod/workouts.csv')

Create new Strava DataFrame columns 'date' and 'start_time' from 'start_date_local'.<br/>
This makes it easier to merge SugarWOD and Strava data on a common column.<br/>
We can drop the 'start_date_local' column afterward because we won't need it anymore.

In [55]:
# Parse the local start timestamp once, then swap the original column for two
# string columns: 'start_time' (HH:MM:SS) and 'date' (YYYY-MM-DD).
local_start = pd.to_datetime(strava['start_date_local'])
strava = strava.drop(columns=['start_date_local']).assign(
    start_time=local_start.dt.strftime('%H:%M:%S'),
    date=local_start.dt.strftime('%Y-%m-%d'),
)

In [56]:
# Parse the SugarWOD dates, then re-format them as 'YYYY-MM-DD' strings
# (the same string format used for the Strava 'date' column).
sugarwod['date'] = pd.to_datetime(sugarwod['date']).dt.strftime('%Y-%m-%d')

In [81]:
# Inspect the Strava column labels (includes the derived 'start_time' and 'date').
strava.columns

Index(['resource_state', 'athlete', 'name', 'distance', 'moving_time',
       'elapsed_time', 'total_elevation_gain', 'type', 'sport_type', 'id',
       'start_date', 'timezone', 'utc_offset', 'location_city',
       'location_state', 'location_country', 'achievement_count',
       'kudos_count', 'comment_count', 'athlete_count', 'photo_count', 'map',
       'trainer', 'commute', 'manual', 'private', 'visibility', 'flagged',
       'gear_id', 'start_latlng', 'end_latlng', 'average_speed', 'max_speed',
       'has_heartrate', 'average_heartrate', 'max_heartrate',
       'heartrate_opt_out', 'display_hide_heartrate_option', 'upload_id',
       'upload_id_str', 'external_id', 'from_accepted_tag', 'pr_count',
       'total_photo_count', 'has_kudoed', 'workout_type', 'average_watts',
       'kilojoules', 'device_watts', 'elev_high', 'elev_low', 'start_time',
       'date'],
      dtype='object')

In [82]:
# Inspect the SugarWOD column labels.
sugarwod.columns

Index(['date', 'title', 'description', 'best_result_raw',
       'best_result_display', 'score_type', 'barbell_lift', 'set_details',
       'notes', 'rx_or_scaled', 'pr'],
      dtype='object')

In [57]:
# Test: combine Strava and SugarWOD on the 'date' field, working on slimmed-down
# copies so the experiment cannot modify the full frames.
strav_test = strava.loc[:, ['date', 'average_heartrate', 'max_heartrate']].copy()
sugar_test = sugarwod.loc[:, ['date', 'score_type', 'pr']].copy()

In [58]:
# Stack the two slim frames row-wise. Columns that exist in only one source are
# filled with NaN for rows from the other, and each row keeps its original index label.
test = pd.concat([strav_test, sugar_test])

In [59]:
# Spot-check a date that should have data from both sources
# (at this stage each source still occupies its own row).
test[test['date'] == '2022-07-20']

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
57,2022-07-20,158.1,171.0,,
60,2022-07-20,,,Time,PR


In [60]:
# Collapse all rows that share a date into one row, taking the first non-null
# value of every column within the group.
# as_index=False keeps 'date' as a regular column instead of making it the index.
df_new = test.groupby(test['date'], as_index=False).first()

# Confirm the two source rows for this date were combined into a single row.
df_new[df_new['date'] == '2022-07-20']

Unnamed: 0,date,average_heartrate,max_heartrate,score_type,pr
154,2022-07-20,158.1,171.0,Time,PR


Merge all Wodify files together into one Dataframe

In [61]:
# Load the three Wodify exports: lifts, metcons, and personal records.
# Paths are relative, so they resolve against the current working directory.
wodify_lifts = pd.read_excel('../observations/wodify/PerformanceResults.xlsx')
wodify_metcons = pd.read_excel('../observations/wodify/PerformanceResultsMetcons.xlsx')
wodify_prs = pd.read_excel('../observations/wodify/PerformanceResultsPRs.xlsx')

In [62]:
# Give every Wodify frame a string 'date' column (YYYY-MM-DD) and drop the
# original date column. pd.to_datetime is applied before .dt so the cell also
# works when read_excel leaves a date column as text rather than datetime.
wodify_lifts['date'] = pd.to_datetime(wodify_lifts['Date']).dt.strftime('%Y-%m-%d')
wodify_lifts = wodify_lifts.drop(columns=['Date'])

wodify_metcons['date'] = pd.to_datetime(wodify_metcons['Date']).dt.strftime('%Y-%m-%d')
wodify_metcons = wodify_metcons.drop(columns=['Date'])

# NOTE(review): 'Peformance Result Date' (sic) is a misspelled copy of
# 'Performance Result Date'. The spelling is kept because later cells select and
# drop the column by this exact name; rename it everywhere in one pass if desired.
wodify_prs['Peformance Result Date'] = pd.to_datetime(wodify_prs['Performance Result Date'])
# Format from the converted column, not the raw one, so the parsed dates are
# actually the ones used.
wodify_prs['date'] = wodify_prs['Peformance Result Date'].dt.strftime('%Y-%m-%d')
wodify_prs = wodify_prs.drop(columns=['Performance Result Date'])

### Problem: how to merge Dataframes with the same Column names and preserve the data in each?

In [63]:
# Inspect the lift columns to spot names shared with the other Wodify frames.
wodify_lifts.columns

Index(['Component', 'Affiliate Name', 'Class Name', 'Result',
       'Is Personal Record', 'Personal Record Description',
       'Performance Result Type', 'Comment', 'From Weightlifting Total',
       'date'],
      dtype='object')

In [64]:
# Inspect the metcon columns to spot names shared with the other Wodify frames.
wodify_metcons.columns

Index(['Component', 'Component Description', 'Component(2)', 'Affiliate Name',
       'Class Name', 'Fully Formatted Result', 'Is Personal Record',
       'Personal Record Description', 'Is Rx', 'Is Rx Plus',
       'Result Type Label', 'Full Comment', 'date'],
      dtype='object')

In [65]:
# Inspect the PR columns to spot names shared with the other Wodify frames.
wodify_prs.columns

Index(['Component Name', 'Result', 'Rep Scheme', 'Performance Result Comment',
       'Class Name', 'Personal Record Text', 'Peformance Result Date', 'date'],
      dtype='object')

### 6 Overlapping Columns May Pose Issues 

Date, Component, Affiliate Name, Class Name, Is Personal Record, Personal Record Description.

Look at each of these columns to infer how they may be merged.

In [66]:
# Columns that appear under the same name in both the lift and metcon frames.
shared_cols = [
    'date',
    'Component',
    'Affiliate Name',
    'Class Name',
    'Is Personal Record',
    'Personal Record Description',
]

# Work on copies restricted to the overlapping columns.
lift_overlap = wodify_lifts[shared_cols].copy()
metcon_overlap = wodify_metcons[shared_cols].copy()
# The PR frame only shares the date columns and 'Class Name'.
prs_overlap = wodify_prs[['date', 'Peformance Result Date', 'Class Name']].copy()

In [67]:
# Preview the first rows of the overlapping lift columns.
lift_overlap.head()

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description
0,2023-02-08,Back Squat,CrossFit Cove,Cove Fitness: 5:45 PM,False,
1,2023-02-07,Deadlift,CrossFit Cove,Cove Fitness: 5:45 PM,False,
2,2023-01-18,Deadlift,CrossFit Cove,Cove Fitness: Noon,False,
3,2023-01-16,Hang Power Snatch,CrossFit Cove,Cove Fitness: 5:45 PM,False,
4,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022


In [68]:
# Preview the first rows of the overlapping metcon columns.
metcon_overlap.head()

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description
0,2023-02-10,Rear View,CrossFit Cove,Cove Fitness: 5:45 PM,False,
1,2023-02-09,Wait a Minute,CrossFit Cove,Cove Fitness: 5:45 PM,False,
2,2023-02-08,Tenacious,CrossFit Cove,Cove Fitness: 5:45 PM,False,
3,2023-02-07,Individual Quarter Finals 21.1,CrossFit Cove,Cove Fitness: 5:45 PM,False,
4,2023-02-06,When Randy Met Nancy,CrossFit Cove,Cove Fitness: 5:45 PM,False,


In [69]:
# Preview the first rows of the overlapping PR columns.
prs_overlap.head()

Unnamed: 0,date,Peformance Result Date,Class Name
0,2023-01-10,2023-01-10,Cove Fitness: Noon


From this view:

'Performance Result Date' of the PRs table can be dropped because it duplicates the 'date' column.

'Component' columns of Lift and Metcon tables must be renamed to preserve data. Modify 'Component Name' in PRs too.
    
    rename to 'lift_component', 'metcon_component', 'pr_component'

'Affiliate Name' columns of Lift and Metcon will merge without any loss.

'Class Name' columns of Lift and Metcon will merge, assuming I do just one WOD each day, which is a safe assumption.

'Is Personal Record' and 'Personal Record Description' of Lift and Metcon must be renamed.

    rename to 'is_lift_pr', 'is_metcon_pr', 'lift_pr_description', 'metcon_pr_description'

In [70]:
# The (misspelled) result-date column carries the same information as 'date', so remove it
# before stacking the three overlap frames row-wise.
prs_overlap = prs_overlap.drop(columns='Peformance Result Date')
wodify_test = pd.concat([lift_overlap, metcon_overlap, prs_overlap])

In [71]:
# Spot-check a date that has data in every frame.
#
# '2023-01-10' is used because it is the only date with an entry
# in all three DataFrames.

wodify_test[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description
4,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022
14,2023-01-10,Marston,CrossFit Cove,Cove Fitness: Noon,False,
0,2023-01-10,,,Cove Fitness: Noon,,


#### Notice how merging the DataFrames loses data in the overlapping 'Component' column

It has the potential to lose data in Is Personal Record and Personal Record Description as well because
I may have a PR for both a lift and metcon on the same day.

In [72]:
# Collapse to one row per date, keeping the first non-null value per column.
# Columns shared by several sources keep only one source's value.
df_wfy = wodify_test.groupby(wodify_test['date'], as_index=False).agg('first')
df_wfy[df_wfy['date'] == '2023-01-10']

Unnamed: 0,date,Component,Affiliate Name,Class Name,Is Personal Record,Personal Record Description
15,2023-01-10,Deadlift,CrossFit Cove,Cove Fitness: Noon,True,PR by 20 lbs vs. 255 on 11/02/2022


#### Whereas if I selected non-overlapping columns...

In [73]:
# Take 'date' plus one column that is unique to each Wodify frame, then stack them row-wise.
lift_test = wodify_lifts.loc[:, ['date', 'From Weightlifting Total']].copy()
metcon_test = wodify_metcons.loc[:, ['date', 'Is Rx']].copy()
prs_test = wodify_prs.loc[:, ['date', 'Rep Scheme']].copy()
wodify_test = pd.concat([lift_test, metcon_test, prs_test])

In [74]:
# Show the stacked rows for the one date present in all three frames.
wodify_test.loc[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
4,2023-01-10,False,,
14,2023-01-10,,True,
0,2023-01-10,,,Build to a heavy set of 5


In [75]:
# One row per date, first non-null value per column. No column is shared between
# sources here, so every source's value survives.
df_wfy = wodify_test.groupby(wodify_test['date'], as_index=False).first()
df_wfy[df_wfy['date'] == '2023-01-10']

Unnamed: 0,date,From Weightlifting Total,Is Rx,Rep Scheme
15,2023-01-10,False,True,Build to a heavy set of 5


### First attempt to rectify: rename overlapping columns to specify their origin

In [76]:
# Rename the overlapping columns so each one names the data set it comes from:
#   'Component'                   -> 'lift_component' / 'metcon_component'
#   'Is Personal Record'          -> 'is_lift_pr' / 'is_metcon_pr'
#   'Personal Record Description' -> 'lift_pr_description' / 'metcon_pr_description'
#
# DataFrame.rename ignores labels that are absent, so this cell can be re-run
# safely. The explicit column selection afterwards fixes the column order
# (shared columns first, then the source-specific ones).

lift_overlap = lift_overlap.rename(columns={
    'Component': 'lift_component',
    'Is Personal Record': 'is_lift_pr',
    'Personal Record Description': 'lift_pr_description',
})[[
    'date', 'Affiliate Name', 'Class Name',
    'lift_component', 'is_lift_pr', 'lift_pr_description',
]]

metcon_overlap = metcon_overlap.rename(columns={
    'Component': 'metcon_component',
    'Is Personal Record': 'is_metcon_pr',
    'Personal Record Description': 'metcon_pr_description',
})[[
    'date', 'Affiliate Name', 'Class Name',
    'metcon_component', 'is_metcon_pr', 'metcon_pr_description',
]]

# prs_overlap was built without 'Component Name', so there is no 'pr_component'
# column to create here. To carry it through, select 'Component Name' when
# building prs_overlap and rename it to 'pr_component'.

In [77]:
# Stack the renamed frames row-wise; only 'date', 'Affiliate Name' and 'Class Name' are still shared.
wodify_test = pd.concat([lift_overlap, metcon_overlap, prs_overlap])

In [78]:
# Show the stacked rows for the one date present in all three frames.
wodify_test.loc[wodify_test['date'] == '2023-01-10']

Unnamed: 0,date,Affiliate Name,Class Name,lift_component,is_lift_pr,lift_pr_description,metcon_component,is_metcon_pr,metcon_pr_description
4,2023-01-10,CrossFit Cove,Cove Fitness: Noon,Deadlift,True,PR by 20 lbs vs. 255 on 11/02/2022,,,
14,2023-01-10,CrossFit Cove,Cove Fitness: Noon,,,,Marston,False,
0,2023-01-10,,Cove Fitness: Noon,,,,,,


In [80]:
# One row per date, first non-null value per column. With source-specific column
# names, the lift and metcon values both survive the collapse.
df_wfy2 = wodify_test.groupby(wodify_test['date'], as_index=False).first()
df_wfy2[df_wfy2['date'] == '2023-01-10']

Unnamed: 0,date,Affiliate Name,Class Name,lift_component,is_lift_pr,lift_pr_description,metcon_component,is_metcon_pr,metcon_pr_description
15,2023-01-10,CrossFit Cove,Cove Fitness: Noon,Deadlift,True,PR by 20 lbs vs. 255 on 11/02/2022,Marston,False,
