In [None]:
import pandas as pd
import numpy as np
# to use preprocessing functions, add the path to current folder
import sys
from wod_predictor.feature_engineering_parts.helpers import convert_units

In [None]:
# Load the scraped benchmark stats from the CSV next to this notebook.
df = pd.read_csv('benchmark_stats_scraped.csv')
# Boolean mask of rows that have a 'Back Squat' value; previewing only those
# rows skips athletes with no Back Squat entry (other columns may still be empty).
non_empty = ~df['Back Squat'].isna()
df.loc[non_empty].head()

In [3]:
# Swap every "--" cell for NaN so pandas treats it as missing data.
df = df.replace(to_replace="--", value=np.nan)


In [14]:
# Benchmark columns of interest, one per line so additions and removals
# produce clean diffs.
exercise_cols = [
    'Back Squat',
    'Chad1000x',
    'Clean and Jerk',
    'Deadlift',
    'Fight Gone Bad',
    'Filthy 50',
    'Fran',
    'Grace',
    'Helen',
    'L1 Benchmark',
    'Max Pull-ups',
    'Run 5k',
    'Snatch',
    'Sprint 400m',
]

## Convert data types

In [None]:
# Run the lift columns through the project's convert_units helper with
# type='weight'. Per the original note the result is float pounds
# (NOTE(review): confirm against convert_units; its source is not in this notebook).
weight_cols = ["Deadlift", "Clean and Jerk", "Back Squat", "Snatch"]
df = convert_units(df, type='weight', columns=weight_cols)
df.loc[non_empty].head()

In [None]:
# convert time columns
def convert_to_timedelta_single(x):
    """Parse one scraped time string into a ``pd.Timedelta``.

    Only the last five characters of ``x`` are kept (an ``mm:ss``-sized tail).
    That tail is left-padded from the template ``"00:00:00"`` up to eight
    characters and handed to ``pd.Timedelta``.

    NOTE(review): because of the five-character cut, a longer value such as
    ``"1:02:03"`` is parsed from ``"02:03"`` only, so any hours component is
    dropped. The original comment treats longer values as errors; confirm
    that silently truncating them is the intended handling.

    Parameters
    ----------
    x : str or missing value
        Raw cell value. Missing values (``None``/``NaN``) and non-string
        values are not parsed.

    Returns
    -------
    pd.Timedelta or pd.NaT
        The parsed duration, or ``pd.NaT`` when ``x`` is missing, is not a
        string, is empty/whitespace-only, or cannot be parsed.
    """
    if pd.isnull(x):
        return pd.NaT
    if not isinstance(x, str):
        # Unexpected type: report it so it can be investigated, then treat
        # the cell as missing.
        print(f"Skipping non-string time value: {x!r}")
        return pd.NaT
    if not x.strip():
        # An empty cell would otherwise be padded to "00:00:00" and come back
        # as a zero-second duration instead of a missing value.
        return pd.NaT

    # get last 5 characters (anything bigger is an error)
    tail = x[-5:]
    template = "00:00:00"
    padded = template[:len(template) - len(tail)] + tail

    try:
        return pd.Timedelta(padded)
    except ValueError:
        # Catch only parse failures; a bare except would also swallow
        # KeyboardInterrupt and hide unrelated bugs.
        print(f"Could not parse time value: {x!r}")
        return pd.NaT

def convert_to_seconds(df, columns = None):
    """Convert text time columns of ``df`` to float seconds.

    Each selected column that holds text (object or pandas string dtype) is
    parsed value by value with ``convert_to_timedelta_single`` and replaced
    by the total number of seconds. Columns of any other dtype are left
    untouched, so calling this again on already-converted columns is a no-op.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert. It is modified in place.
    columns : iterable of column labels, optional
        Columns to consider. Defaults to every column of ``df``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    if columns is None:
        columns = df.columns

    for col in columns:
        series = df[col]
        # Only text columns need parsing; numeric columns are skipped.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        # Build the result with an explicit timedelta dtype. If every value
        # parses to NaT, dtype inference would pick datetime64, which has no
        # `.dt.total_seconds()`.
        parsed = pd.Series(
            [convert_to_timedelta_single(value) for value in series],
            index=series.index,
            dtype="timedelta64[ns]",
        )
        df[col] = parsed.dt.total_seconds()
    return df
# Columns passed to convert_to_seconds, which turns text times into seconds.
time_cols = [
    "Fran",
    'Helen',
    'Grace',
    'Filthy 50',
    'Fight Gone Bad',
    'Sprint 400m',
    'Run 5k',
    'Chad1000x',
    'L1 Benchmark',
]
df = convert_to_seconds(df, columns=time_cols)
df.loc[non_empty].head()

In [28]:
# Cast the rep counts to float (not int): a float column can hold NaN for
# any missing entries, which a NumPy integer column cannot.
pullup_col = 'Max Pull-ups'
df[pullup_col] = df[pullup_col].astype(float)

## Sanity checking data

In [None]:
# Preview up to ten distinct values from each benchmark column.
for col in exercise_cols:
    preview = df[col].unique()[:10]
    print(f"col {col} : {preview}")

# Report every benchmark column that is still object-typed, i.e. was not
# converted to a numeric dtype by the steps above.
for col in exercise_cols:
    if df[col].dtype != 'O':
        continue
    print(f"NON numeric col {col}")

In [None]:
# Column-wise mean of every benchmark column as a quick plausibility check.
df.loc[:, exercise_cols].mean(axis=0)

## Save cleaned dataset to file

In [None]:
# Read the previously cleaned dataset from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine with this exact directory layout.
old_cleaned_path = "/Users/hassan/Documents/wod-prediction/WOD-prediction/Data/benchmark_stats/Benchmark_stats_cleaned.csv"
old_cleaned = pd.read_csv(old_cleaned_path)
old_cleaned.shape

In [None]:
# Stack the previously cleaned rows on top of the newly cleaned ones, then
# drop rows that are exact duplicates across all columns.
combined_df = pd.concat([old_cleaned, df], axis=0).drop_duplicates()
combined_df.head()

In [46]:
# Write the combined data to the cleaned-dataset CSV, replacing the file's
# previous contents; the row index is not written.
# NOTE(review): absolute, machine-specific path.
cleaned_output_path = "/Users/hassan/Documents/wod-prediction/WOD-prediction/Data/benchmark_stats/Benchmark_stats_cleaned.csv"
combined_df.to_csv(cleaned_output_path, index=False)