In [1]:
import pandas as pd
import numpy as np
# to use preprocessing functions, add the path to current folder
import sys
sys.path.append("../../")
from modeling.preprocessing import convert_to_datetime, convert_units

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('2023_BenchMarkStats_men3.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,Back Squat,Chad1000x,Clean and Jerk,Deadlift,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
0,0,469656,Jeffrey Adler,475 lb,--,377 lb,567 lb,511,15:17,2:02,1:16,7:18,--,54,19:15,290 lb,0:59
1,1,300638,Tola Morakinyo,500 lb,--,390 lb,615 lb,--,--,--,--,--,--,--,--,340 lb,--
2,2,676693,Colten Mertens,555 lb,--,335 lb,545 lb,--,--,--,--,--,--,--,--,275 lb,--
3,3,663689,Tyler Christophel,--,--,--,--,--,--,--,--,--,--,--,--,--,--
4,4,1031875,Roldan Goldbaum,445 lb,--,350 lb,515 lb,--,--,2:10,1:07,--,--,--,18:00,280 lb,0:56


In [3]:
df = df.replace("--", np.nan)

In [38]:
exercise_cols = ['Back Squat', 'Chad1000x', 'Clean and Jerk',
       'Deadlift', 'Fight Gone Bad', 'Filthy 50', 'Fran', 'Grace', 'Helen',
       'L1 Benchmark', 'Max Pull-ups', 'Run 5k', 'Snatch', 'Sprint 400m']

## Convert data types

In [34]:
# convert weight columns to float (in lb)
convert_units(df,type =  'weight', columns=["Deadlift", "Clean and Jerk","Back Squat","Snatch"])

Unnamed: 0.1,Unnamed: 0,id,name,Back Squat,Chad1000x,Clean and Jerk,Deadlift,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
0,0,469656,Jeffrey Adler,475.0,,377.0,567.0,,917.0,122.0,76.0,438.0,,54.0,1155.0,290.0,59.0
1,1,300638,Tola Morakinyo,500.0,,390.0,615.0,,,,,,,,,340.0,
2,2,676693,Colten Mertens,555.0,,335.0,545.0,,,,,,,,,275.0,
3,3,663689,Tyler Christophel,,,,,,,,,,,,,,
4,4,1031875,Roldan Goldbaum,445.0,,350.0,515.0,,,130.0,67.0,,,,1080.0,280.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77222,77222,1594361,Erik Davidson,,,,,,,,,,,,,,
77223,77223,1768926,Ammar El-wardani,,,,,,,,,,,,,,
77224,77224,205540,Doug Shinstrom,400.0,,297.0,455.0,,,193.0,133.0,,,25.0,,215.0,98.0
77225,77225,2240531,Daniel Nevarez,,,,,,,,,,,,,,


In [35]:
# convert time columns
def convert_to_timedelta_single(x):
    if pd.isnull(x):
        return pd.NaT
    if not isinstance(x, str):
        print(x)
        return pd.NaT
    # get last 5 characters (anything bigger is an error)
    try:
        x = x[-5:]
        full_str = "00:00:00"
        x = full_str[0:(8-len(x))] + x

        return pd.Timedelta(x)
    except:
        print(x)
        return pd.NaT

def convert_to_seconds(df, columns = None):
    if columns is None:
        columns = df.columns
    
    for col in columns:
        # if its an object or string
        if df[col].dtype == 'O':
            df[col] = df[col].apply(convert_to_timedelta_single)
            df[col] = df[col].dt.total_seconds()
    return df
time_cols = ["Fran",'Helen','Grace', 'Filthy 50', 'Fight Gone Bad', 'Sprint 400m', 'Run 5k','Chad1000x','L1 Benchmark']
convert_to_seconds(df, columns = time_cols)


Unnamed: 0.1,Unnamed: 0,id,name,Back Squat,Chad1000x,Clean and Jerk,Deadlift,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
0,0,469656,Jeffrey Adler,475.0,,377.0,567.0,,917.0,122.0,76.0,438.0,,54.0,1155.0,290.0,59.0
1,1,300638,Tola Morakinyo,500.0,,390.0,615.0,,,,,,,,,340.0,
2,2,676693,Colten Mertens,555.0,,335.0,545.0,,,,,,,,,275.0,
3,3,663689,Tyler Christophel,,,,,,,,,,,,,,
4,4,1031875,Roldan Goldbaum,445.0,,350.0,515.0,,,130.0,67.0,,,,1080.0,280.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77222,77222,1594361,Erik Davidson,,,,,,,,,,,,,,
77223,77223,1768926,Ammar El-wardani,,,,,,,,,,,,,,
77224,77224,205540,Doug Shinstrom,400.0,,297.0,455.0,,,193.0,133.0,,,25.0,,215.0,98.0
77225,77225,2240531,Daniel Nevarez,,,,,,,,,,,,,,


In [36]:
# convert reps to int
df['Max Pull-ups'] = df['Max Pull-ups'].astype(float)

## Sanity checking data

In [40]:
for col in exercise_cols:
    print(f"col {col} : {df.loc[:,col].unique()[:10]}")

# print any that are not numbers
for col in exercise_cols:
   if df[col].dtype == 'O':
       print(f"NON numeric col {col}")

col Back Squat : [475.     500.     555.          nan 445.     435.     451.9471 425.
 440.     535.    ]
col Chad1000x : [  nan 3319. 3420. 3360. 3305. 4843. 4855. 3720. 4372. 3810.]
col Clean and Jerk : [377.     390.     335.          nan 350.     370.     363.7623 360.
 347.     375.    ]
col Deadlift : [567.     615.     545.          nan 515.     530.     529.1088 550.
 480.     540.    ]
col Fight Gone Bad : [nan 15. 38. 44.  7.  1. 25. 21. 28. 50.]
col Filthy 50 : [ 917.   nan  890. 1050. 1202.  905. 1157. 1145. 1310.  930.]
col Fran : [122.  nan 130. 125. 127. 110. 135. 120. 126. 134.]
col Grace : [ 76.  nan  67.  90.  70. 118.  75.  72. 105. 102.]
col Helen : [438.  nan 405. 418. 435. 433.  71. 486. 420. 454.]
col L1 Benchmark : [ nan 163. 213. 162. 180. 201. 191. 184. 211. 206.]
col Max Pull-ups : [54. nan 75. 50. 82. 68. 60. 55. 70. 85.]
col Run 5k : [1155.   nan 1080. 1052. 1160. 1136. 1105. 1142. 1196. 1140.]
col Snatch : [290.     340.     275.          nan 280.     305.

In [42]:
df[exercise_cols].mean(axis = 0)

Back Squat         468.104702
Chad1000x         3746.771951
Clean and Jerk     327.717131
Deadlift           525.140343
Fight Gone Bad      21.440000
Filthy 50         1480.865641
Fran               243.537834
Grace              180.139427
Helen              578.966379
L1 Benchmark       343.008065
Max Pull-ups       321.001718
Run 5k            1344.961331
Snatch             271.522738
Sprint 400m        110.932899
dtype: float64

In [43]:
df.to_csv('2023_BenchMarkStats_men3_cleaned.csv', index = False)