In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import json

In [None]:
# Load the athletes table; every later cell derives its frames from `data`.
athletes_csv = "data/athletes.csv"
data = pd.read_csv(athletes_csv)

In [3]:
# Step 1: drop rows that are missing a value in any of the required columns.
# Step 2: drop the columns this notebook does not keep.
# 'train' is required to be non-null in step 1 and is then removed in step 2.
required_columns = [
    'region', 'age', 'weight', 'height', 'gender',
    'eat', 'train', 'background', 'experience', 'schedule', 'howlong',
    'deadlift', 'candj', 'snatch', 'backsq',
]
unused_columns = [
    'affiliate', 'team', 'name', 'athlete_id',
    'fran', 'helen', 'grace', 'filthy50', 'fgonebad',
    'run400', 'run5k', 'pullups', 'train',
]
data = data.dropna(subset=required_columns).drop(columns=unused_columns)

In [4]:
# Snapshot "v1": the frame as it stands before outlier removal and survey cleaning.
# A deep copy keeps v1 independent of later reassignments of `data`.
v1 = data.copy(deep=True)

In [5]:
# Remove outliers
#
# A row is kept only if it satisfies every bound below. Comparisons against a
# missing value evaluate to False, so such rows are dropped as well.

body_mask = (
    (data['weight'] < 1500)
    & (data['gender'] != '--')
    & (data['age'] >= 18)
    & (data['height'] > 48) & (data['height'] < 96)
)
data = data[body_mask]

# Deadlift uses a per-row upper bound: 636 for 'Female' rows, 1105 otherwise.
# The previous expression `(d > 0) & (d <= 1105) | (female & (d <= 636))` was
# parsed as `((d > 0) & (d <= 1105)) | (female & (d <= 636))` because `&` binds
# tighter than `|`. As a result the 636 cap never removed anything, and Female
# rows with a non-positive deadlift were kept.
deadlift_cap = np.where(data['gender'] == 'Female', 636, 1105)
data = data[(data['deadlift'] > 0) & (data['deadlift'] <= deadlift_cap)]

data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Clean survey data
#
# Cells whose value is exactly 'Decline to answer|' are turned into NaN, then
# any row left without an answer in one of the survey columns is dropped.

decline_dict = {'Decline to answer|': np.nan}
data = data.replace(decline_dict)
data = data.dropna(subset=['background', 'experience', 'schedule', 'howlong', 'eat'])

In [6]:
# Snapshot "v2": the frame after outlier removal and survey cleaning.
# A deep copy keeps v2 independent of any later changes to `data`.
v2 = data.copy(deep=True)

In [14]:
# Target column for v1: the four lifts added together, left to right.
v1["total_lift"] = (
    v1["deadlift"]
    .add(v1["candj"])
    .add(v1["snatch"])
    .add(v1["backsq"])
)

# Fixed-seed 80/20 split, written out next to each other under data/v1/.
train_v1, test_v1 = train_test_split(v1, test_size=0.2, random_state=42)
for v1_path, v1_part in (("data/v1/train.csv", train_v1), ("data/v1/test.csv", test_v1)):
    v1_part.to_csv(v1_path, index=False)

In [15]:
# Target column for v2: the four lifts added together, left to right.
v2["total_lift"] = (
    v2["deadlift"]
    .add(v2["candj"])
    .add(v2["snatch"])
    .add(v2["backsq"])
)

# Fixed-seed 80/20 split, written out next to each other under data/v2/.
train_v2, test_v2 = train_test_split(v2, test_size=0.2, random_state=42)
for v2_path, v2_part in (("data/v2/train.csv", train_v2), ("data/v2/test.csv", test_v2)):
    v2_part.to_csv(v2_path, index=False)

In [None]:
# Fit a random-forest regressor on the v1 training split and score it on the
# v1 test split. Results are collected in `metrics` (MAE, RMSE, R2).
TARGET_COLUMN = "total_lift"

# `columns=` replaces the positional axis argument (`drop("total_lift", 1)`),
# which pandas 2.x no longer accepts.
X_train, y_train = train_v1.drop(columns=TARGET_COLUMN), train_v1[TARGET_COLUMN]
X_test, y_test = test_v1.drop(columns=TARGET_COLUMN), test_v1[TARGET_COLUMN]

# The feature frame still holds text columns (e.g. gender), which the forest
# cannot consume directly. One-hot encode them, then align the test frame to
# the training columns: categories unseen in training are dropped, and
# categories absent from the test split are filled with 0.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# NOTE(review): the features still include deadlift, candj, snatch and backsq,
# and total_lift is computed as their sum earlier in this notebook, so the
# target is fully determined by the inputs. Confirm this is intended.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

metrics = {
  "MAE": mean_absolute_error(y_test, preds),
  # root_mean_squared_error already returns the RMSE and has no `squared`
  # parameter; passing one raises TypeError.
  "RMSE": root_mean_squared_error(y_test, preds),
  "R2": r2_score(y_test, preds)
}