In [19]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
import xgboost as xg

In [20]:
# Function to perform one-hot encoding

def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a new frame with one column replaced by one-hot indicator columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        Every other column in its original order, followed by one indicator
        column per distinct value of the encoded feature, named
        ``<feature>_<value>``.

    Raises
    ------
    KeyError
        If ``feature_to_encode`` is not a column of ``original_dataframe``.
    """
    # Naming the column explicitly forces encoding whatever its dtype. With the
    # default ``columns=None`` get_dummies only converts object/string/category
    # columns, so a numeric feature would pass through unchanged and then be
    # removed entirely by the drop, silently losing the feature.
    dummies = pd.get_dummies(
        original_dataframe[[feature_to_encode]], columns=[feature_to_encode]
    )
    remaining = original_dataframe.drop(columns=[feature_to_encode])
    return pd.concat([remaining, dummies], axis=1)

In [21]:
# Load the model training data from the project's data directory
data = pd.read_csv('../data/model_training_data.csv')

# Predictors: sex, age, bodyweight and best squat/bench lifts
X = data[['Sex', 'Age', 'BodyweightKg', 'Best3SquatKg', 'Best3BenchKg']]
# Target: best deadlift
y = data['Best3DeadliftKg']

# Replace each categorical predictor with its one-hot indicator columns
features_to_encode = ['Sex']
for feature in features_to_encode:
    X = encode_and_bind(X, feature)

# Display the encoded feature matrix
X


Unnamed: 0,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Sex_F,Sex_M
0,13.0,63.20,120.0,77.5,0,1
1,13.0,63.20,120.0,77.5,0,1
2,13.0,44.20,62.5,42.5,1,0
3,13.0,65.90,105.0,60.0,1,0
4,13.0,94.30,140.0,110.0,0,1
...,...,...,...,...,...,...
93451,85.0,80.60,65.0,80.0,0,1
93452,85.5,88.70,37.5,55.0,0,1
93453,90.0,71.34,20.0,30.0,1,0
93454,91.5,70.40,20.0,30.0,1,0


In [22]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Gradient-boosted regressor with squared-error loss and 200 estimators.
# `random_state` is the documented scikit-learn-style seed parameter; the
# `seed` keyword is a deprecated alias in XGBoost's sklearn wrapper.
xgb_r = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    random_state=123,
)

# Fit on the training portion only
xgb_r.fit(train_X, train_y)

# Predict on the held-out rows
pred = xgb_r.predict(test_X)

# Evaluate on the held-out rows: R^2, MSE and RMSE (RMSE is in the target's units)
mse = MSE(test_y, pred)
r2 = R2(test_y, pred)
rmse = np.sqrt(mse)
print(f'r2: {r2}')
print("MSE: % f" % mse)
print("RMSE: % f" % rmse)

r2: 0.910431989639151
MSE:  287.659073
RMSE:  16.960515


In [None]:
# TODO: Test additional XGBoost model configurations and tune their hyperparameters.

# TODO: Run GridSearchCV with a parameter grid over n_estimators and other hyperparameters.