In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
import xgboost as xgb

In [12]:
# Import model training data
data = pd.read_csv('../data/encoded_training_data.csv')

# Separate features from target
X, y = data[['Sex_F', 'Sex_M', 'Age', 'BodyweightKg', 'Best3SquatKg', 'Best3BenchKg']], data['Best3DeadliftKg']
X

Unnamed: 0,Sex_F,Sex_M,Age,BodyweightKg,Best3SquatKg,Best3BenchKg
0,0,1,13.0,63.20,120.0,77.5
1,0,1,13.0,63.20,120.0,77.5
2,1,0,13.0,44.20,62.5,42.5
3,1,0,13.0,65.90,105.0,60.0
4,0,1,13.0,94.30,140.0,110.0
...,...,...,...,...,...,...
93451,0,1,85.0,80.60,65.0,80.0
93452,0,1,85.5,88.70,37.5,55.0
93453,1,0,90.0,71.34,20.0,30.0
93454,1,0,91.5,70.40,20.0,30.0


In [39]:
# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y,
                      test_size = 0.3, random_state = 123)
  
# Instantiation
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 200, seed = 123)
  
# Fitting the model
xgb_r.fit(X_train, y_train)
  
# Predict the model
pred = xgb_r.predict(X_test)
  
# MSE and RMSE Computation
mse = MSE(y_test, pred)
r2 = R2(y_test, pred)
rmse = np.sqrt(mse)
print(f'r2: {r2}')
print("MSE: % f" %(mse))
print("RMSE: % f" %(rmse))

xgb_r.save_model("../models/deadlift.txt")

r2: 0.9104610193483963
MSE:  287.565841
RMSE:  16.957766


In [32]:
# We need to prepare data as DMatrix objects
train = xgb.DMatrix(X_train, y_train)
test = xgb.DMatrix(X_test, y_test)

# We need to define parameters as dict
params = {
    "learning_rate": 0.01,
    "max_depth": 3
}
# training, we set the early stopping rounds parameter
model_xgb = xgb.train(params, 
          train, evals=[(train, "train"), (test, "validation")], 
          num_boost_round=200, early_stopping_rounds=20)

[0]	train-rmse:193.68144	validation-rmse:193.32722
[1]	train-rmse:191.76684	validation-rmse:191.41487
[2]	train-rmse:189.87156	validation-rmse:189.52153
[3]	train-rmse:187.99548	validation-rmse:187.64752
[4]	train-rmse:186.13834	validation-rmse:185.79268
[5]	train-rmse:184.29997	validation-rmse:183.95588
[6]	train-rmse:182.48019	validation-rmse:182.13880
[7]	train-rmse:180.67873	validation-rmse:180.33951
[8]	train-rmse:178.89554	validation-rmse:178.55906
[9]	train-rmse:177.13032	validation-rmse:176.79594
[10]	train-rmse:175.38300	validation-rmse:175.05145
[11]	train-rmse:173.65331	validation-rmse:173.32382
[12]	train-rmse:171.94108	validation-rmse:171.61470
[13]	train-rmse:170.24605	validation-rmse:169.92193
[14]	train-rmse:168.56820	validation-rmse:168.24631
[15]	train-rmse:166.90729	validation-rmse:166.58840
[16]	train-rmse:165.26322	validation-rmse:164.94652
[17]	train-rmse:163.63565	validation-rmse:163.32192
[18]	train-rmse:162.02471	validation-rmse:161.71309
[19]	train-rmse:160.42

In [24]:
model_xgb.best_ntree_limit

500

In [25]:
model_xgb.predict(test)

array([256.57397 , 108.301476, 119.78945 , ..., 126.498505, 297.5051  ,
       165.24431 ], dtype=float32)

In [26]:
model_xgb.predict(test, ntree_limit=model_xgb.best_ntree_limit)

array([256.57397 , 108.301476, 119.78945 , ..., 126.498505, 297.5051  ,
       165.24431 ], dtype=float32)

In [None]:
# Test more XGBoost models, tune parameters

# Run GridSearchCV with param grid of n_estimators and other params