In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.datasets import load_digits
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import learning_curve
from sklearn import metrics

In [None]:
# Load the full lifting dataset.
# A random down-sampling step (n = 767672 rows, s = 40000 kept, skip list built
# with random.sample) used to live here but is disabled, so every row is read.
csv_path = 'data.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Columns that play no part in this experiment: identifiers, per-attempt
# lifts, placings, dates and alternative scoring columns.
unused_columns = [
    'Name', 'AgeClass',
    'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',
    'Place', 'Tested',
    'Date', 'TotalKg', 'Squat1Kg',
    'Squat2Kg', 'Squat3Kg', 'Bench1Kg', 'Bench2Kg',
    'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg',
    'Deadlift3Kg', 'McCulloch', 'Glossbrenner',
    'WeightClassKg',
]
data = data.drop(columns=unused_columns)

# Missing-value count for each remaining column (this is the cell's output).
data.isna().sum()

In [None]:
# Keep only athletes who entered a complete squat-bench-deadlift competition,
# i.e. discard the partial-event codes listed below.
#
# The filtered rows are stored under a new name and `data` itself is left
# untouched: the per-event subsets built afterwards filter `data` on
# 'S', 'B', 'D', 'SD', 'SB' and 'BD', and every one of them (including the
# bench-press subset this notebook models) would be empty if those rows were
# dropped from `data` in place here.
partial_events = ['S', 'B', 'D', 'SD', 'SB', 'BD']
data_full_meet = data[~data['Event'].isin(partial_events)].copy()
data_full_meet

In [None]:
# Split the main dataset by competition type so a regression can be run per event.
# Each subset is an explicit copy: later cells modify the subsets (dropping
# columns, re-encoding values), and doing that on a bare slice of `data`
# triggers pandas' SettingWithCopyWarning.
data_s = data[data['Event'] == 'S'].copy()      # --> Squat subset
data_b = data[data['Event'] == 'B'].copy()      # --> Bench subset
data_d = data[data['Event'] == 'D'].copy()      # --> Deadlift subset
data_sd = data[data['Event'] == 'SD'].copy()    # --> Squat, Deadlift subset
data_sb = data[data['Event'] == 'SB'].copy()    # --> Squat, Bench subset
data_bd = data[data['Event'] == 'BD'].copy()    # --> Bench, Deadlift subset
data_sbd = data[data['Event'] == 'SBD'].copy()  # --> All lifts subset

# Report how many rows with a non-null 'Sex' value each subset holds.
subset_labels = [
    ('Squats', data_s),
    ('Benchpres', data_b),
    ('Deadlifts', data_d),
    ('Squats and Deadlifts', data_sd),
    ('Squats and Benchpresses', data_sb),
    ('Benchpresses and Deadlifts', data_bd),
    ('All', data_sbd),
]
for subset_label, subset_frame in subset_labels:
    print(subset_label + ': ' + str(subset_frame['Sex'].count()))


# Benchpress Subset

In [None]:
# Reduce the bench subset to what the bench model needs:
#   * drop the squat and deadlift result columns,
#   * discard rows with no recorded best bench press (the regression target),
#   * renumber the rows from 0.
# The frame is rebuilt through a method chain instead of `inplace=True` calls,
# because in-place edits on a frame that was sliced out of `data` raise
# SettingWithCopyWarning and may not behave as intended.
data_b = (
    data_b
    .drop(columns=['Best3SquatKg', 'Best3DeadliftKg'])
    .dropna(subset=['Best3BenchKg'])
    .reset_index(drop=True)
)
data_b

In [None]:
# Replace the text categories with numbers the regressors can consume.
# Sex becomes 1 for 'M' and 0 for 'F'.
sex = {'M': 1, 'F': 0}
# Equipment collapses to a flag: 0 for 'Raw', 1 for every supported kind of gear.
equipment = {'Raw': 0, 'Wraps': 1, 'Multi-ply': 1, 'Single-ply': 1, 'Straps': 1}

# Direct dictionary lookups: a value missing from a mapping raises KeyError
# instead of silently turning into NaN.
data_b['Sex'] = data_b['Sex'].map(lambda code: sex[code])
data_b['Equipment'] = data_b['Equipment'].map(lambda kind: equipment[kind])
data_b

In [None]:
# Min-max scale the model columns so each one spans the range 0-1.
# NOTE(review): the scaler is fitted on the whole frame, target column
# included; confirm that fitting before any train/test split is intended.
scaled_columns = ['Sex', 'Age', 'Equipment', 'BodyweightKg',
                  'Wilks', 'Best3BenchKg']
scaler = MinMaxScaler(feature_range=(0, 1))
data_b[scaled_columns] = scaler.fit_transform(data_b[scaled_columns])

In [None]:
# Assemble the inputs and the target for the bench-press model.
# Aim: see how accurately a lifter's best bench can be predicted, and later
# which variables matter most. How far do age, sex, weight and equipment get us?
# NOTE(review): 'Wilks' is presumably computed from the lifted weight and
# bodyweight; if so, it leaks the target into the features. Confirm.
feature_columns = ['Sex', 'Age', 'BodyweightKg', 'Equipment', 'Wilks']
target_columns = ['Best3BenchKg']

test_data_b = data_b[feature_columns]

# Double brackets keep the target as a one-column DataFrame.
target_bench_b = data_b[target_columns]

In [None]:
# Short aliases used by the modelling cells: X = features, b = bench target.
X, b = test_data_b, target_bench_b

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
bench_split = train_test_split(
    X,
    b,
    test_size=0.1,
    random_state=3,
)
X_train, X_test, b_train, b_test = bench_split

In [None]:
# Random forest regressor for the bench press target:
# 100 trees, each limited to a depth of 30.
# random_state pins the bootstrap sampling and feature selection so the scores
# printed later are the same on every run; without it each fit yields a
# different forest. The seed matches the one used for the train/test split.
brfregr = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=3)

In [None]:
# Train the forest; the one-column target frame is flattened to the 1-D array
# scikit-learn expects.
brfregr.fit(X_train, b_train.to_numpy().ravel())

In [None]:
# Score the random forest on the held-out rows.
B_rfr = brfregr.predict(X_test)
# Flatten the one-column target frame so the metrics and the standard
# deviation all operate on a plain 1-D array.
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first reports a different (wrong) number.
print("R2-score: %.2f" % r2_score(bench_true, B_rfr))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, B_rfr)))
# ddof=1 reproduces the sample standard deviation DataFrame.std() reports,
# but yields a scalar that "%.2f" can format directly (the original passed a
# one-element Series to the format string).
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# NOTE(review): this refits the same forest on the same training data as the
# earlier fit cell; confirm whether the repetition is still needed.
brfregr.fit(X_train, b_train.to_numpy().ravel())

In [None]:
# Score the (re)fitted random forest on the held-out rows.
B_rfr = brfregr.predict(X_test)
# Flatten the one-column target frame so the metrics and the standard
# deviation all operate on a plain 1-D array.
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first reports a different (wrong) number.
print("R2-score: %.2f" % r2_score(bench_true, B_rfr))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, B_rfr)))
# ddof=1 reproduces the sample standard deviation DataFrame.std() reports,
# but yields a scalar that "%.2f" can format directly.
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# NOTE(review): another repeat of the bench forest fit on the same training
# data; confirm whether the repetition is still needed.
brfregr.fit(X_train, b_train.to_numpy().ravel())

In [None]:
# Score the forest on the held-out rows.
# This cell used to reference `drfregr` and `d_test`, neither of which is
# defined in this notebook (their definitions are commented out), so it failed
# with NameError. It now uses the bench forest and the bench test split, the
# only model/target pair that exists here. The result name D_rfr is kept.
D_rfr = brfregr.predict(X_test)
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred); the order matters because R^2 is not symmetric.
print("R2-score: %.2f" % r2_score(bench_true, D_rfr))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, D_rfr)))
# Scalar sample standard deviation (ddof=1 matches DataFrame.std()).
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# Create three independent, unfitted linear regression models; their scores
# are checked in the cells that follow.
S_linreg, B_linreg, D_linreg = (LinearRegression() for _ in range(3))

In [None]:
# Fit on the bench training target: `s_train` is never defined in this
# notebook (only the bench split b_train/b_test exists), so the original call
# raised NameError.
S_linreg.fit(X_train, b_train.values.ravel())

In [None]:
# Score S_linreg on the held-out rows.
# `s_test` is not defined in this notebook, so the bench test target is used,
# the only held-out target that exists here.
S_lin = S_linreg.predict(X_test)
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred); the order matters because R^2 is not symmetric.
print("R2-score: %.2f" % r2_score(bench_true, S_lin))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, S_lin)))
# Scalar sample standard deviation (ddof=1 matches DataFrame.std()).
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# Train the bench linear model on the flattened (1-D) training target.
B_linreg.fit(X_train, b_train.to_numpy().ravel())

In [None]:
# Score the bench linear model on the held-out rows.
B_lin = B_linreg.predict(X_test)
# Flatten the one-column target frame to a plain 1-D array.
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first reports a different (wrong) number.
print("R2-score: %.2f" % r2_score(bench_true, B_lin))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, B_lin)))
# Scalar sample standard deviation (ddof=1 matches DataFrame.std()).
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# Fit on the bench training target: `d_train` is never defined in this
# notebook (only the bench split b_train/b_test exists), so the original call
# raised NameError.
D_linreg.fit(X_train, b_train.values.ravel())

In [None]:
# Score D_linreg on the held-out rows.
# `d_test` is not defined in this notebook, so the bench test target is used,
# the only held-out target that exists here.
D_lin = D_linreg.predict(X_test)
bench_true = b_test.values.ravel()
# r2_score expects (y_true, y_pred); the order matters because R^2 is not symmetric.
print("R2-score: %.2f" % r2_score(bench_true, D_lin))
print("RMSE: %.2f" % np.sqrt(metrics.mean_squared_error(bench_true, D_lin)))
# Scalar sample standard deviation (ddof=1 matches DataFrame.std()).
print("Std: %.2f" % bench_true.std(ddof=1))

In [None]:
# Gather what the plotting cells need: held-out predictions from the forest
# and from a linear model, plus the matching bodyweight column.
# The variable names are kept because the plotting cells read them, but the
# values come from the bench models: `srfregr` is not defined anywhere in this
# notebook, and the bench models are the ones trained on an existing target.
sfr_pred = pd.DataFrame(brfregr.predict(X_test))

slr_pred = pd.DataFrame(B_linreg.predict(X_test))

# One-column frame of test-set bodyweights, renumbered from 0 so its rows line
# up positionally with the prediction frames above.
S_bw = X_test[['BodyweightKg']].reset_index(drop=True)

In [None]:
# Bench press against bodyweight (both min-max scaled): the trend in the real
# held-out targets ("Real") next to the trend in the forest's predictions ("Pred").
xfit = np.linspace(0, 1)

# Work with plain arrays: the trend models are queried with the NumPy grid
# `xfit`, and fitting them on a DataFrame makes scikit-learn warn about
# missing feature names at predict time.
bodyweight = S_bw.to_numpy()
bodyweight_1d = bodyweight.ravel()
# `s_test` is not defined in this notebook; the bench test target is the
# held-out target that exists here.
bench_true = b_test.values.ravel()

yfit = RandomForestRegressor().fit(bodyweight, sfr_pred.values.ravel()).predict(xfit[:, None])
zfit = RandomForestRegressor().fit(bodyweight, bench_true).predict(xfit[:, None])

plt.figure(figsize=(10, 10))

# The trends are drawn as fitted. The original passed them through np.sin,
# which distorts the curve and has no meaning for these values.
trend_panels = [(zfit, 'b', 'Real'), (yfit, 'r', 'Pred')]
for panel_number, (trend, trend_colour, trend_label) in enumerate(trend_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.scatter(bodyweight_1d, bench_true, color='g', label='Actual Bench', alpha=0.2)
    plt.plot(xfit, trend, color=trend_colour, label=trend_label)
    plt.plot([0, 0.8], [0, 0.8], 'k--')  # dashed reference diagonal
    plt.xlabel('BodyweightKg (scaled)')
    plt.ylabel('Best3BenchKg (scaled)')
    plt.legend()
    plt.grid(True)

plt.show()

In [None]:
# Trend of the forest's predictions over bodyweight on the grid 0-0.8, drawn
# with a constant error bar of 0.1 around every point.
xfit = np.linspace(0, 0.8)
bodyweight_forest = RandomForestRegressor()
bodyweight_forest.fit(S_bw, sfr_pred.values.ravel())
yfit = bodyweight_forest.predict(xfit[:, None])

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.errorbar(xfit, yfit, yerr=0.1, fmt='.g')
ax.plot([0, 0.8], [0, 0.8], 'k--')  # dashed reference diagonal
ax.grid(True)
plt.show()

In [None]:
# Bench press against bodyweight (both min-max scaled): straight-line trend
# through the real held-out targets ("Real") next to the trend through the
# linear model's predictions ("Pred").
xfit = np.linspace(0, 1)

# Plain arrays keep fit and predict inputs consistent (the grid is a NumPy array).
bodyweight = S_bw.to_numpy()
bodyweight_1d = bodyweight.ravel()
# `s_test` is not defined in this notebook; the bench test target is the
# held-out target that exists here.
bench_true = b_test.values.ravel()

yfit = LinearRegression().fit(bodyweight, slr_pred.values.ravel()).predict(xfit[:, None])
zfit = LinearRegression().fit(bodyweight, bench_true).predict(xfit[:, None])

plt.figure(figsize=(10, 10))

# The trends are drawn as fitted. The original passed them through np.sin,
# which distorts the line and has no meaning for these values.
trend_panels = [(zfit, 'b', 'Real'), (yfit, 'r', 'Pred')]
for panel_number, (trend, trend_colour, trend_label) in enumerate(trend_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.scatter(bodyweight_1d, bench_true, color='g', label='Actual Bench', alpha=0.2)
    plt.plot(xfit, trend, color=trend_colour, label=trend_label)
    plt.plot([0, 0.8], [0, 0.8], 'k--')  # dashed reference diagonal
    plt.xlabel('BodyweightKg (scaled)')
    plt.ylabel('Best3BenchKg (scaled)')
    plt.legend()
    plt.grid(True)

# The trailing `ax.grid(True)` was removed: `ax` belonged to an earlier figure,
# and both panels here already have their grid switched on.
plt.show()

In [None]:
# Side-by-side scatter of bench press against bodyweight (both min-max scaled):
# real held-out values on the left, the linear model's predictions on the right.
# The trend fits that used to open this cell were never plotted and referenced
# the undefined `s_test`, so they have been dropped.
bodyweight_1d = S_bw.to_numpy().ravel()
# `s_test` is not defined in this notebook; the bench test target is the
# held-out target that exists here.
bench_true = b_test.values.ravel()

plt.figure(figsize=(10, 10))

scatter_panels = [
    (bench_true, 'g', 'Actual Bench'),
    (slr_pred.values.ravel(), 'purple', 'Predicted Bench'),
]
for panel_number, (bench_values, point_colour, point_label) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.scatter(bodyweight_1d, bench_values, color=point_colour, label=point_label, alpha=0.2)
    plt.plot([0, 0.8], [0, 1], 'k--')  # dashed reference line
    plt.xlabel('BodyweightKg (scaled)')
    plt.ylabel('Best3BenchKg (scaled)')
    plt.legend()
    plt.grid(True)

# The trailing `ax.grid(True)` was removed: `ax` belonged to an earlier figure,
# and both panels here already have their grid switched on.
plt.show()

In [None]:
# Learning curve for linear regression on the bench target: mean absolute
# error on the training and validation folds at increasing training-set sizes.
# NOTE(review): the largest size (19000) must not exceed the rows available in
# one training fold, otherwise learning_curve raises. Confirm against the
# size of the bench subset.
training_sizes = [1, 500, 2000, 5000, 7500, 10000, 15000, 19000]

train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LinearRegression(),
    X=X,
    # `s` is not defined in this notebook; `b` is the bench target that pairs
    # with X. It is flattened to 1-D like in the fit cells.
    y=b.values.ravel(),
    train_sizes=training_sizes,
    cv=100,
    scoring='neg_mean_absolute_error',
)

# The scorer returns negated errors; flip the sign and scale by 1000 so the
# small errors on the 0-1 scaled target are readable on the plot.
train_scores_mean = -train_scores.mean(axis=1) * 1000
validation_scores_mean = -validation_scores.mean(axis=1) * 1000

In [None]:
# Draw the learning curves: training error against validation error as the
# training set grows.
plt.figure(figsize=(7.5, 7.5))

curve_specs = (
    (train_scores_mean, 'b--', 'Training error'),
    (validation_scores_mean, 'r--', 'Test error'),
)
for curve_values, curve_style, curve_label in curve_specs:
    plt.plot(train_sizes, curve_values, curve_style, label=curve_label)

plt.title('Learning curves for linear regression', fontsize=12, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MAE*1000', fontsize=14)
plt.grid(True)
plt.legend()

# Fixed viewing window; the x-limit call stays last so the cell's displayed
# value is unchanged.
plt.ylim(0, 50)
plt.xlim(0, 15000)