# Modeling for Powerlifting Dataset

The cells below read in the data and import the libraries to assist with the analysis.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel

In [2]:
data1 = './data_powerlifting/openpowerlifting.csv'
data2 = './data_powerlifting/meets.csv'
powerlift = pd.read_table(data1, sep=',')
meets = pd.read_table(data2, sep=',')
powerlift_meets = pd.merge(powerlift, meets)
power = powerlift_meets

The cell below repaces the names of the columns to be lowercase for easier coding during the analysis

In [3]:
powerlift.rename(str.lower, axis='columns', inplace = True)

In [4]:
powerlift.drop(['squat4kg', 'bench4kg', 'deadlift4kg','meetid','name'], axis=1, inplace=True)
powerlift.dropna(inplace=True)
powerlift.shape

(96753, 12)

**Removed all rows with missing data to clean up dataset**

In [5]:
powerlift.head()

Unnamed: 0,sex,equipment,age,division,bodyweightkg,weightclasskg,bestsquatkg,bestbenchkg,bestdeadliftkg,totalkg,place,wilks
0,F,Wraps,47.0,Mst 45-49,59.6,60.0,47.63,20.41,70.31,138.35,1,155.05
1,F,Single-ply,42.0,Mst 40-44,58.51,60.0,142.88,95.25,163.29,401.42,1,456.38
2,F,Single-ply,42.0,Open Senior,58.51,60.0,142.88,95.25,163.29,401.42,1,456.38
5,F,Wraps,28.0,Open Senior,62.41,67.5,170.1,77.11,145.15,392.36,1,424.4
6,F,Raw,60.0,Mst 60-64,67.31,67.5,124.74,95.25,163.29,383.28,1,391.98


#### The dataset contains male and female data.  I will split these groups apart to mimic competition.  The data will also only consider adults ages 18 and older.

In [6]:
female_lifters = powerlift.loc[powerlift['sex'] == 'F']
female_lifters = female_lifters[female_lifters['age'] >=18]
male_lifters = powerlift.loc[powerlift['sex'] == 'M']
male_lifters = male_lifters[male_lifters['age'] >=18]

##### There are many unique divisions with only one data point so removed single data point divisions. There is nothing interesting to learn against a person competing only against themselves.

In [7]:
counts_male = male_lifters['division'].value_counts()
counts_male[counts_male > 1]
male_lifters = male_lifters[male_lifters['division'].isin(counts_male[counts_male > 500].index)]

In [8]:
counts_female = female_lifters['division'].value_counts()
counts_female[counts_female > 1]
female_lifters = female_lifters[female_lifters['division'].isin(counts_female[counts_female > 500].index)]

In [9]:
print(female_lifters.division.unique())
print(female_lifters.division.shape)
print(male_lifters.division.unique())
print(male_lifters.division.shape)

['Open' 'Juniors' 'Open Women' 'Junior 20-23' 'Master 40-44' 'R-O' 'R-JR']
(16594,)
['Open' 'Junior' 'Submaster' 'M-O' 'M-OR' 'Juniors' 'Junior Men 20-23'
 'Open Men' 'Junior 20-23' 'Submaster 35-39' 'Junior 18-19' 'Master 45-49'
 'Master 40-44' 'MOR' 'R-O' 'R-JR' 'R-C']
(41781,)


# Modeling

In [10]:
powerlift.head(1)

Unnamed: 0,sex,equipment,age,division,bodyweightkg,weightclasskg,bestsquatkg,bestbenchkg,bestdeadliftkg,totalkg,place,wilks
0,F,Wraps,47.0,Mst 45-49,59.6,60,47.63,20.41,70.31,138.35,1,155.05


In [11]:
fd1 = pd.get_dummies(female_lifters.equipment, prefix='equip', drop_first=True)
fd2 = pd.get_dummies(female_lifters.division, prefix='div', drop_first=True)
fd3 = pd.get_dummies(female_lifters.weightclasskg, prefix='weightclass', drop_first=True)

### Created dummified data to perform analysis and gain insights into categorical or class data and concatanate it into a single dataframe.

- **Equipment**: denoting if you use wrist wraps or other types of supports are examples as well as raw, which is no supports.
- **Weight Class**: Although this is numberical, the scale is not always the same and there is no limit at the top of a class
- **Division**: These are another classification of a powerlifter, typically they denote the qualification such as pro or amatuer and can also limit ages

In [12]:
f_dummy = pd.concat([female_lifters, fd1,fd2,fd3], axis=1)

**The following piece of code allows me to generate the list of columns to easily copy features to use in logisitcal regression analysis**

In [13]:
list(f_dummy);

# 1. Logistical Regression to Determine Weight Class

The modeling in this section will first look a logistic regression to determine an athelet's weightclass.  Although this is not extremely interesting, it demonstrates the ability to classify data.

## Feature Selection
After playing around with various features I decided to use all features exlcuding an athlete's weight, which for obvious reasons is perfectly correleated with weightclass.

In [14]:
f_feature = [
 'age',
 'bestsquatkg',
 'bestbenchkg',
 'bestdeadliftkg',
 'totalkg',
 'wilks',
 'equip_Raw',
 'equip_Single-ply',
 'equip_Wraps',
 'div_Juniors',
 'div_Master 40-44',
 'div_Open',
 'div_Open Women',
 'div_R-JR',
 'div_R-O',
]
Xf = f_dummy[f_feature]
yf = f_dummy.weightclasskg

### Once features selected, I fit a multinomial logistic regression

In [15]:
kf = model_selection.KFold(n_splits=5, shuffle=True)
f_LR = LogisticRegression(multi_class='multinomial', solver='sag')

scores = []

for train_index, test_index in kf.split(Xf, yf):
    f_LR = LogisticRegression().fit(Xf.iloc[train_index], yf.iloc[train_index])
    scores.append(f_LR.score(Xf, yf))

print(f'Mean of Accuracy for all folds: {np.mean(scores)}')

Mean of Accuracy for all folds: 0.45492346631312525


In [16]:
print('Baseline:',(2349/16594)*100)

Baseline: 14.15571893455466


**The model returned an accuracy of approximately 45%**

The baseline model would return an accuracy of ~14% making the logistic regression much more accurate.

# 2. Linear Regression Model
The linear regression is a model that could actually be used to assist an athlete in competition.  Given the rules of competition, an athlete's coach could use data of other competitors to determine how much weight you would want to attempt. 

During a powerlifting competition, you may not always be feeling your best and have the ability the achieve your training max.  With predictive insights you could potentially scale down or scale up to achieve a max if necessary giving an edge during the meet.

The typical competition allows you attempt each lift three times.  The data in this set limited me to only the bestlift of all three lifts.  I would foresee an athelete being able to enter each lift to increasingly tune into the required lift needed to win a competition.

**The first step is to make a new dataframe of only one weightclass.  Athletes only compete against others in their own class, so data from other classes is void for this analysis.**

- For this analysis, I will use 67.5kg weightclass since it had the most data points.

In [17]:
female_67raw = female_lifters
female_67raw = female_67raw[female_67raw.place != 'G']
female_67raw.place = female_67raw.place.astype(float)
female_67raw = female_67raw[female_67raw['weightclasskg'] == '67.5']
female_67raw = female_67raw[female_67raw['equipment'] == 'Raw']
print(female_67raw.shape)

(1849, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [18]:
female_67raw.sample(n=3)

Unnamed: 0,sex,equipment,age,division,bodyweightkg,weightclasskg,bestsquatkg,bestbenchkg,bestdeadliftkg,totalkg,place,wilks
369863,F,Raw,33.0,Open,65.6,67.5,97.5,50.0,160.0,307.5,3.0,320.42
347327,F,Raw,51.0,Open,65.3,67.5,122.5,80.0,130.0,332.5,1.0,347.641
369299,F,Raw,40.0,Master 40-44,63.0,67.5,75.0,35.0,87.5,197.5,1.0,212.11


**Model is fitted using 5 K-folds**

## Break

In [19]:
f_lr_feat1 = [
    'bestsquatkg',
    'wilks',
    'age'
]

X1 = female_67raw[f_lr_feat1]
y1 = female_67raw.totalkg

kf1 = model_selection.KFold(n_splits=5, shuffle=True)

mse_values1 = []
scores1 = []

for train_index1, test_index1 in kf1.split(X1, y1):
    lr1 = LinearRegression().fit(X1.iloc[train_index1], y1.iloc[train_index1])
    mse_values1.append(metrics.mean_squared_error(y1.iloc[test_index1], lr1.predict(X1.iloc[test_index1])))
    scores1.append(lr1.score(X1, y1))

print('~~Linear Regression~~')    
print('Mean of MSE for all folds: {}'.format(np.mean(mse_values1)))
print('Mean of R2 for all folds: {}'.format(np.mean(scores1)))

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
# linreg = LinearRegression()
# linreg.fit(X_train, y_train)
# y_pred = linreg.predict(X_test)
# np.sqrt(metrics.mean_squared_error(y_test, y_pred))

y1_pred = lr1.predict(X1)
print('Baseline:',metrics.mean_squared_error(y1, y1_pred))

~~Linear Regression~~
Mean of MSE for all folds: 36.02518892788538
Mean of R2 for all folds: 0.9888580535791716
Baseline: 35.78879261733389


**Random forest score**

In [20]:
#Random Forest Model
rfreg1 = RandomForestRegressor(n_estimators=150,
                              max_features=3,
                              oob_score=True)
rfreg1.fit(X1, y1);

#Random Forest Scoring Parameters
kf_1rf = KFold(n_splits=10, shuffle=True)
scores_1rf = cross_val_score(rfreg1, X1, y1, cv=kf, scoring='neg_mean_squared_error')
print('MSE Random Forest:',np.mean(np.sqrt(-scores_1rf)))
print('R^2 Random Forest:',(rfreg1.oob_score_))

#train test split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1)
rfreg1 = RandomForestRegressor(n_estimators=150, max_features=3, oob_score=True)
rfreg1.fit(X1_train, y1_train)

# Set a threshold for which features to include.
print(SelectFromModel(rfreg1, threshold='mean', prefit=True).transform(X1_train).shape)
print(SelectFromModel(rfreg1, threshold='median', prefit=True).transform(X1_train).shape)

# Create a new feature matrix that only includes important features.
X1_important =  SelectFromModel(rfreg1, threshold='mean', prefit=True).transform(X1_test)

# Check the RMSE for a random forest that only includes important features.
rfreg1 = RandomForestRegressor(n_estimators=150, max_features=1)
scores_rfreg1 = cross_val_score(rfreg1, X1_important, y1_test, cv=10, scoring='neg_mean_squared_error')
print('Important Features MSE:',np.mean(np.sqrt(-scores_rfreg1)))

MSE Random Forest: 6.554482242099394
R^2 Random Forest: 0.9870717134969316
(1386, 1)
(1386, 2)
Important Features MSE: 7.956978205249767


## Break

In [21]:
lr2 = LinearRegression()
f_lr_feat2 = [
    'bestsquatkg',
    'wilks',
    'age',
    'bestbenchkg'
]

X2 = female_67raw[f_lr_feat2]
y2 = female_67raw.totalkg

kf2 = model_selection.KFold(n_splits=5, shuffle=True)

mse_values2 = []
scores2 = []

for train_index2, test_index2 in kf2.split(X2, y2):
    lr2 = LinearRegression().fit(X2.iloc[train_index2], y2.iloc[train_index2])
    mse_values2.append(metrics.mean_squared_error(y2.iloc[test_index2], lr2.predict(X2.iloc[test_index2])))
    scores2.append(lr2.score(X2, y2))
    
print('~~Linear Regression~~')    
print('Mean of MSE for all folds: {}'.format(np.mean(mse_values2)))
print('Mean of R2 for all folds: {}'.format(np.mean(scores2)))

y2_pred = lr2.predict(X2)
print('Baseline:',metrics.mean_squared_error(y2, y2_pred))

~~Linear Regression~~
Mean of MSE for all folds: 33.7352354706497
Mean of R2 for all folds: 0.9895480511650809
Baseline: 33.581563722097634


In [22]:
print('~~Random Forest~~')
#Random Forest Model
rfreg2 = RandomForestRegressor(n_estimators=150,
                              max_features=4,
                              oob_score=True)
rfreg2.fit(X2, y2);

#Random Forest Scoring Parameters
kf_2rf = KFold(n_splits=10, shuffle=True)
scores_2rf = cross_val_score(rfreg2, X2, y2, cv=kf, scoring='neg_mean_squared_error')
print('MSE Random Forest:',np.mean(np.sqrt(-scores_2rf)))
print('R^2 Random Forest:',(rfreg2.oob_score_))

#train test split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2)
rfreg2 = RandomForestRegressor(n_estimators=150, max_features=4, oob_score=True)
rfreg2.fit(X2_train, y2_train)

# Set a threshold for which features to include.
print(SelectFromModel(rfreg2, threshold='mean', prefit=True).transform(X2_train).shape)
print(SelectFromModel(rfreg2, threshold='median', prefit=True).transform(X2_train).shape)

# Create a new feature matrix that only includes important features.
X2_important = SelectFromModel(rfreg2, threshold='mean', prefit=True).transform(X2_test)

# Check the RMSE for a random forest that only includes important features.
rfreg2 = RandomForestRegressor(n_estimators=150, max_features=1)
scores_rfreg2 = cross_val_score(rfreg2, X2_important, y2_test, cv=10, scoring='neg_mean_squared_error')
print('Important Features MSE:',np.mean(np.sqrt(-scores_rfreg2)))

~~Random Forest~~
MSE Random Forest: 6.371469254145849
R^2 Random Forest: 0.9878553000990234
(1386, 1)
(1386, 2)
Important Features MSE: 8.417412252903407
