In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

import numpy as np
import pandas as pd
import os

import _pickle as cPickle
import joblib

In [4]:
data = pd.read_csv('usapl_data.csv')

In [5]:
data

Unnamed: 0,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
0,M,SBD,Single-ply,27.0,24-34,24-39,M-O,92.7,93,,...,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,,Team Phoinix Qualifyer
1,M,SBD,Single-ply,33.5,24-34,24-39,M-O,118.0,120,,...,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,,Team Phoinix Qualifyer
2,M,SBD,Raw,24.5,24-34,24-39,MR-O,89.6,93,,...,Yes,,,USAPL,IPF,2015-02-07,USA,WA,,Team Phoinix Qualifyer
3,M,SBD,Single-ply,39.5,35-39,40-49,M-M1a,114.8,120,,...,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,,Team Phoinix Qualifyer
4,M,SBD,Raw,40.5,40-44,40-49,MR-M1a,129.7,120+,,...,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,,Team Phoinix Qualifyer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214194,M,SBD,Single-ply,18.0,18-19,,M-O,82.1,82.5,,...,Yes,USA,MS,USAPL,IPF,2008-08-02,USA,MS,Philadelphia,State Meet
214195,M,SBD,Single-ply,51.0,50-54,50-59,M-O,,125+,,...,Yes,USA,MS,USAPL,IPF,2008-08-02,USA,MS,Philadelphia,State Meet
214196,M,SBD,Single-ply,17.0,16-17,14-18,M-O,82.1,82.5,,...,Yes,USA,MS,USAPL,IPF,2008-08-02,USA,MS,Philadelphia,State Meet
214197,M,SBD,Single-ply,21.0,20-23,19-23,M-O,,125+,,...,Yes,USA,MS,USAPL,IPF,2008-08-02,USA,MS,Philadelphia,State Meet


In [6]:
data.dtypes

Sex                  object
Event                object
Equipment            object
Age                 float64
AgeClass             object
BirthYearClass       object
Division             object
BodyweightKg        float64
WeightClassKg        object
Squat1Kg            float64
Squat2Kg            float64
Squat3Kg            float64
Squat4Kg            float64
Best3SquatKg        float64
Bench1Kg            float64
Bench2Kg            float64
Bench3Kg            float64
Bench4Kg            float64
Best3BenchKg        float64
Deadlift1Kg         float64
Deadlift2Kg         float64
Deadlift3Kg         float64
Deadlift4Kg         float64
Best3DeadliftKg     float64
TotalKg             float64
Place                object
Dots                float64
Wilks               float64
Glossbrenner        float64
Goodlift            float64
Tested               object
Country              object
State                object
Federation           object
ParentFederation     object
Date                

In [7]:
data.isnull().sum()

Sex                      0
Event                    0
Equipment                0
Age                  29496
AgeClass             22925
BirthYearClass       25177
Division                 0
BodyweightKg          2640
WeightClassKg          775
Squat1Kg             69074
Squat2Kg             70052
Squat3Kg             72156
Squat4Kg            214199
Best3SquatKg         34381
Bench1Kg             54061
Bench2Kg             55099
Bench3Kg             57282
Bench4Kg            214156
Best3BenchKg         10547
Deadlift1Kg          65008
Deadlift2Kg          66857
Deadlift3Kg          69539
Deadlift4Kg         214198
Best3DeadliftKg      26065
TotalKg              11179
Place                    0
Dots                 13534
Wilks                13534
Glossbrenner         13534
Goodlift             22445
Tested                   0
Country              36738
State                75454
Federation               0
ParentFederation      4129
Date                     0
MeetCountry              0
M

In [12]:
for column in data.columns:
    if data[column].isnull().sum() > 30000:
        print(column)
#     print(f'{column} {data[column].isnull().sum()}')

Squat1Kg
Squat2Kg
Squat3Kg
Squat4Kg
Best3SquatKg
Bench1Kg
Bench2Kg
Bench3Kg
Bench4Kg
Deadlift1Kg
Deadlift2Kg
Deadlift3Kg
Deadlift4Kg
Country
State
MeetTown


In [14]:
raw_lifters = data[data['Equipment'] == 'Raw']

In [25]:
raw_lifters

Unnamed: 0,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,Goodlift,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetName
2,M,SBD,Raw,24.5,24-34,24-39,MR-O,89.6,93,,...,59.29,Yes,,,USAPL,IPF,2015-02-07,USA,WA,Team Phoinix Qualifyer
4,M,SBD,Raw,40.5,40-44,40-49,MR-M1a,129.7,120+,,...,78.44,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,Team Phoinix Qualifyer
5,M,SBD,Raw,46.5,45-49,40-49,MR-M1b,93.4,105,,...,80.61,Yes,USA,,USAPL,IPF,2015-02-07,USA,WA,Team Phoinix Qualifyer
6,F,SBD,Raw,21.5,20-23,19-23,FR-Jr,65.3,72,,...,53.48,Yes,,,USAPL,IPF,2015-02-07,USA,WA,Team Phoinix Qualifyer
8,F,SBD,Raw,40.5,40-44,40-49,FR-M1a,62.7,63,,...,58.71,Yes,,,USAPL,IPF,2015-02-07,USA,WA,Team Phoinix Qualifyer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213920,M,SBD,Raw,16.5,16-17,14-18,MR-T2,59.2,60,,...,66.23,Yes,USA,,USAPL,IPF,2012-01-01,USA,,7th Northeastern Regional Powerlifting Champio...
213922,M,SBD,Raw,19.5,20-23,19-23,MR-T3,87.0,90,,...,71.11,Yes,,,USAPL,IPF,2012-01-01,USA,,7th Northeastern Regional Powerlifting Champio...
213926,M,SBD,Raw,12.0,5-12,,MR-Y,51.0,52,,...,29.97,Yes,USA,,USAPL,IPF,2012-01-01,USA,,7th Northeastern Regional Powerlifting Champio...
213933,M,BD,Raw,50.5,50-54,50-59,MR-M2a,99.1,100,,...,,Yes,,,USAPL,IPF,2012-01-01,USA,,7th Northeastern Regional Powerlifting Champio...


In [26]:
raw_lifters.isnull().sum()

Sex                     0
Event                   0
Equipment               0
Age                  4003
AgeClass             3753
BirthYearClass       5901
Division                0
BodyweightKg          220
WeightClassKg         443
Squat1Kg            23617
Squat2Kg            24263
Squat3Kg            25812
Best3SquatKg        16762
Bench1Kg            13805
Bench2Kg            14415
Bench3Kg            16068
Best3BenchKg         5113
Deadlift1Kg         19802
Deadlift2Kg         21053
Deadlift3Kg         23027
Best3DeadliftKg     10904
TotalKg              5812
Place                   0
Dots                 5984
Wilks                5984
Glossbrenner         5984
Goodlift            11986
Tested                  0
Country              9153
State               22933
Federation              0
ParentFederation     3874
Date                    0
MeetCountry             0
MeetState             449
MeetName                0
dtype: int64

In [20]:
# Can drop these columns, too many nan values
for column in raw_lifters.columns:
    if raw_lifters[column].isnull().sum() > 30000:
        print(column)

Squat4Kg
Bench4Kg
Deadlift4Kg
MeetTown


In [24]:
raw_lifters.drop(['Squat4Kg', 'Bench4Kg', 'Deadlift4Kg', 'MeetTown'], axis=1, inplace=True)

KeyError: "['Squat4Kg' 'Bench4Kg' 'Deadlift4Kg' 'MeetTown'] not found in axis"

In [28]:
raw_lifters.dropna(axis=1)

Unnamed: 0,Sex,Event,Equipment,Division,Place,Tested,Federation,Date,MeetCountry,MeetName
2,M,SBD,Raw,MR-O,1,Yes,USAPL,2015-02-07,USA,Team Phoinix Qualifyer
4,M,SBD,Raw,MR-M1a,1,Yes,USAPL,2015-02-07,USA,Team Phoinix Qualifyer
5,M,SBD,Raw,MR-M1b,1,Yes,USAPL,2015-02-07,USA,Team Phoinix Qualifyer
6,F,SBD,Raw,FR-Jr,1,Yes,USAPL,2015-02-07,USA,Team Phoinix Qualifyer
8,F,SBD,Raw,FR-M1a,1,Yes,USAPL,2015-02-07,USA,Team Phoinix Qualifyer
...,...,...,...,...,...,...,...,...,...,...
213920,M,SBD,Raw,MR-T2,1,Yes,USAPL,2012-01-01,USA,7th Northeastern Regional Powerlifting Champio...
213922,M,SBD,Raw,MR-T3,1,Yes,USAPL,2012-01-01,USA,7th Northeastern Regional Powerlifting Champio...
213926,M,SBD,Raw,MR-Y,1,Yes,USAPL,2012-01-01,USA,7th Northeastern Regional Powerlifting Champio...
213933,M,BD,Raw,MR-M2a,DQ,Yes,USAPL,2012-01-01,USA,7th Northeastern Regional Powerlifting Champio...
