In [1]:
# import the libraries needed
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Preparing the train and test dataset
---
- We have obtained the dataset from https://www.openpowerlifting.org/
- From there, we randomised the data rows in the original CSV file and took out a total of 4,000 data points (3,000 for training and 1,000 for testing) to be used for our train and test dataset

In [2]:
# Load the combined (train + test) dataset and preview its first rows
combined_csv_path = '../datasets/train_test_combined.csv'
train_test_combined = pd.read_csv(combined_csv_path)
train_test_combined.head(5)

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
0,Alana Carrasco,F,SBD,Raw,29.5,24-34,24-39,FR-O,68.6,72,...,Yes,USA,AZ,USAPL,IPF,27/8/2017,USA,AZ,,Sun Devil Classic
1,Even Dysjaland,M,SBD,Raw,19.5,20-23,19-23,Juniors 19-23,94.65,105,...,Yes,,,NSF,IPF,13/6/2015,Norway,,Ganddal,Klubbstevne
2,Alessandro Favorito,M,SBD,Single-ply,39.0,35-39,24-39,Open,89.4,90,...,Yes,Italy,,FIPL,IPF,6/11/2004,Italy,,Marina Di Carrara,Coppa Italia A Squadre
3,Sydney Martinez,F,SBD,Single-ply,,,,Girls,55.16,56,...,Yes,USA,,THSWPA,,25/1/2014,USA,TX,JUNIOR HIGH,CARRIZO SPRINGS INVITATIONAL
4,Navy Villar,F,SBD,Single-ply,,,,Girls,55.61,56,...,Yes,,,THSWPA,,18/1/2018,USA,TX,Sadler,S&S Lady Rams Meet


In [8]:
# Partition the combined dataset: 75% of the rows for training, 25% for testing
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    train_test_combined,
    test_size=0.25,
    random_state=0,  # fixed seed so the split is reproducible
)

# Persist both partitions (without the index column) before reporting their sizes
for split_frame, split_path in ((train, '../datasets/train.csv'),
                                (test, '../datasets/test.csv')):
    split_frame.to_csv(split_path, index=False)

print(train.shape)
print(test.shape)

(3000, 41)
(1000, 41)


In [9]:
# Reload the training split from disk into its own dataframe and preview it
train_csv_path = '../datasets/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(5)

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
0,Hanne Bingle,F,SBD,Wraps,61.0,60-64,60-69,M5,73.7,75,...,,UK,,GPC,GPC,1/9/2020,Slovakia,,Trnava,European Championships
1,Mohamed Reda,M,SBD,Raw,,,,Open,99.58,105,...,Yes,Egypt,,AfricanPF,IPF,20/10/2018,Morocco,,Meknes,African & Arab Powerlifting Championships
2,Pauline Reeves,F,SBD,Single-ply,,45-49,,Open,129.3,90+,...,Yes,England,,BAWLA,IPF,26/4/2003,UK,,Livingstone,British Masters' Championships
3,Edie Montalvo,F,SBD,Single-ply,,,,Girls,71.12,75,...,Yes,,,THSWPA,,10/2/2018,USA,TX,Weslaco,Weslaco Girls Invitational Powerlifting Meet
4,Faith Bailey,F,SBD,Single-ply,,,,Girls,78.83,82.3,...,Yes,,,THSWPA,,7/3/2020,USA,TX,Bay City,Region 4 Division 3 Meet


## Data Preprocessing
---

### Removing disqualified lifters

First, find out whether any lifters were disqualified (`DQ`), disqualified for doping (`DD`), or did not show up for the competition (`NS`), then remove them from our dataset.

In [13]:
# List every distinct value that appears in the Place column
train_df.Place.unique()

array(['1', '3', 'DQ', '7', '9', '2', '4', '6', 'NS', '5', '14', '11',
       '10', '8', '13', '12', '18', '15', '22', '24', '32', '26', '17',
       '19', '16', 'G', '28', 'DD', '23', '51', '21', '84', '20', '29'],
      dtype=object)

In [14]:
# Remove every row whose Place is one of the non-result codes DQ, NS or DD
excluded_places = ['DQ', 'NS', 'DD']
is_excluded = train_df['Place'].isin(excluded_places)
df = train_df.drop(index=train_df.index[is_excluded])

In [15]:
# Summarise dtypes and non-null counts after removing the DQ/NS/DD rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2814 entries, 0 to 2999
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              2814 non-null   object 
 1   Sex               2814 non-null   object 
 2   Event             2814 non-null   object 
 3   Equipment         2814 non-null   object 
 4   Age               1613 non-null   float64
 5   AgeClass          1884 non-null   object 
 6   BirthYearClass    1721 non-null   object 
 7   Division          2812 non-null   object 
 8   BodyweightKg      2775 non-null   float64
 9   WeightClassKg     2788 non-null   object 
 10  Squat1Kg          1284 non-null   float64
 11  Squat2Kg          1270 non-null   float64
 12  Squat3Kg          1230 non-null   float64
 13  Squat4Kg          11 non-null     float64
 14  Best3SquatKg      2787 non-null   float64
 15  Bench1Kg          1285 non-null   float64
 16  Bench2Kg          1277 non-null   float64


After dropping disqualified and no-show lifters from the dataset, we are left with 2,814 of the original 3,000 rows.

## Filling Missing Values (Age)
First, use the AgeClass column to fill in missing values for the Age column. We will assume that the lifter's age is the maximum age within the AgeClass.

In [7]:
# Rows that have no Age but do carry an AgeClass we can derive one from
df.loc[df['Age'].isnull() & df['AgeClass'].notnull()]

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
21,Melissa Salinas,F,SBD,Single-ply,,16-17,14-18,Girls,73.39,75,...,Yes,,,THSWPA,,17/1/2015,USA,TX,Skidmore Texas,2nd Annual Skidmore Invitational
26,Amy Hoffman,F,SBD,Single-ply,,45-49,40-49,Masters 45-49,75.00,75,...,,,,USPF,,5/12/2015,USA,WV,Parkersburg,Region 2 Championships
33,Nick Cooper,M,SBD,Single-ply,,13-15,14-18,M-T1,75.00,75,...,Yes,USA,,USAPL,IPF,23/3/2002,USA,FL,Lakeland,Frank Kostyo Memorial
40,Atle Andersen,M,SBD,Single-ply,,24-34,24-39,Open,74.60,75,...,Yes,,,NSF,IPF,24/2/1991,Norway,,Lykkeberghallen,Seriestevne
56,Diane Farrar,F,SBD,Single-ply,,45-49,40-49,F-M1b,60.00,60,...,Yes,,,USAPL,IPF,15/5/2004,USA,RI,Warwick,New England States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3928,J. Shelton,M,SBD,Single-ply,,16-17,14-18,Teen 14-16,67.50,67.5,...,Yes,Canada,,ADFPA,,23/3/1990,USA,MS,Jackson,Mississippi Teenage
3943,Sanjuana Delgado,F,SBD,Single-ply,,18-19,,Girls,55.97,56,...,Yes,,,THSWPA,,5/3/2016,USA,TX,Academy,Region 2 Division 3
3945,Ramirez Jr,M,SBD,Single-ply,,40-44,40-49,Masters 40-44,90.00,90,...,,,,USPF,IPF,14/5/1993,USA,TX,Irving,Masters Nationals
3965,Kayla Anderson #3,F,SBD,Single-ply,,18-19,,Girls,93.71,100,...,Yes,,,THSWPA,,24/1/2019,USA,TX,Henderson,Lion Invitational


In [179]:
def newString(g):
    """Convert an AgeClass-style string such as ``'18-19'`` to its upper bound.

    Parameters
    ----------
    g : object
        A value from the Age column. Strings are parsed; anything else
        (floats, NaN, ...) is passed through unchanged.

    Returns
    -------
    int or object
        For a string, the text after the last ``'-'`` (at most its last two
        characters) converted to ``int``; otherwise ``g`` itself.

    Raises
    ------
    ValueError
        If the extracted text is not a valid integer.
    """
    if not isinstance(g, str):
        return g
    # Isolate the upper bound before slicing, so a one-digit bound such as
    # '5-9' is read as 9 (slicing the raw string keeps the '-' and gives -9)
    # and surrounding whitespace cannot eat one of the digits.
    upper_bound = g.strip().rpartition('-')[2]
    # Keep at most the last two characters, as the previous implementation did.
    return int(upper_bound[-2:])

In [180]:
## First, fill the missing Age values with the AgeClass strings.
## The result is assigned back to the column: calling fillna(inplace=True) on the
## df['Age'] selection is chained assignment, which pandas warns about and which
## does not update df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['AgeClass'])

In [181]:
## Turn every AgeClass string left in the Age column into a number via newString;
## values that are already numeric (or NaN) pass through untouched
age_with_upper_bounds = df['Age'].apply(newString)
df['Age'] = age_with_upper_bounds

In [183]:
# Spot-check a single row by its index label: compare it against the dataframe
# printed earlier to confirm Age was replaced with the upper bound of its AgeClass
df.loc[29]

Name                       Ashley Gilbert
Sex                                     F
Event                                 SBD
Equipment                      Single-ply
Age                                  19.0
AgeClass                            18-19
BirthYearClass                        NaN
Division                            Girls
BodyweightKg                        79.56
WeightClassKg                        82.3
Squat1Kg                              NaN
Squat2Kg                              NaN
Squat3Kg                              NaN
Squat4Kg                              NaN
Best3SquatKg                       102.06
Bench1Kg                              NaN
Bench2Kg                              NaN
Bench3Kg                              NaN
Bench4Kg                              NaN
Best3BenchKg                        72.57
Deadlift1Kg                           NaN
Deadlift2Kg                           NaN
Deadlift3Kg                           NaN
Deadlift4Kg                       

In [121]:
# Re-check dtypes and non-null counts after filling Age from AgeClass
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2828 entries, 0 to 2999
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              2828 non-null   object 
 1   Sex               2828 non-null   object 
 2   Event             2828 non-null   object 
 3   Equipment         2828 non-null   object 
 4   Age               2228 non-null   float64
 5   AgeClass          2228 non-null   object 
 6   BirthYearClass    2118 non-null   object 
 7   Division          2828 non-null   object 
 8   BodyweightKg      2822 non-null   float64
 9   WeightClassKg     2813 non-null   object 
 10  Squat1Kg          1085 non-null   float64
 11  Squat2Kg          1077 non-null   float64
 12  Squat3Kg          1054 non-null   float64
 13  Squat4Kg          5 non-null      float64
 14  Best3SquatKg      1871 non-null   float64
 15  Bench1Kg          1712 non-null   float64
 16  Bench2Kg          1703 non-null   float64


Now the age column only contains numeric float64 datatypes, which shows that we have successfully replaced all the strings after the fillna() operation.

However, there are still some missing values in rows where both Age and AgeClass were missing.

We can fill the remaining missing values for Age with the median age of their respective divisions.

In [184]:
# Rows whose Age is still missing after the AgeClass-based fill
df.loc[df['Age'].isnull()]

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
2,Paige Shuemake,F,SBD,Single-ply,,,,Girls,54.25,56,...,Yes,,,THSWPA,,31/1/2015,USA,TX,Malakoff,Malakoff
3,Pål Nilsen,M,SBD,Single-ply,,,,Open,89.20,90,...,Yes,Norway,,NSF,IPF,6/6/1998,Norway,,Oslo,KM Oslo-Akershus
8,Tori Rash,F,SBD,Single-ply,,,,Girls,59.96,60.1,...,Yes,,,THSWPA,,24/1/2019,USA,TX,Needville,Needville 1-24-19
9,Stina Christiane Valmestad,F,B,Raw,,,,Open,73.23,84,...,Yes,,,NSF,IPF,15/12/2018,Norway,,Sande,Klubbstevne
10,Kyle Monhollen,M,SBD,Raw,,,,MR-O,104.20,105,...,Yes,,,USAPL,IPF,22/8/2015,USA,OH,,Future Summer Shakedown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,Alejandra Fierro,F,SBD,Single-ply,,,,Girls,51.94,51.9,...,Yes,,,THSWPA,,7/2/2019,USA,TX,Plainview,Plainview Triangular
2986,Tor Erik Rødsdalen,M,SBD,Single-ply,,,,Open,89.40,90,...,Yes,,,NSF,IPF,16/2/2002,Norway,,Brumunddal,B-og Veteran NM
2995,Roberto Bettati,M,SBD,Single-ply,,,40-49,Masters 1,88.40,90,...,Yes,Italy,,FIPL,IPF,29/4/2006,Italy,,Nettuno,Campionato Italiano Assoluto
2996,Lachlan Hodgetts,M,B,Raw,,,,Open,72.70,74,...,Yes,Canada,BC,CPU,IPF,23/6/2018,Canada,BC,Surrey,BCPA Provincial Championship


In [217]:
# Median age of each division; kept in `t` because the checking cell below reads it
t = df.groupby(by='Division')['Age'].agg('median')
t

Division
13-14               13.5
17-19               18.0
40-49               45.0
45-49               48.0
55-59               57.0
                    ... 
Youth 10-15         13.5
Youth 11 & Under     6.0
Youth 11-12         11.0
sen                  NaN
Юниорки              NaN
Name: Age, Length: 369, dtype: float64

In [216]:
# For each row, the median Age of that row's Division (aligned to df's index),
# used to fill whatever Age values are still missing
##stackoverflow magic https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group
division_median_age = df.groupby('Division')['Age'].transform('median')
df['Age'] = df['Age'].fillna(division_median_age)


In [221]:
## Sanity check: the filled Age of one row should equal its division's median
row_label = 8
print(df.loc[row_label, 'Age'])  ## value in the Age column after the fillna()
print(t[df.loc[row_label, 'Division']])  ## median value of the division

19.0
19.0


In [222]:
# Re-check non-null counts after the per-division median fill of Age
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2828 entries, 0 to 2999
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              2828 non-null   object 
 1   Sex               2828 non-null   object 
 2   Event             2828 non-null   object 
 3   Equipment         2828 non-null   object 
 4   Age               2793 non-null   float64
 5   AgeClass          2228 non-null   object 
 6   BirthYearClass    2118 non-null   object 
 7   Division          2828 non-null   object 
 8   BodyweightKg      2822 non-null   float64
 9   WeightClassKg     2813 non-null   object 
 10  Squat1Kg          1085 non-null   float64
 11  Squat2Kg          1077 non-null   float64
 12  Squat3Kg          1054 non-null   float64
 13  Squat4Kg          5 non-null      float64
 14  Best3SquatKg      1871 non-null   float64
 15  Bench1Kg          1712 non-null   float64
 16  Bench2Kg          1703 non-null   float64


Drop the remaining NA values for Age

In [227]:
# Remove, in place, every row whose Age is still missing after both fill steps
df.dropna(subset=['Age'], inplace=True)

In [229]:
# Confirm that Age has no missing values left after the dropna
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2793 entries, 0 to 2999
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              2793 non-null   object 
 1   Sex               2793 non-null   object 
 2   Event             2793 non-null   object 
 3   Equipment         2793 non-null   object 
 4   Age               2793 non-null   float64
 5   AgeClass          2228 non-null   object 
 6   BirthYearClass    2114 non-null   object 
 7   Division          2793 non-null   object 
 8   BodyweightKg      2790 non-null   float64
 9   WeightClassKg     2780 non-null   object 
 10  Squat1Kg          1074 non-null   float64
 11  Squat2Kg          1066 non-null   float64
 12  Squat3Kg          1044 non-null   float64
 13  Squat4Kg          4 non-null      float64
 14  Best3SquatKg      1850 non-null   float64
 15  Bench1Kg          1696 non-null   float64
 16  Bench2Kg          1687 non-null   float64


## Filling Missing Values (Bodyweight)

In [231]:
# Rows with no recorded bodyweight
df.loc[df['BodyweightKg'].isnull()]

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,Tested,Country,State,Federation,ParentFederation,Date,MeetCountry,MeetState,MeetTown,MeetName
1086,Richard Nowazek,M,SBD,Single-ply,51.5,50-54,50-59,Open,,140+,...,Yes,Canada,BC,CPU,IPF,6/4/2002,Canada,BC,Vancouver,BC Provincial Championships
1597,Madison Neesmith,F,B,Raw,20.0,20-23,19-23,JR,,82.5+,...,Yes,USA,NC,WNPF,,6/10/2018,USA,,,World Tournament Of Championships
1605,Jay Bakke,M,SBD,Multi-ply,53.0,50-54,50-59,M_MEM_3_AAPF,,,...,Yes,USA,,APF,WPC,17/7/2010,USA,MT,,Big Sky State Games


We can use the weight classes to fill in the first 2 missing bodyweight values. Both belong to open-ended (`+`) weight classes, so we assume the lifter weighs the minimum of that weight class.

Weight classes can be specified as a maximum or as a minimum. Maximums are specified by just the number, for example `90` means "up to (and including) 90kg." Minimums are specified by a `+` to the right of the number, for example `90+` means "above (and excluding) 90kg."

In [238]:
# Fill missing bodyweights with the WeightClassKg strings (converted to numbers in
# the next step). The result is assigned back to the column: fillna(inplace=True)
# on the df['BodyweightKg'] selection is chained assignment, which pandas warns
# about and which does not update df when Copy-on-Write is enabled.
df['BodyweightKg'] = df['BodyweightKg'].fillna(df['WeightClassKg'])

In [241]:
# Convert the weight-class strings copied into BodyweightKg to floats. Only a
# trailing '+' (open-ended class such as '140+') is removed; slicing off the last
# character unconditionally would turn a plain class such as '75' into 7.0.
df['BodyweightKg'] = df['BodyweightKg'].apply(
    lambda f: float(f.rstrip('+')) if isinstance(f, str) else f
)

In [243]:
# Re-check non-null counts after filling BodyweightKg from WeightClassKg
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2793 entries, 0 to 2999
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              2793 non-null   object 
 1   Sex               2793 non-null   object 
 2   Event             2793 non-null   object 
 3   Equipment         2793 non-null   object 
 4   Age               2793 non-null   float64
 5   AgeClass          2228 non-null   object 
 6   BirthYearClass    2114 non-null   object 
 7   Division          2793 non-null   object 
 8   BodyweightKg      2792 non-null   float64
 9   WeightClassKg     2780 non-null   object 
 10  Squat1Kg          1074 non-null   float64
 11  Squat2Kg          1066 non-null   float64
 12  Squat3Kg          1044 non-null   float64
 13  Squat4Kg          4 non-null      float64
 14  Best3SquatKg      1850 non-null   float64
 15  Bench1Kg          1696 non-null   float64
 16  Bench2Kg          1687 non-null   float64


The final missing bodyweight value belongs to a Male in the 50-54 AgeClass. We can use the median value of that AgeClass to fill the remaining missing value.

In [246]:
# Median of every numeric column per AgeClass. numeric_only=True keeps the
# result limited to numeric columns on every pandas version; on pandas >= 2.0
# the object columns (Name, Sex, ...) would otherwise raise a TypeError.
df.groupby(['AgeClass']).median(numeric_only=True)

Unnamed: 0_level_0,Age,BodyweightKg,Squat1Kg,Squat2Kg,Squat3Kg,Squat4Kg,Best3SquatKg,Bench1Kg,Bench2Kg,Bench3Kg,...,Deadlift1Kg,Deadlift2Kg,Deadlift3Kg,Deadlift4Kg,Best3DeadliftKg,TotalKg,Dots,Wilks,Glossbrenner,Goodlift
AgeClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13-15,14.5,58.7,98.75,96.5,83.75,65.0,104.33,52.5,57.5,37.5,...,108.75,118.75,101.05,-105.0,115.0,220.0,211.27,213.22,208.21,47.03
16-17,16.5,70.22,115.0,122.5,96.5,,126.005,70.0,62.5,-35.875,...,128.75,138.75,130.0,-226.25,137.5,300.0,274.87,273.4,256.49,53.13
18-19,18.5,73.9,137.5,147.5,112.5,,142.5,87.5,90.0,-60.0,...,170.0,173.75,125.0,-125.0,154.22,331.12,306.72,302.64,280.99,59.31
20-23,21.5,81.2,170.0,170.0,118.75,165.0,186.25,105.0,107.5,56.25,...,185.0,190.0,130.0,327.5,212.5,422.5,337.77,334.795,315.67,71.55
24-34,28.0,83.0,147.5,157.5,127.5,241.0,182.5,120.0,115.0,-55.0,...,190.0,190.0,126.25,125.0,210.0,320.0,299.4,298.43,270.62,73.91
35-39,37.0,86.91,140.0,117.5,117.5,,193.75,119.0,90.0,-52.5,...,177.5,156.25,142.5,137.5,210.0,267.5,181.2,181.59,173.64,71.165
40-44,42.0,91.9,165.0,158.76,117.5,,190.0,132.5,117.5,-67.5,...,200.0,203.75,120.0,,218.75,240.0,151.9,154.03,146.39,70.12
45-49,47.0,89.05,123.75,151.25,132.5,,182.5,130.0,105.0,-50.0,...,150.0,160.0,157.5,188.0,192.64,225.0,160.365,158.555,142.755,67.21
5-Dec,11.0,39.9,30.0,45.0,-38.75,,25.0,23.75,20.0,-22.5,...,42.215,63.5,-40.0,-86.18,35.0,35.0,44.49,46.74,46.35,21.615
50-54,52.0,82.45,150.0,120.0,82.5,,183.75,105.0,96.25,-57.5,...,185.0,200.0,133.27,,205.0,231.33,152.25,151.07,144.31,64.17


In [247]:
# 82.45 is the median BodyweightKg of the 50-54 AgeClass, taken from the groupby
# table above. The result is assigned back to the column: fillna(inplace=True) on
# the df.BodyweightKg selection is chained assignment and does not update df when
# Copy-on-Write is enabled.
df['BodyweightKg'] = df['BodyweightKg'].fillna(82.45)