In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Read the body-performance CSV (path is relative to the notebook's working
# directory) and print a plain-text preview of the first five rows.
csv_path = '../data/bodyPerformance.csv'
df = pd.read_csv(csv_path)
preview = df.head()
print(preview)

    age gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0      M      172.3      75.24        21.3       80.0     130.0   
1  25.0      M      165.0      55.80        15.7       77.0     126.0   
2  31.0      M      179.6      78.00        20.1       92.0     152.0   
3  32.0      M      174.5      71.10        18.4       76.0     147.0   
4  28.0      M      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm class  
0       54.9                     18.4            60.0          217.0     C  
1       36.4                     16.3            53.0          229.0     A  
2       44.8                     12.0            49.0          181.0     C  
3       41.4                     15.2            53.0          219.0     B  
4       43.5                     27.1            45.0          217.0     B  


In [6]:
# Derived features, each a ratio of two existing columns (added in place).

# BMI = weight / height^2, with height_cm divided by 100 before squaring.
df['BMI'] = df['weight_kg'].div(df['height_cm'].div(100).pow(2))

# Body-fat percentage relative to the BMI computed just above.
df['bodyFatRatio'] = df['body fat_%'].div(df['BMI'])

# Grip force and broad-jump distance scaled by body weight / height.
df['strength_ration'] = df['gripForce'].div(df['weight_kg'])
df['jump_ratio'] = df['broad jump_cm'].div(df['height_cm'])

# Sit-up count per year of age, and forward-bend reach relative to height.
df['situp_ratio'] = df['sit-ups counts'].div(df['age'])
df['flexibility_ratio'] = df['sit and bend forward_cm'].div(df['height_cm'])

# Show the first rows including the new columns.
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class,BMI,bodyFatRatio,strength_ration,jump_ratio,situp_ratio,flexibility_ratio
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,25.344179,0.84043,0.729665,1.259431,2.222222,0.10679
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,20.495868,0.766008,0.65233,1.387879,2.12,0.098788
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,24.181428,0.831216,0.574359,1.007795,1.580645,0.066815
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,23.349562,0.788023,0.582278,1.255014,1.65625,0.087106
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,22.412439,0.762969,0.642541,1.248562,1.607143,0.155926


In [15]:
# Clean the feature table: drop incomplete rows, integer-encode gender, and
# keep only rows whose numeric columns are all strictly positive.

# Coerce BMI to a numeric dtype; unparseable entries become NaN so the
# dropna() below removes them.
df['BMI'] = pd.to_numeric(df['BMI'], errors='coerce')

# Drop incomplete rows BEFORE encoding, so that a row with a missing gender
# is removed here instead of being handed to LabelEncoder.
df = df.dropna().reset_index(drop=True)

# Integer-encode gender (LabelEncoder assigns codes in sorted class order).
# `le` is kept so the mapping can be inspected via le.classes_.
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

# Keep rows where every column except the encoded gender and the target is
# strictly positive.
# NOTE(review): this also removes rows whose 'sit and bend forward_cm' or
# 'sit-ups counts' is zero or negative — confirm that is intended.
cols_to_check = df.columns.drop(['gender', 'class'])
df = df[(df[cols_to_check] > 0).all(axis=1)].reset_index(drop=True)
df.head(10)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class,BMI,bodyFatRatio,strength_ration,jump_ratio,situp_ratio,flexibility_ratio
0,27.0,1,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,25.344179,0.84043,0.729665,1.259431,2.222222,0.10679
1,25.0,1,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,20.495868,0.766008,0.65233,1.387879,2.12,0.098788
2,31.0,1,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,24.181428,0.831216,0.574359,1.007795,1.580645,0.066815
3,32.0,1,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,23.349562,0.788023,0.582278,1.255014,1.65625,0.087106
4,28.0,1,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,22.412439,0.762969,0.642541,1.248562,1.607143,0.155926
5,36.0,0,165.4,55.4,22.0,64.0,119.0,23.8,21.0,27.0,153.0,B,20.25064,1.086385,0.429603,0.92503,0.75,0.126965
6,42.0,0,164.5,63.7,32.2,72.0,135.0,22.7,0.8,18.0,146.0,D,23.540063,1.367881,0.356358,0.887538,0.428571,0.004863
7,33.0,1,174.9,77.2,36.9,84.0,137.0,45.9,12.3,42.0,234.0,B,25.236997,1.462139,0.59456,1.337907,1.272727,0.070326
8,54.0,1,166.8,67.5,27.6,85.0,165.0,40.4,18.6,34.0,148.0,C,24.261167,1.13762,0.598519,0.88729,0.62963,0.111511
9,28.0,1,185.0,84.6,14.4,81.0,156.0,57.9,12.1,55.0,213.0,B,24.718773,0.582553,0.684397,1.151351,1.964286,0.065405


In [16]:
# Separate the target column ('class') from the feature matrix.
y = df['class']
X = df.drop(columns='class')

In [17]:
# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics are used in fitting.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)