In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the body-performance dataset and preview the first rows
csv_path = Path('Resources/bodyPerformance.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [3]:
# Collapse the four performance grades into a binary target:
# 'A'/'B' -> 0 and 'C'/'D' -> 1.
# An explicit map + astype is used instead of Series.replace: replace only
# produces an integer column through silent object-downcasting, which
# pandas 2.2 deprecates (FutureWarning). Values that are not grade letters
# are passed through unchanged, so re-running this cell on already-converted
# 0/1 labels is a no-op, while an unmapped grade letter makes astype raise
# instead of leaking a string into the target.
class_to_binary = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
df['class'] = (
    df['class']
    .map(lambda grade: class_to_binary.get(grade, grade))
    .astype('int64')
)
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,1
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,0
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,1
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,0
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,0


In [4]:
# Build the feature matrix: every column except the target, with the
# categorical gender column expanded into one-hot indicator columns
X = pd.get_dummies(df.drop(columns=['class']))
X.head()

Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,gender_F,gender_M
0,27.0,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,False,True
1,25.0,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,False,True
2,31.0,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,False,True
3,32.0,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,False,True
4,28.0,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,False,True


In [5]:
# Extract the binary class labels as an (n_samples, 1) column vector;
# selecting with a one-element column list keeps the result two-dimensional
y = df[['class']].to_numpy()
y[:5]

array([[1],
       [0],
       [1],
       [0],
       [0]])

In [6]:
# Partition features and labels into train/test subsets; the fixed seed
# keeps the partition identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)


In [7]:
# Create the StandardScaler instance with its default settings.
# It is only constructed here; fitting happens in a separate step.
scaler = StandardScaler()


In [8]:
# Fit the Standard Scaler with the training data only, so that statistics
# from the held-out test split do not influence the scaling
X_scaler = scaler.fit(X_train)

In [9]:
# Apply the fitted scaler to both the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


In [10]:
# Configure a 500-tree random forest; the fixed seed makes the fitted
# model repeatable from run to run
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [11]:
# Fit the model on the scaled training features.
# y_train is an (n_samples, 1) column vector, but scikit-learn classifiers
# expect a 1-D label array and emit a DataConversionWarning otherwise, so the
# labels are flattened with ravel() before fitting. ravel() leaves an
# already 1-D array unchanged, so this is safe either way.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

  rf_model = rf_model.fit(X_train_scaled, y_train)


In [12]:
# Get the feature importance array from the fitted random forest
# (read from the model's feature_importances_ attribute; requires that
# rf_model has already been fitted)
importances = rf_model.feature_importances_

In [13]:
# Pair every importance score with its feature name, then rank the
# (score, name) pairs from most to least important
importances_sorted = list(zip(rf_model.feature_importances_, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted

[(0.3043162055365098, 'sit and bend forward_cm'),
 (0.15642602921108012, 'sit-ups counts'),
 (0.09073595998458364, 'age'),
 (0.07648571994468843, 'broad jump_cm'),
 (0.07611650043467859, 'gripForce'),
 (0.07518252477168855, 'body fat_%'),
 (0.06741402062495232, 'weight_kg'),
 (0.05319316112482285, 'height_cm'),
 (0.04087101261767892, 'systolic'),
 (0.03668067878208286, 'diastolic'),
 (0.0116179172585974, 'gender_F'),
 (0.01096026970863661, 'gender_M')]