# PART 1: DEPENDENCIES & LOADING DATA





In [153]:
# Importing dependencies 

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 



In [154]:
# Reading CSV file from the Resources folder into a pandas DataFrame 

file_path = Path("Resources/bodyPerformance.csv")

body_performance_df = pd.read_csv(file_path)

body_performance_df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


# PART 2: CLEANING DATA 

In [155]:
# Datatypes 

body_performance_df.dtypes

age                        float64
gender                      object
height_cm                  float64
weight_kg                  float64
body fat_%                 float64
diastolic                  float64
systolic                   float64
gripForce                  float64
sit and bend forward_cm    float64
sit-ups counts             float64
broad jump_cm              float64
class                       object
dtype: object

In [156]:
# Unique values 
body_performance_df.nunique()

age                          44
gender                        2
height_cm                   467
weight_kg                  1398
body fat_%                  527
diastolic                    89
systolic                    102
gripForce                   550
sit and bend forward_cm     528
sit-ups counts               81
broad jump_cm               245
class                         4
dtype: int64

In [157]:
# Dropping unwanted columns
body_performance_df.drop(columns=['gripForce', 'sit and bend forward_cm', 'sit-ups counts', 'broad jump_cm'], inplace=True)

In [158]:
# Renaming columns 
body_performance_df.rename(columns={
    'age': 'Age',
    'gender': 'Gender',
    'height_cm': 'Height (cm)',
    'weight_kg': 'Weight (Kg)',
    'body fat_%': 'Body Fat %',
    'diastolic': 'Diastolic BP',
    'systolic': 'Systolic BP',
    'class': 'Classification'
}, inplace=True)

body_performance_df.head()
                            

Unnamed: 0,Age,Gender,Height (cm),Weight (Kg),Body Fat %,Diastolic BP,Systolic BP,Classification
0,27.0,M,172.3,75.24,21.3,80.0,130.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,B


# PART 3: CREATE LABELS SET (y) AND FEATURES (X)

In [159]:
# Creating new DataFrame for transformed non-numeric data to numeric, Boolean data. 

body_performance_numeric_df = pd.get_dummies(body_performance_df, columns=['Gender'], dtype=int)

# Displaying the first few rows
body_performance_numeric_df.head()

Unnamed: 0,Age,Height (cm),Weight (Kg),Body Fat %,Diastolic BP,Systolic BP,Classification,Gender_F,Gender_M
0,27.0,172.3,75.24,21.3,80.0,130.0,C,0,1
1,25.0,165.0,55.8,15.7,77.0,126.0,A,0,1
2,31.0,179.6,78.0,20.1,92.0,152.0,C,0,1
3,32.0,174.5,71.1,18.4,76.0,147.0,B,0,1
4,28.0,173.8,67.7,17.1,70.0,127.0,B,0,1


In [160]:
# Initiating the LabelEncoder

le = LabelEncoder()

# Applying LabelEncoder to the classification column (the labels)

body_performance_numeric_df['Classification'] = le.fit_transform(body_performance_numeric_df['Classification'].values)

In [161]:
# Separating data into labels and features 

y = body_performance_numeric_df['Classification']

X = body_performance_numeric_df.drop(columns=['Classification'])

In [162]:
# Classification column contains: A, B, C, D
# A=Great_Health B=Good_Health C=Average_Health D=Need_more_training 
y.head()

0    2
1    0
2    2
3    1
4    1
Name: Classification, dtype: int32

In [163]:
y.value_counts()

Classification
2    3349
3    3349
0    3348
1    3347
Name: count, dtype: int64

In [164]:
X.head()

Unnamed: 0,Age,Height (cm),Weight (Kg),Body Fat %,Diastolic BP,Systolic BP,Gender_F,Gender_M
0,27.0,172.3,75.24,21.3,80.0,130.0,0,1
1,25.0,165.0,55.8,15.7,77.0,126.0,0,1
2,31.0,179.6,78.0,20.1,92.0,152.0,0,1
3,32.0,174.5,71.1,18.4,76.0,147.0,0,1
4,28.0,173.8,67.7,17.1,70.0,127.0,0,1


In [165]:
X.value_counts()

Age   Height (cm)  Weight (Kg)  Body Fat %  Diastolic BP  Systolic BP  Gender_F  Gender_M
27.0  157.0        49.10        30.7        70.0          86.0         1         0           2
21.0  148.0        42.00        22.8        62.0          104.0        1         0           1
42.0  165.1        54.40        14.5        82.0          146.0        0         1           1
      162.8        60.10        27.4        82.0          129.0        1         0           1
      163.0        57.60        24.1        98.0          148.0        1         0           1
                                                                                            ..
27.0  166.4        67.70        22.2        70.0          134.0        0         1           1
      166.5        58.50        19.7        79.0          125.0        1         0           1
                   71.92        37.7        78.0          133.0        1         0           1
      166.6        66.40        37.8        85.0       

# PART 4: SPLIT DATA INTO TRAINING AND TESTING SETS

In [166]:
# Splitting data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.shape

(10044, 8)

# PART 5: LOGISTIC REGRESSION MODEL 

In [167]:
# Scaling the features using the StandardScaler method
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiating the Logistic Regression Model 
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fitting the Logistic Regression Model using the scaled data 
classifier.fit(X_train_scaled, y_train)

In [168]:
# Making prediction using the testing subset "X_test"

predictions = classifier.predict(X_test)
prediction_results = pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).reset_index(drop=True)

prediction_results.head()



Unnamed: 0,Prediction,Actual
0,3,3
1,3,0
2,3,2
3,3,2
4,3,3


# PART 6: CONFUSION MATRIX 

In [172]:
# Generating a confusion matrix 

matrix_data = confusion_matrix(y_test, predictions)

pd.DataFrame(matrix_data, columns=["Actual 0", "Actual 1", "Actual 2", "Actual 3"], index=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"])

Unnamed: 0,Actual 0,Actual 1,Actual 2,Actual 3
Predicted 0,0,0,4,853
Predicted 1,0,0,1,823
Predicted 2,0,0,0,800
Predicted 3,0,0,1,867


In [173]:
matrix_data

array([[  0,   0,   4, 853],
       [  0,   0,   1, 823],
       [  0,   0,   0, 800],
       [  0,   0,   1, 867]], dtype=int64)

# PART 7: CLASSIFICATION REPORT

In [174]:
target_names = ["Excellent Health", "Good Health", "Average Health", "Need more training"]
print(classification_report(y_test, predictions, target_names=target_names))

                    precision    recall  f1-score   support

  Excellent Health       0.00      0.00      0.00       857
       Good Health       0.00      0.00      0.00       824
    Average Health       0.00      0.00      0.00       800
Need more training       0.26      1.00      0.41       868

          accuracy                           0.26      3349
         macro avg       0.06      0.25      0.10      3349
      weighted avg       0.07      0.26      0.11      3349



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
