In [1]:
import pandas as pd
import numpy as np

from lib.logistic_regression_scratch import LogisticRegressionScratch

In [2]:
# Load the body-performance dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/bodyPerformance.csv")

In [3]:
# Preview the first five rows to check the column names and raw values.
data.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [4]:
# Encode the two categorical columns as integers so they can be used in
# correlations and as model inputs/targets.
gender_codes = {'F': 0, 'M': 1}                  # 0 - female, 1 - male
class_codes = {"A": 3, "B": 2, "C": 1, "D": 0}   # A(3) is the best, D(0) the worst

# A plain `.map(dict)` turns every value that is not a key of the dict into
# NaN, so running this cell a second time (on already-encoded columns) would
# silently wipe both columns. Passing unknown values through unchanged makes
# the cell safe to re-run.
data['gender'] = data['gender'].map(lambda value: gender_codes.get(value, value))
data['class'] = data['class'].map(lambda value: class_codes.get(value, value))

In [5]:
# Preview again to confirm that gender and class now hold integer codes.
data.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,1,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,1
1,25.0,1,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,3
2,31.0,1,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,1
3,32.0,1,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,2
4,28.0,1,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,2


In [6]:
# Correlation of every column with the encoded target column `class`.
data.corr().loc[:, 'class']

age                       -0.065612
gender                    -0.075605
height_cm                 -0.037753
weight_kg                 -0.214129
body fat_%                -0.341956
diastolic                 -0.066761
systolic                  -0.035484
gripForce                  0.136088
sit and bend forward_cm    0.588123
sit-ups counts             0.452832
broad jump_cm              0.262154
class                      1.000000
Name: class, dtype: float64

The columns most strongly correlated with `class` are __sit and bend forward_cm__ and __sit-ups counts__.

- Sit and bend represents the body's degree of flexibility, which helps in various exercises.
- Sit-ups represent leg power; because humans stand and move on two legs, leg power influences how fast they can move and how stable they can stay.

### Plan 
As before, I will split the homework into two parts: one using only the correlated columns and one using the raw data.

# Correlated values
Columns whose correlation with `class` is greater than 0.4.

In [7]:
# Create a subset holding only the features whose correlation with `class`
# is greater than the threshold, plus the target column itself.
# The selection is derived from the correlations instead of hard-coded column
# names, so the code always matches the stated rule.

CORR_THRESHOLD = 0.4

# Correlation of each feature with the target; the target's own entry
# (always 1.0) is removed so it is not picked up as a "feature".
class_corr = data.corr()['class'].drop('class')
correlated_features = class_corr[class_corr > CORR_THRESHOLD].index.tolist()
assert correlated_features, "no feature exceeds CORR_THRESHOLD"

# Selecting by label keeps the original column order; .copy() detaches the
# subset from `data` so later edits to one do not affect the other.
correlated_data = data[correlated_features + ['class']].copy()

In [8]:
# Split the correlated subset into training and test sets.
# NOTE: X, X_test, y and y_test are reassigned later in the "Raw data"
# section, so the cells of this section must run before that one.

from sklearn.model_selection import train_test_split

# Target vector and feature matrix as NumPy arrays.
y_corr = correlated_data['class'].to_numpy()
X_corr = correlated_data.drop(columns=['class']).to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X, X_test, y, y_test = train_test_split(
    X_corr,
    y_corr,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn logistic regression fitted on the
# correlated-feature training split, with a generous iteration cap.
model_corr_sklearn = LogisticRegression(max_iter=10_000)

model_corr_sklearn.fit(X=X, y=y)

In [10]:
# From-scratch implementation, trained on the same correlated-feature split.
# NOTE(review): the features are not scaled and the step size is very small
# (1e-7 over 1000 iterations); the accuracy reported further down (~0.24)
# suggests the model barely learns. Presumably the scratch class must also
# support the four target classes — TODO confirm in lib.logistic_regression_scratch.

model_corr_scratch = LogisticRegressionScratch(
    max_iter=1000,
    learning_rate=1e-7,
)

model_corr_scratch.fit(X, y)

<lib.logistic_regression_scratch.LogisticRegressionScratch at 0x7efd60c66300>

In [11]:
# Accuracy of the sklearn model on the held-out correlated-feature split.

from sklearn.metrics import accuracy_score

y_pred = model_corr_sklearn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


0.5288700846192136

In [12]:
# Accuracy of the from-scratch model on the same held-out split.

y_pred = model_corr_scratch.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.24315579890492783

# Raw data

In [13]:
# Split the full dataset (all feature columns) into training and test sets.
# This overwrites the X, X_test, y and y_test of the correlated section.

features_raw = data.drop(columns=["class"]).to_numpy()
labels_raw = data["class"].to_numpy()

# Same 70/30 split and seed as used for the correlated subset.
X, X_test, y, y_test = train_test_split(
    features_raw,
    labels_raw,
    test_size=0.3,
    random_state=42,
)


In [14]:
# Train scikit-learn's logistic regression on all raw features.

clf = LogisticRegression(max_iter=10_000)

clf.fit(X=X, y=y)

In [15]:
# Train the from-scratch implementation on all raw features.
# Only the learning rate is set here; max_iter is left at the class default.

scratch = LogisticRegressionScratch(
    learning_rate=1e-7,
)

scratch.fit(X, y)

<lib.logistic_regression_scratch.LogisticRegressionScratch at 0x7efd5fdcabd0>

In [16]:
# Accuracy of the sklearn model on the held-out raw-feature split.

y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6217023394723743

In [17]:
# Accuracy of the from-scratch model on the held-out raw-feature split.
y_pred = scratch.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.24116475858636138

### Conclusion

|                    | accuracy |
| ------------------ | -------- |
| scratch_correlated | 0.24     |
| sklearn_correlated | 0.53     |
| scratch_raw        | 0.24     |
| sklearn_raw        | 0.62     |

I don't know why the scratch implementation has such poor results; I tried to fix it, but I could not find what is wrong. Its accuracy of about 0.24 is roughly what random guessing among four classes would give. Judging by the sklearn results, however, the correlated columns alone are not enough to predict with the best accuracy.