# Logistic regression on the body data set (with validation)
Math 395 Learning from Large Data Sets 

Fall 2024, Hope College

In [1]:
import pandas as pd
import numpy as np
from scipy import linalg as LA
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 

## load the data set, separate response from features, and scale the features

In [2]:
# Read the whitespace-delimited body-measurement table. The file has no header
# row, so pandas labels the columns with integers starting at 0.
# The separator is a raw string so that `\s` reaches the regex engine intact
# instead of being parsed as an (invalid) Python string escape.
df = pd.read_table('bodydata.txt', header=None, sep=r'\s+')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,42.9,26.0,31.5,17.7,28.0,13.1,10.4,18.8,14.1,106.2,...,32.5,26.0,34.5,36.5,23.5,16.5,21.0,65.6,174.0,1
1,43.7,28.5,33.5,16.9,30.8,14.0,11.8,20.6,15.1,110.5,...,34.4,28.0,36.5,37.5,24.5,17.0,23.0,71.8,175.3,1
2,40.1,28.2,33.3,20.9,31.7,13.9,10.9,19.7,14.1,115.1,...,33.4,28.8,37.0,37.3,21.9,16.9,28.0,80.7,193.5,1
3,44.3,29.9,34.0,18.4,28.2,13.9,11.2,20.9,15.0,104.5,...,31.0,26.2,37.0,34.8,23.0,16.6,23.0,72.6,186.5,1
4,42.5,29.9,34.0,21.5,29.4,15.2,11.6,20.7,14.9,107.5,...,32.0,28.4,37.7,38.6,24.4,18.0,22.0,78.8,187.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,38.0,30.4,32.9,17.0,27.1,12.9,10.4,19.5,14.4,108.4,...,30.3,25.4,37.7,37.9,22.4,15.4,29.0,71.8,176.5,0
503,35.3,28.7,30.4,17.7,25.6,12.4,9.8,17.3,13.6,99.3,...,30.1,23.6,35.6,33.3,22.4,15.2,21.0,55.5,164.4,0
504,34.7,24.9,24.7,17.3,24.2,12.0,10.2,18.0,13.6,91.9,...,27.4,24.0,34.4,34.1,21.2,15.5,33.0,48.6,160.7,0
505,38.5,29.0,32.9,15.3,25.6,12.0,9.8,18.6,13.3,107.1,...,30.6,24.9,38.4,36.6,22.0,15.5,33.0,66.4,174.0,0


In [3]:
# Response: column 24 holds the binary sex label.
y = df[24].to_numpy()

# Feature matrix: every column except the response (column 24).
X = df.drop(columns=[24])

# Standardize each feature to zero mean and unit variance.
# NOTE(review): the scaler here is fitted on all rows, i.e. before any
# train/test partition; fit it on the training rows only if a leakage-free
# validation estimate is required.
X_norm = StandardScaler().fit_transform(X)

X.shape, y.shape

((507, 24), (507,))

## randomly partition the data into training (80%) and test (20%) sets (the held-out 20% serves as the validation set below)

In [4]:
from sklearn.model_selection import train_test_split

# Partition the raw (unscaled) features first, so that the held-out rows play
# no part in computing the standardization statistics. The fixed random_state
# keeps the partition reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=14
)

# Fit the scaler on the training rows only, then apply that same transform to
# the held-out rows (avoids leaking test-set means/variances into training).
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Sample counts, used as denominators for the error rates computed later.
n_train = X_train.shape[0]
n_test = X_test.shape[0]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((405, 24), (405,), (102, 24), (102,))

## Compute training and validation errors of different logistic regression models 

### sex ~ thigh girth + forearm girth

In [5]:
# Column positions of the two predictors used by this model (thigh girth and
# forearm girth, per the section heading). Named once to avoid repeating the
# magic index list.
cols_thigh_forearm = [14, 16]

# C is scikit-learn's inverse regularization strength; a large value such as
# 1e5 makes the penalty very weak.
logisticModel_thigh_forearm = LogisticRegression(C=1e5).fit(
    X_train[:, cols_thigh_forearm], y_train
)

# Error = fraction of samples whose predicted label differs from the true one.
# Comparing with `!=` does not rely on the labels being coded as 0/1.
pred_train = logisticModel_thigh_forearm.predict(X_train[:, cols_thigh_forearm])
train_error_thigh_forearm = np.mean(pred_train != y_train)
print('training error = ', train_error_thigh_forearm)

pred_test = logisticModel_thigh_forearm.predict(X_test[:, cols_thigh_forearm])
test_error_thigh_forearm = np.mean(pred_test != y_test)
print('validation error = ', test_error_thigh_forearm)

training error =  0.024691358024691357
validation error =  0.049019607843137254


In [6]:
from sklearn.metrics import confusion_matrix

# Confusion matrices (rows: true label, columns: predicted label) for the
# thigh + forearm model on the training split and on the held-out split.
cm_train = confusion_matrix(y_train, pred_train)
cm_test = confusion_matrix(y_test, pred_test)

cm_train, cm_test

(array([[203,   4],
        [  6, 192]]),
 array([[51,  2],
        [ 3, 46]]))

### sex ~ height

### sex ~ height+weight

### sex ~ all the features

### $\ell_1$-regularized logistic regression with all features ($C=0.1$)

## plot training and validation errors together