Trying to classify heart disease data without reducing target to boolean (i.e. modeling the complete dataset)

In [1]:
import pandas as pd
import numpy as np
from clean_data import HeartDisease

df = HeartDisease().modeMissing()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      303 non-null    float64
 1   sex      303 non-null    float64
 2   cp       303 non-null    float64
 3   testbps  303 non-null    float64
 4   chol     303 non-null    float64
 5   fbs      303 non-null    float64
 6   restecg  303 non-null    float64
 7   thalach  303 non-null    float64
 8   exang    303 non-null    float64
 9   oldpeak  303 non-null    float64
 10  slope    303 non-null    float64
 11  ca       303 non-null    float64
 12  thal     303 non-null    float64
 13  num      303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [10]:
# Split train/test sets with 20% testing and perform logistic regression
x = df.drop(columns='num')
y = df.num

from sklearn.model_selection import train_test_split
test_size = 0.2
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=2)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear')
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.639344262295082

In [11]:
# Let's try xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=5)
xgb.fit(x_train, y_train)
xgb.score(x_test, y_test)

0.6557377049180327

In [14]:
# What if we scale x?
x_scaled = HeartDisease.scaleFeatures(x)
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(x_scaled, y, test_size=test_size, random_state=2)

from sklearn.metrics import confusion_matrix
lr.fit(x_train_scaled, y_train_scaled)
print(lr.score(x_test_scaled, y_test_scaled))
y_pred_lr = lr.predict(x_test_scaled)
print(confusion_matrix(y_pred_lr, y_test_scaled))

xgb.fit(x_train_scaled, y_train_scaled)
print(xgb.score(x_test_scaled, y_test_scaled))
y_pred_xgb = xgb.predict(x_test_scaled)
print(confusion_matrix(y_pred_xgb, y_test_scaled))

0.6721311475409836
[[35  7  2  0  0]
 [ 0  2  0  0  1]
 [ 1  2  0  0  0]
 [ 1  2  3  4  0]
 [ 0  0  0  1  0]]
0.6557377049180327
[[34  6  1  0  0]
 [ 1  2  1  1  1]
 [ 0  1  2  0  0]
 [ 2  3  0  2  0]
 [ 0  1  1  2  0]]
