This notebook compares features to determine which are most important for classification.

The goal is to simplify the dataset by reducing its dimensionality. Logistic Regression is used as the model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from clean_data import HeartDisease

# Build the working frame: modeMissing() produces the initial frame and reduceSick()
# post-processes it. Both helpers live in clean_data and are opaque from here.
# NOTE(review): the summary below shows `num` ranging over 0..1, so reduceSick
# presumably collapses the label to binary -- confirm against clean_data.
df = HeartDisease.reduceSick(HeartDisease().modeMissing())
df.describe()

Unnamed: 0,age,sex,cp,testbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.663366,4.722772,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.934375,1.938383,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [2]:
# Separate the label column (`num`) from the predictor columns.
target = df['num']
features = df.drop(columns='num')

# Scale the predictors. scaleFeatures is defined in clean_data; per the original
# note it applies StandardScaler -- NOTE(review): confirm, and confirm its return type.
x_scaled = HeartDisease.scaleFeatures(features)

In [3]:
# Hold out a fixed fraction of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
from sklearn.model_selection import train_test_split

testSize = 0.2
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    target,
    random_state=1,
    test_size=testSize,
)

In [4]:
# Rank the features with an Extra Trees ensemble. The model is fit on x_train,
# i.e. the training split of the *scaled* features (x_scaled), so the held-out
# rows play no part in the ranking.
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees is randomised; fix the seed so the ranking (and every result derived
# from it) is reproducible on Restart & Run All.
et = ExtraTreesClassifier(random_state=1)
et.fit(x_train, y_train)

# Order the columns by importance (least important -> most important).
# argsort keeps every importance paired with its column position, so two features
# with the same importance are both retained; a dict keyed by the importance value
# would silently collapse them into one entry.
# Assumes the columns of x_train are in the same order as features.columns.
importances = et.feature_importances_
order = np.argsort(importances, kind='stable')
imp_ordered = importances[order]
col_by_importance = features.columns[order].tolist()
print('Least to most important features:', col_by_importance)


Least to most important features: ['fbs', 'restecg', 'sex', 'testbps', 'chol', 'slope', 'age', 'thalach', 'oldpeak', 'exang', 'cp', 'ca', 'thal']


In [5]:
# Fit logistic regression on progressively larger feature sets: start with only the
# most important feature and add one more (next most important) on every pass.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')

# The row partition depends only on the number of rows, test_size and random_state,
# so split once on the full frame and drop columns from the two pieces afterwards.
# NOTE(review): this uses the unscaled `features` frame rather than `x_scaled` --
# confirm that fitting on unscaled inputs is intended.
x_train_all, x_test_all, y_train_imp, y_test_imp = train_test_split(
    features, target, test_size=testSize, random_state=1)

for i in range(len(features.columns)-1, -1, -1):
  # Drop the i least important columns; what remains keeps its original column order.
  dropcols = col_by_importance[:i]
  x_train_imp = x_train_all.drop(dropcols, axis='columns')
  x_test_imp = x_test_all.drop(dropcols, axis='columns')

  model.fit(x_train_imp, y_train_imp)
  print(len(x_train_imp.columns), model.score(x_test_imp, y_test_imp))

# Each printed line shows how many features were included and the mean accuracy on
# the held-out test set (LogisticRegression.score returns accuracy, not R^2).

1 0.7704918032786885
2 0.7704918032786885
3 0.7868852459016393
4 0.8524590163934426
5 0.8360655737704918
6 0.8360655737704918
7 0.8360655737704918
8 0.8524590163934426
9 0.8524590163934426
10 0.8360655737704918
11 0.819672131147541
12 0.8360655737704918
13 0.8360655737704918
