This notebook compares the performance of the following classification models:
  1. Logistic Regression
  2. SVM
  3. Decision Tree
  4. Random Forest
  5. GaussianNB
  6. MultinomialNB
  7. BernoulliNB
  8. XG Boost

Each model has tuned hyperparameters, if applicable. The comparison uses GridSearchCV with 5 sampling splits. For now, the target was reduced to values 0 (Healthy) and 1 (Sick) rather than the expanded Sick classification.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from clean_data import HeartDisease

df = HeartDisease().modeMissing()
df = HeartDisease.reduceSick(df)
df.describe()

Unnamed: 0,age,sex,cp,testbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.663366,4.722772,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.934375,1.938383,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [2]:
# Define features and target
features = df.drop('num', axis='columns')
target = df['num']

In [3]:
# Define models and their parameters
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from xgboost import XGBClassifier

models = {
  'logistic_regression': {
    'model': LogisticRegression(solver='liblinear'),
    'params': {
      'C': [1, 5, 10] # Default: 1
    }
  }, 
  'svm': {
    'model': SVC(),
    'params': {
      'C': [1, 5, 10], # Default: 1
      'kernel': ['rbf', 'linear', 'sigmoid'], # Default: rbf
      'gamma': ['scale', 'auto'] # Default: scale
    }
  },
  'decision_tree': {
    'model': DecisionTreeClassifier(),
    'params': {
      'criterion': ['gini', 'entropy'] # Default: gini
    }
  },
  'random_forest': {
    'model': RandomForestClassifier(),
    'params': {
      'n_estimators': [50, 100, 200], # Default: 100
      'criterion': ['gini', 'entropy'] # Default: gini
    }
  },
  'gaussian_nb': {
    'model': GaussianNB(),
    'params': {
      # No params
    }
  },
  'multinomial_nb': {
    'model': MultinomialNB(),
    'params': {
      # No params
    }
  },
  'bernoulli_nb': {
    'model': BernoulliNB(),
    'params': {
      # No params
    }
  },
  'xg_boost': {
    'model': XGBClassifier(),
    'params': {
      'n_estimators': [50, 100, 200],
    }
  }
}

In [4]:
# Use GridSearchCV for each model and report best scores
from sklearn.model_selection import GridSearchCV
scores = []
for model, props in models.items():
  gs = GridSearchCV(
    props['model'],
    props['params'],
    cv=5,
    return_train_score=False
  )
  gs.fit(features, target)
  scores.append({
    'model': model,
    'best_score': gs.best_score_,
    'best_params': gs.best_params_
  })

# Report scores as a dataframe
results = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
results

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.831585,{'C': 1}
1,svm,0.818361,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}"
2,decision_tree,0.765574,{'criterion': 'gini'}
3,random_forest,0.838087,"{'criterion': 'gini', 'n_estimators': 100}"
4,gaussian_nb,0.831639,{}
5,multinomial_nb,0.758962,{}
6,bernoulli_nb,0.798743,{}
7,xg_boost,0.801749,{'n_estimators': 100}
