In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
print(os.listdir("data"))

['heart.csv']


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Data splitting/parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


# ML models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Feature processing
from sklearn.feature_selection import SelectPercentile, chi2

# Evaluation metrics
from sklearn.metrics import confusion_matrix

In [7]:
heart_path = "data/heart.csv"
heart_data = pd.read_csv(heart_path)

In [9]:
heart_data = pd.get_dummies(heart_data)
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [10]:
# Getting features and target
X = heart_data.drop(["target"], axis=1)
y = heart_data["target"]

In [12]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
rf_predictions = cross_val_predict(rf_model, X, y, cv=5)
print(confusion_matrix(y, rf_predictions))
rf_scores = cross_val_score(rf_model, X, y, scoring="recall", cv=5)
print("recall:", rf_scores.mean())

[[110  28]
 [ 26 139]]
recall: 0.8484848484848484


Focus on SVM¶
Besides that, SVM is showing the most interesting results. Due to the fact that is already doing a great job on false negatives (but having a hard time knowing when someone is not sick). However I think it is a good idea to choose SVM as main model.

Feature scaling
We are going to go further using SVM, so lets apply feature scaling to our data. That should make our SVM model better. The type of scaling we will apply is called MinMaxScaler in sklearn.

It is important to remember the fact that this is a really good practice to do if we want

In [13]:
X["age"] = X["age"].map(lambda x: (x - X["age"].min()) / (X["age"].max() - X["age"].min()))
X["trestbps"] = X["trestbps"].map(lambda x: (x - X["trestbps"].min()) / (X["trestbps"].max() - X["trestbps"].min()))
X["chol"] = X["chol"].map(lambda x: (x - X["chol"].min()) / (X["chol"].max() - X["chol"].min()))
X["thalach"] = X["thalach"].map(lambda x: (x - X["thalach"].min()) / (X["thalach"].max() - X["thalach"].min()))
X["oldpeak"] = X["oldpeak"].map(lambda x: (x - X["oldpeak"].min()) / (X["oldpeak"].max() - X["oldpeak"].min()))

In [24]:
# Support Vector Machine
svc_model = SVC(gamma="auto")
svc_predictions = cross_val_predict(svc_model, X, y, cv=5)
print(confusion_matrix(y, svc_predictions))
svc_scores = cross_val_score(svc_model, X, y, scoring="recall", cv=5)
print("recall:", svc_scores.mean())

[[ 99  39]
 [ 10 155]]
recall: 0.9393939393939394
