In [1]:
%matplotlib inline

In [2]:
from utils import *

# Dataset

#### South African Heart Disease Dataset

https://web.stanford.edu/~hastie/ElemStatLearn/data.html

A retrospective sample of males in a heart-disease high-risk region
of the Western Cape, South Africa. There are roughly two controls per
case of CHD. Many of the CHD positive men have undergone blood
pressure reduction treatment and other programs to reduce their risk
factors after their CHD event. In some cases the measurements were
made after these treatments. These data are taken from a larger
dataset, described in Rousseauw et al., 1983, South African Medical
Journal.

In [None]:
# Fetch the SAheart table straight from the ElemStatLearn site (needs network access)
saheart_url = 'https://web.stanford.edu/~hastie/ElemStatLearn/datasets/SAheart.data'
heart_disease_data = pd.read_csv(saheart_url)

# Preview the first five rows (the default for head())
heart_disease_data.head()

In [None]:
# Human-readable labels, applied positionally: the 11 entries must follow the
# column order of the loaded frame exactly (pandas raises if the count differs).
columns = [
    'Patient ID',
    'Systolic Blood Pressure', 'Tobacco (kg)', 'LDL', 'Adiposity',
    'Fam. Hist', 'Type A', 'Obesity', 'Alcohol', 'Age',
    'Coronary Heart Disease',
]

heart_disease_data.columns = columns
heart_disease_data.head()

In [None]:
# Label column the classifier is trained to predict
target = 'Coronary Heart Disease'

# Identifier column, carried along but not used as a predictor
patient_id = 'Patient ID'

# The two predictor columns used throughout the notebook
f1, f2 = 'Age', 'Systolic Blood Pressure'

Take a random sample of 100 instances to form the training dataset.

In [None]:
# Draw a reproducible 100-row training sample, keep only the columns used
# below, and convert the label to a proper boolean.
# Everything is built in a single chain: the cell is safe to re-run and never
# assigns into a frame derived from `heart_disease_data`, which avoids pandas'
# SettingWithCopyWarning. `astype(bool)` is the vectorised truthiness cast
# (falsy values -> False, everything else -> True).
data = (
    heart_disease_data
    .sample(100, random_state=43)[[patient_id, f1, f2, target]]
    .assign(**{target: lambda frame: frame[target].astype(bool)})
)

In [None]:
# Show the training sample ordered by the second feature (returns a sorted copy)
data.sort_values(by=f2)

# Train a first simple ML model

In [None]:
from sklearn.linear_model import LogisticRegression

# Feature matrix (the two predictor columns) and label vector
X, y = data[[f1, f2]], data[target]

# Fit on the plain NumPy array (no column labels); later cells pass the same
# array form to the model for prediction and evaluation.
linear_model = LogisticRegression(solver='lbfgs')
linear_model.fit(X.to_numpy(), y)

In [None]:
# Presumably draws the training sample in the (f1, f2) plane together with the
# fitted model's decision boundary. NOTE(review): the helper is not defined in
# this notebook -- it most likely comes from `from utils import *`; confirm its
# signature and behaviour there.
plot_data_and_decision_boundary(data, f1, f2, target, linear_model)

## Look at the parameters

In [None]:
# w: the learned weights, one coefficient per input feature (order: f1, f2)
linear_model.coef_

In [None]:
# b: the learned intercept (bias) term that is added to the weighted feature sum
linear_model.intercept_

## Prediction on unseen cases

In [None]:
def logit(v): return 1 / (1 + np.exp(-v))

In [None]:
# Example unseen case, in the same feature order the model was trained on
# (f1 = age 62, f2 = systolic blood pressure 158).
new_case = np.array([62, 158])

# Linear score w . x + b, then squashed into a probability
linear_score = linear_model.coef_.dot(new_case) + linear_model.intercept_
logit(linear_score)

## Evaluation

For simplicity, we will evaluate the model on the training set, which is not a good indicator of future performance.
Next week, we will talk about test sets and model selection.

In [None]:
# Hard class predictions for the same rows the model was trained on
y_pred = linear_model.predict(X.to_numpy())
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Plot the confusion matrix of true vs. predicted labels. The display object
# is bound to `cm_display` rather than `display`, so IPython's built-in
# `display()` function stays usable in later cells.
cm_display = ConfusionMatrixDisplay.from_predictions(y, y_pred)

# Raw count matrix (rows: true class, columns: predicted class).
# NOTE: this rebinding shadows the sklearn `confusion_matrix` function imported
# above; the name is kept because the next cell indexes the array under it.
confusion_matrix = cm_display.confusion_matrix

In [None]:
# Unpack the 2x2 count matrix in row-major order. Rows are the true class and
# columns the predicted class, so the flattened order is TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix.ravel()

## Accuracy

In [None]:
# Accuracy = correctly classified cases / all cases
correct = tp + tn
total = tp + tn + fp + fn
correct / total

In [None]:
from sklearn.metrics import accuracy_score

# Cross-check the hand-computed accuracy against scikit-learn
accuracy_score(y_true=y, y_pred=y_pred)

## Precision

In [None]:
# Precision = true positives / everything the model predicted as positive
predicted_positive = tp + fp
tp / predicted_positive

In [None]:
from sklearn.metrics import precision_score

# Cross-check the hand-computed precision against scikit-learn
precision_score(y_true=y, y_pred=y_pred)

## Recall

In [None]:
# Recall = true positives / all cases that are actually positive
actual_positive = tp + fn
tp / actual_positive

In [None]:
from sklearn.metrics import recall_score

# Cross-check the hand-computed recall against scikit-learn
recall_score(y_true=y, y_pred=y_pred)

## F1 score

In [None]:
# F1 is the harmonic mean of precision and recall
precision, recall = tp / (tp + fp), tp / (tp + fn)
2 * precision * recall / (precision + recall)

In [None]:
from sklearn.metrics import f1_score

# Cross-check the hand-computed F1 score against scikit-learn
f1_score(y_true=y, y_pred=y_pred)

## ROC Curve

In [None]:
from sklearn.metrics import RocCurveDisplay

In [None]:
# ROC curve of the fitted model, evaluated on the training data
RocCurveDisplay.from_estimator(estimator=linear_model, X=X.to_numpy(), y=y)