# Import libraries and set up default plot params

#### Note: this cell sets the path from which the data are loaded, and to which all figures are saved, to your current working directory (`cwd`).
#### If you want to load from/save to a different path, edit the `path`.

In [None]:
# Import libraries
import sys
import os

# Base directory for loading the data and saving figures: the current working directory
path = os.getcwd()

from collections import Counter

import matplotlib
import matplotlib.pyplot as plt

import numpy as np

import pandas as pd

import scipy.stats
from scipy.stats import binned_statistic_2d

# Import machine learning libraries
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, make_scorer
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline

# Use 16-point tick labels on both axes for every figure created below
matplotlib.rcParams.update({'xtick.labelsize': 16, 'ytick.labelsize': 16})

# Read in the data

In [None]:
# Read the csv file located in `path` into a DataFrame.
# os.path.join inserts the correct separator and avoids a doubled '/' when
# `path` already ends with one.
df = pd.read_csv(os.path.join(path, 'haberman.data.csv'))

#### Divide the columns into inputs (age, year, nodes) and outputs (state), and encode the output/target variable (y) to have values 0 and 1

In [None]:
# Work on the raw array of values held by the DataFrame
data = df.values

# Every column except the last is an input; the last column is the target
X = data[:, :-1]
y = data[:, -1]

# Re-label the target classes so that they take the values 0 and 1
y = LabelEncoder().fit_transform(y)


# Functions for evaluating the skill of the model

#### Use the Brier score, which calculates the mean squared error between the model predicted probabilities and the probabilities expected from the reference dataset
We calculate the reference dataset Brier score, where per_pos represents the expected baseline performance for the predictive model, and the model Brier score

We then calculate the model skill score, by comparing the Brier score for the reference and the model
A Brier score of 0.0 is a perfect score, so we convert it to a skill score, 1 - (model Brier score / reference Brier score), for which 1.0 is a perfect score and 0.0 means the model performs exactly as well as the reference

In [None]:
# Calculate Brier skill score (BSS)
# Use as a metric for evaluating the skill of the model based on the returned predicted probabilities
def brier_skill_score(y_true, y_prob):
    """Return the Brier skill score of ``y_prob`` relative to a constant baseline.

    The baseline (reference) forecast assigns every sample the same
    probability: the fraction of positive samples in ``y_true``. The skill
    score is ``1 - bs_model / bs_ref``, so 1.0 is a perfect score, 0.0 means
    the model performs exactly as well as the reference, and negative values
    mean it performs worse.

    Parameters
    ----------
    y_true : array-like
        True class labels, encoded as 0 and 1.
    y_prob : array-like
        Predicted probabilities of the positive class (label 1).

    Returns
    -------
    float
        The Brier skill score. It is undefined when ``y_true`` contains a
        single class, because the reference Brier score is then 0.
    """
    n_samples = len(y_true)
    # Baseline probability: fraction of positive (non-zero) labels in y_true
    per_pos = np.count_nonzero(y_true) / n_samples
    # Calculate Brier score for the reference (i.e., the dataset)
    ref_probs = np.full(n_samples, per_pos)
    bs_ref = brier_score_loss(y_true, ref_probs)
    # Calculate Brier score for the predictive model
    bs_model = brier_score_loss(y_true, y_prob)
    # Calculate skill score, by comparing the Brier score for the reference and the model
    return 1.0 - (bs_model / bs_ref)


#### Use cross-validation, which uses a limited sample of a dataset in order to estimate the skill of a model, or how the model is expected to perform when used to make predictions on data not used in the training step

In [None]:
# Evaluate the model
def evaluate_model(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` using the Brier skill score.

    Parameters
    ----------
    X : array-like
        Input features.
    y : array-like
        Target labels.
    model : estimator
        Estimator (or pipeline) exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        One Brier skill score per cross-validation fold
        (10 splits x 3 repeats).
    """
    import inspect

    # Repeated, stratified k-fold cross-validation with a fixed seed
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Tell the scorer to score predicted probabilities. Newer scikit-learn
    # releases replaced `needs_proba=True` with
    # `response_method='predict_proba'`; pick whichever keyword the installed
    # make_scorer accepts. The signature is inspected (rather than catching
    # TypeError) because older releases forward unknown keywords to the
    # metric function instead of rejecting them.
    if 'response_method' in inspect.signature(make_scorer).parameters:
        metric = make_scorer(brier_skill_score, response_method='predict_proba')
    else:
        metric = make_scorer(brier_skill_score, needs_proba=True)
    # Evaluate model, using all available cores
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    return scores


# Define the final model


In [None]:
# Chain the two preprocessing transforms and the classifier, then train on all the data
pipeline_steps = [
    ('t1', MinMaxScaler()),
    ('t2', PowerTransformer()),
    ('m', LogisticRegression(solver='lbfgs')),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X, y)

print('Model fit complete.')


# Apply the model to a new patient

### Requires user to input patient data

In [None]:
# Ask the user for the three quantities the model takes as input
age = float(input('What was the age of the patient at the time of the operation?: '))
# The calendar year is shifted by 1900 before it is passed to the model
year = float(input('In what year was the surgery performed?: ')) - 1900.0
nodes = float(input('How many positive axillary nodes were detected in the patient?: '))

# Wrap the single patient as a one-row table of cases
data = [[age, year, nodes]]
for row in data:
    # Class probabilities predicted for this patient
    yhat = model.predict_proba([row])
    # First column is reported as the survival probability, in percent
    p_survive = 100 * yhat[0, 0]
    # Blank lines to separate the prompts from the summary
    print('\n')
    print('\n')
    patient_line = 'A {0:.0f}-year-old patient undergoing the breast cancer surgery in {1:.0f}'.format(age, year + 1900.0)
    outcome_line = 'with {0:.0f} positive axillary nodes detected has a {1:.2f}% chance of long-term survival.'.format(nodes, p_survive)
    print(patient_line)
    print(outcome_line)
    

### Some more test cases

#### Here, we'll check a few patients, pulled directly from the dataset, that we know survived (state 1) or did not survive (state 2) for longer than 5 years after their surgery. The model will output the probability of survival for each case.

In [None]:
print('Test Cases:')
# Known states for these rows, in order: 1, 2, 1, 1, 2
data = [
    [66,58,0],
    [52,69,3],
    [36,60,1],
    [38,60,0],
    [53,59,3],
]
for row in data:
    # Class probabilities predicted for this case
    yhat = model.predict_proba([row])
    # First column is reported as the survival probability, in percent
    p_survive = 100 * yhat[0, 0]
    # One summary line per case
    print('data=%s, Survival=%.3f%%' % (row, p_survive))
    

#### Make some plots showing the entire parameter space on which the model was trained

In [None]:
# Generate the entire parameter space: every integer value between the
# minimum and maximum of each input column
ages = np.arange(np.min(df['AGE']), np.max(df['AGE'])+1, 1)
years = np.arange(np.min(df['YEAR']), np.max(df['YEAR'])+1, 1)
nodes = np.arange(np.min(df['NODES']), np.max(df['NODES'])+1, 1)

# One row per (age, year, nodes) combination. indexing='ij' followed by a
# C-order reshape makes ages vary slowest and nodes fastest, i.e. the same
# ordering as three nested loops over ages, years, nodes.
# The actual node values are used (not their positions in `nodes`), and no
# loop variable is named `y`, so the encoded target vector `y` stays intact.
grid = np.stack(np.meshgrid(ages, years, nodes, indexing='ij'), axis=-1).reshape(-1, 3)

# Make all model predictions in a single batched call; the first column is
# the probability of survival, as a fraction from 0.0 to 1.0
p_survive = model.predict_proba(grid)[:, 0]

# Save results as a list of [age, year, nodes, p_survive] rows
prob = np.column_stack((grid, p_survive)).tolist()

In [None]:
def _plot_survival_panel(ax, x_vals, y_vals, values, x_edges, y_edges, label, label_xy):
    """Draw one panel: the median of `values` binned on the (x, y) plane.

    Returns the binned-statistic result and the image drawn on `ax`.
    """
    # The edge arrays define len(edges) - 1 bins along each axis
    ret = binned_statistic_2d(x_vals, y_vals, values, statistic=np.median, bins=[x_edges, y_edges])
    # origin='lower' puts the first row at the bottom of the axes. The colour
    # limits are given to LogNorm itself, because imshow does not accept
    # vmin/vmax together with an explicit norm in newer matplotlib.
    image = ax.imshow(ret.statistic.T, origin='lower',
                      cmap='RdYlGn', aspect='auto',
                      extent=(np.min(x_edges), np.max(x_edges), np.min(y_edges), np.max(y_edges)),
                      norm=matplotlib.colors.LogNorm(vmin=0.25, vmax=1.0))
    ax.text(label_xy[0], label_xy[1], label, fontsize=20)
    return ret, image


fig, axarr = plt.subplots(2,2, figsize=(18,16))

fig.subplots_adjust(wspace=0.1,hspace=0.1)

# Split the [age, year, nodes, probability] rows of `prob` into one list per column
ages_model, year_model, node_model, prob_model = (list(column) for column in zip(*prob))

# Lower-triangle ("corner") layout: age-year, age-nodes and year-nodes planes
ret_ay, im = _plot_survival_panel(axarr[0,0], ages_model, year_model, prob_model,
                                  ages, years, '$[1]$', (33, 68))
ret_an, _ = _plot_survival_panel(axarr[1,0], ages_model, node_model, prob_model,
                                 ages, nodes, '$[2]$', (33, 47))
ret_yn, _ = _plot_survival_panel(axarr[1,1], year_model, node_model, prob_model,
                                 years, nodes, '$[3]$', (59, 47))

# The upper-right panel is left empty
axarr[0,1].axis('off')

axarr[0,0].set_ylabel('Year of surgery',fontsize=20)
axarr[1,0].set_xlabel('Age of patient at time of surgery',fontsize=20)
axarr[1,0].set_ylabel('Number of positive axillary nodes detected',fontsize=20)
axarr[1,1].set_xlabel('Year of surgery',fontsize=20)

# All panels share the same colour scale, so one colourbar (taken from the
# first panel) serves the whole figure
cbar = fig.colorbar(im, ax=axarr.ravel().tolist(), ticks=[0.3, 0.4, 0.6, 1.0], pad=0.025)
cbar.set_label(label='Probability of long-term survival', fontsize=20)
cbar.ax.set_yticklabels(['30%', '40%', '60%', '100%'])

# Save through the figure object; savefig has no `fig` keyword
fig.savefig(path + '/' + 'haberman_corner_survival.pdf', dpi=300)
