# HSMA Exercise

The data loaded in this exercise is for seven acute stroke units, and whether a patient receives clot-busting treatment for stroke. There are lots of features, and a description of the features can be found in the file stroke_data_feature_descriptions.csv.

Train a decision tree model to try to predict whether or not a stroke patient receives clot-busting treatment.  Use the prompts below to write each section of code.

## Core Tasks

Run the code below to import the dataset and the libraries we need. 

In [None]:
import pandas as pd
import numpy as np

# import preprocessing functions
from sklearn.model_selection import train_test_split

# Import machine learning model of interest
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
# Import package to investigate our loaded dataframe
from ydata_profiling import ProfileReport

# Import functions for evaluating model
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report, \
                            confusion_matrix, ConfusionMatrixDisplay, auc, roc_curve
from sklearn.metrics import auc, roc_curve, RocCurveDisplay, f1_score, precision_score, \
                            recall_score, confusion_matrix, ConfusionMatrixDisplay, \
                            classification_report
from sklearn.inspection import permutation_importance

# Imports relating to logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Imports relating to plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Download data
# (not required if running locally and have previously downloaded data)

import os

download_required = True

# Build the local file location once so the save step and the load step
# always refer to the same file.
data_directory ='./data/'
data_path = data_directory + 'hsma_stroke.csv'

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist. exist_ok=True
    # replaces the separate "does it exist?" check, which could race with
    # another process creating the folder in between.
    os.makedirs(data_directory, exist_ok=True)

    # Save data to data subfolder
    data.to_csv(data_path, index=False)

# Load data (always read back from the local copy)
data = pd.read_csv(data_path)
# Make all data 'float' type
data = data.astype(float)

Look at an overview of the data. Choose whichever method you like.

(e.g. something like the 'head' or 'describe' method from pandas.)

In [None]:
# Preview the first few rows to sanity-check the loaded columns and values
data.head()


Divide the main stroke dataset into features and labels.

Remember - we're trying to predict whether patients are given clot-busting treatment or not.

What column contains that information?

In [None]:
# Labels: the column recording whether the clotbuster was given
y = data['Clotbuster given']
# Features: every other column in the dataset
X = data.drop(columns='Clotbuster given')

Split the data into training and testing sets. 

Start with a train/test split of 80/20. 

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every metric reported later in the notebook, is reproducible
# from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def train_and_display(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, show how it performs on the test data, return its F1.

    The model is fitted on ``X_train``/``y_train`` and then used to predict
    ``X_test``. A text classification report is printed and a confusion
    matrix normalised over the true labels is plotted.

    Returns:
        The value of ``f1_score(y_test, predictions)`` for the test set.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    print(classification_report(y_test, test_predictions))

    # normalize='true' scales each row (true class) so it sums to 1
    normalised_counts = confusion_matrix(
        y_test, test_predictions, normalize='true'
    )
    ConfusionMatrixDisplay(confusion_matrix=normalised_counts).plot()
    plt.show()

    return f1_score(y_test, test_predictions)

In [None]:
# Baseline: a decision tree with default settings, scored on the test set
model = DecisionTreeClassifier()

train_and_display(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Fit a random forest model.

In [None]:
model = RandomForestClassifier() # Create a random forest model (no hyperparameters overridden)
model = model.fit(X_train,y_train) # Fit the model using our training data

Use the trained model to predict labels in both training and test sets.

In [None]:
# Run the fitted model over both splits (training first, then test) so that
# training and test performance can be compared
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)





Calculate and compare accuracy across training and test sets.

In [None]:
# Accuracy is the fraction of predictions that equal the true label
accuracy_train = (y_pred_train == y_train).mean()
accuracy_test = (y_pred_test == y_test).mean()

print(f"Accuracy of predicting training data = {accuracy_train:.3%}")
print(f"Accuracy of predicting testing data = {accuracy_test:.3%}")

Look at the other model metrics.

- precision
- specificity
- recall (sensitivity)
- f1


In [None]:
# Precision: of the patients predicted to receive the clotbuster, the
# proportion who actually did.
precision_score_test = precision_score(y_test, y_pred_test)
# Recall (sensitivity): of the patients who actually received the
# clotbuster, the proportion the model identified.
recall_sensitivity_score_test = recall_score(y_test, y_pred_test)
# Specificity is the recall of the negative class (true negatives divided by
# all actual negatives), so score recall with class 0 as the positive label.
# Using precision_score here would simply repeat the precision figure.
specificity_score_test = recall_score(y_test, y_pred_test, pos_label=0)

print(f"Precision score for testing data = {precision_score_test:.3%}")
print(f"Recall (sensitivity) score for testing data = {recall_sensitivity_score_test:.3%}")
print(f"Specificity score for testing data = {specificity_score_test:.3%}")

In [None]:
# average=None asks for one F1 score per class rather than a single averaged value
f1_score(y_test, y_pred_test, average=None)

Repeat this using the `classification_report` function, returning the output as a dataframe.

In [None]:
# Report on the held-out test set so these figures are comparable with the
# individual test-set metrics calculated in the cells above (the training-set
# report would only show how well the forest memorised its training data).
# output_dict=True returns a dictionary that pandas can turn into a dataframe.
pd.DataFrame(classification_report(
    y_true=y_test,
    y_pred=y_pred_test,
    target_names=["Not Given Clotbuster", "Given Clotbuster"],
    output_dict=True
))

Plot a confusion matrix for your model.

In [None]:
# y_pred_test holds the random forest's predictions, so the plot is titled
# accordingly. The variable keeps its original name so that any other cell
# referring to it continues to work.
confusion_matrix_dt = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test
    ),
    display_labels=["Not Given Clotbuster", "Given Clotbuster"]
)

fig, ax = plt.subplots(figsize=(14, 5))

# Draw onto the axes created above so the figure size is respected
confusion_matrix_dt.plot(ax=ax)
ax.title.set_text('Random Forest')


Plot a normalized confusion matrix for your model.

In [None]:
# y_pred_test holds the random forest's predictions, so the plot is titled
# accordingly. The variable keeps its original name so that any other cell
# referring to it continues to work.
confusion_matrix_dt = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test,
        # Scale each row (true class) to sum to 1 so cells read as rates
        normalize='true'
    ),
    display_labels=["Not Given Clotbuster", "Given Clotbuster"]
)

fig, ax = plt.subplots(figsize=(14, 5))

# Draw onto the axes created above so the figure size is respected
confusion_matrix_dt.plot(ax=ax)
ax.title.set_text('Random Forest')


## Part 2 - Refining Your Random Forest

Let's experiment by changing a few parameters.

After changing the parameters, look at the model metrics like accuracy, precision, and recall.

Tweak the parameters to see what model performance you can achieve.

### Maximum Depth

In [None]:
# Try every maximum depth from 1 to 20 and keep the score returned by
# train_and_display for each one
max_depths = range(1, 21)
scores = []

for depth in max_depths:
    model = RandomForestClassifier(max_depth=depth)
    score = train_and_display(model, X_train, y_train, X_test, y_test)
    scores += [score]

# Plot the score against the depth that produced it
plt.plot(max_depths, scores)
plt.title('Score vs. Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Score')
plt.show()

### Number of Trees

In [None]:
# Try forests of 1 to 20 trees and keep the score returned by
# train_and_display for each size
tree_counts = range(1, 21)
scores = []

for n_estimators in tree_counts:
    # Pass the value by keyword so it is explicit which setting is varied
    model = RandomForestClassifier(n_estimators=n_estimators)
    score = train_and_display(model, X_train, y_train, X_test, y_test)
    scores.append(score)

plt.plot(tree_counts, scores)
# The x-axis is the number of trees in the forest, not the maximum depth
plt.xlabel('Number of Trees')
plt.ylabel('Score')
plt.title('Score vs. Number of trees in the forest')
plt.show()

## Part 3 - Comparing Performance with a Decision Tree Model

Copy your code in from the previous exercise on decision trees.

If you tuned your decision tree, you can bring in the best-performing of your decision tree models.

In [None]:
# Decision tree with default settings, for comparison with the random forest
model = DecisionTreeClassifier()

train_and_display(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Look at all of the metrics.

- precision
- specificity
- recall (sensitivity)
- f1

In [None]:
## YOUR CODE HERE

Repeat this using the `classification_report` function, returning the output as a dataframe.

In [None]:
def train_and_report(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, display its test-set results, return them as a dataframe.

    The model is fitted on ``X_train``/``y_train`` and then used to predict
    ``X_test``. A text classification report is printed and a confusion
    matrix normalised over the true labels is plotted.

    Returns:
        A ``pandas.DataFrame`` built from the dictionary form of the
        classification report for the test set.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # Human-readable report first ...
    print(classification_report(y_test, test_predictions))

    # ... then the confusion matrix, each row (true class) scaled to sum to 1
    normalised_counts = confusion_matrix(
        y_test, test_predictions, normalize='true'
    )
    ConfusionMatrixDisplay(confusion_matrix=normalised_counts).plot()
    plt.show()

    # The same report again, this time as a dictionary that pandas can load
    return pd.DataFrame(
        classification_report(y_test, test_predictions, output_dict=True)
    )

# Fit a default decision tree and keep its classification report as a dataframe
model = DecisionTreeClassifier()
report_df = train_and_report(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

report_df.head()

Plot a confusion matrix for the decision tree model. 

In [None]:
## YOUR CODE HERE

Plot a normalised confusion matrix for the decision tree model. 

In [None]:
## YOUR CODE HERE

## Extension

### ROC and AUC

Create receiver operating characteristic (ROC) curves, labelled with the area under the curve (AUC).

In [None]:
def train_and_report(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, display its test-set results, return them as a dataframe.

    The model is fitted on ``X_train``/``y_train`` and then used to predict
    ``X_test``. The function prints a text classification report, plots a
    confusion matrix normalised over the true labels, and plots the ROC curve
    for the test set with a dashed diagonal reference line.

    Returns:
        A ``pandas.DataFrame`` built from the dictionary form of the
        classification report for the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict)

    # Named roc_display (not roc_curve) so the sklearn roc_curve function
    # imported at the top of the file is not shadowed inside this function.
    roc_display = RocCurveDisplay.from_estimator(model, X_test, y_test)

    # Draw the chance line on the ROC plot's own axes. Previously the axes
    # were stored in `x` and the line was drawn on whatever global `ax`
    # happened to exist, i.e. on an unrelated earlier figure (or a NameError
    # if no such variable had been created).
    roc_display.ax_.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # Show the ROC figure now so it appears alongside this model's other output
    plt.show()

    return report_df


### Comparing Performance with a Logistic Regression Model

Copy your code in from last week's logistic regression exercise. 

**Remember - you will need to standardise the data for the logistic regression model!**

Look at all of the metrics.

- precision
- specificity
- recall (sensitivity)
- f1


In [None]:
## YOUR CODE HERE

Plot a confusion matrix for the logistic regression model. 

In [None]:
## YOUR CODE HERE

Plot a normalised confusion matrix for the logistic regression model.

In [None]:
## YOUR CODE HERE

### Comparing all of the models

In the previous exercise, we compared the performance of the logistic regression model and the decision tree model.

Now consider the random forest too. 

Compare and contrast the confusion matrices for each of these.

If one of these models were to be selected, which model would you recommend to put into use, and why?

Remember: giving thrombolysis to good candidates for it can lead to less disability after stroke and improved outcomes. However, there is a risk that giving thrombolysis to the wrong person could lead to additional bleeding on the brain and worse outcomes. What might you want to balance?

You can write your answer into the empty cell below.

## Challenge

### Challenge Exercise 1

Try plotting all of your confusion matrices onto a single matplotlib figure. 

Make sure you give each of these a title.

Hint: You'll need to create multiple matplotlib subplots.

In [None]:
## YOUR CODE HERE

Now do the same for the normalised confusion matrices.

In [None]:
## YOUR CODE HERE

Now do the same for your ROC curves.

In [None]:
## YOUR CODE HERE

### Challenge Exercise 2

Using a random forest gives us another way to look at feature importance in our datasets.

Take a look at this example from the scikit learn documentation. 
https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

Calculate the feature importance using both methods (mean decrease in impurity, and feature permutation) for the dataset we have been working with in this exercise. 

Do they produce the same ranking of feature importance? 

In [None]:
## YOUR CODE HERE

### Challenge Exercise 3
Can you improve accuracy of your random forest by changing the size of your train / test split?  



In [None]:
## YOUR CODE HERE

### Challenge Exercise 4

Try dropping some features from your data.  

Can you improve the performance of your random forest this way?

In [None]:
## YOUR CODE HERE

Try these models and compare the results:

- AdaBoost
- XGBoost
- CatBoost
  - You won’t see the full benefits of this as we’ve already one-hot encoded this dataset
- Histogram-based gradient boosting
- LightGBM


In [None]:
# Fit each boosting model in turn on the same split, keeping every
# classification report under its own name so they can be compared later

# AdaBoost
model = AdaBoostClassifier()
AdaBoostClassifier_scores = train_and_report(
    model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

# XGBoost
model = XGBClassifier()
XGBClassifier_scores = train_and_report(
    model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

# LightGBM
model = LGBMClassifier()
LGBMClassifier_scores = train_and_report(
    model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

# Histogram-based gradient boosting
model = HistGradientBoostingClassifier()
HistGradientBoostingClassifier_scores = train_and_report(
    model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)




In [None]:
# Models to compare, in the same order as the names, colours and line styles
models = [
    AdaBoostClassifier(),
    XGBClassifier(),
    LGBMClassifier(),
    HistGradientBoostingClassifier(),
    RandomForestClassifier(),
]
model1, model2, model3, model4, model5 = models

model_names = ['AdaBoost', 'XGB', 'LGBM', 'HistGradientBoosting', 'RandomForest']
colors = ['blue', 'green', 'red', 'purple', 'orange']
linestyles = ['-', '--', '-.', ':', '-']

plt.figure(figsize=(10, 8))

# Fit each model and overlay its test-set ROC curve on the shared figure
for model, name, color, linestyle in zip(models, model_names, colors, linestyles):
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score passed to roc_curve
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(
        fpr,
        tpr,
        color=color,
        linestyle=linestyle,
        lw=2,
        label=f'{name} (area = {roc_auc:.2f})',
    )

# Plot ROC curve for a random model (chance)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Chance')

# Axis limits, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()