# HSMA Exercise

The data loaded in this exercise is for seven acute stroke units, and whether a patient receives clot-busting treatment for stroke.  There are lots of features, and a description of the features can be found in the file stroke_data_feature_descriptions.csv.

Train a Logistic Regression model to try to predict whether or not a stroke patient receives clot-busting treatment.  Use the prompts below to write each section of code.

What do you conclude are the most important features for predicting whether a patient receives clot-busting treatment?  Can you improve accuracy by changing the size of your train / test split?  If you have time, perhaps consider dropping some features from your data based on your outputs (in the same way you dropped passengerID in the Titanic example).  Don't forget you'll need to rerun all subsequent cells if you make changes like that.

In [None]:
import pandas as pd
import numpy as np
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download data
# (not required if running locally and have previously downloaded data)

download_required = True

if download_required:

    # Download processed data:
    address = ('https://raw.githubusercontent.com/MichaelAllen1966/'
               '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv')
    data = pd.read_csv(address)

    # Create the data subfolder. exist_ok=True makes this a no-op when the
    # folder is already there, avoiding a separate (race-prone) existence
    # check before creating it.
    import os
    data_directory = './data/'
    os.makedirs(data_directory, exist_ok=True)

    # Save data to data subfolder so later runs can set
    # download_required = False and work from the local copy
    data.to_csv(data_directory + 'hsma_stroke.csv', index=False)

# Load data from the local copy (this is the only source of `data` when
# download_required is False)
data = pd.read_csv('data/hsma_stroke.csv')
# Make all data 'float' type
data = data.astype(float)
# Show data (first rows are displayed as the cell output)
data.head()

In [None]:
# Look at overview of data: per-column summary statistics, displayed as the
# cell output
data.describe()

In [None]:
# Compare the average value of every column between patients who received
# a clotbuster and patients who did not
clotbuster_flag = data['Clotbuster given']

# Split the rows by the value of the label column
given = data[clotbuster_flag == 1]
not_given = data[clotbuster_flag == 0]

# One row per column of `data`, one column of means per patient group
summary = pd.DataFrame({
    'given': given.mean(),
    'not given': not_given.mean(),
})

summary

In [None]:
# Separate the label column (y) from the remaining feature columns (X)
y = data['Clotbuster given']
X = data.drop(columns='Clotbuster given')

In [None]:
# Divide into training and test sets.
# test_size=0.25 holds back 25% of the rows for testing. No random_state is
# passed, so a different random split is drawn each time this cell is run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
# Standardise data
def standardise_data(X_train, X_test):
    """Standardise the training and test features with a single scaler.

    The scaler is fitted on the training set only, and that same fitted
    scaler is then applied to the test set. Re-fitting on the test set would
    scale it with its own mean and standard deviation, so the test features
    would no longer be on the scale the model was trained on, and information
    about the test set would leak into the preprocessing.

    Parameters
    ----------
    X_train : training features
    X_test : test features (same columns as X_train)

    Returns
    -------
    train_std, test_std : the standardised training and test features
    """

    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Learn the scaling parameters from the training set and apply them to it
    train_std = sc.fit_transform(X_train)
    # Apply the training-set parameters to the test set (transform only,
    # no re-fitting)
    test_std = sc.transform(X_test)

    return train_std, test_std

# Standardise the training and test features; the results are what the model
# is fitted on and used to predict from in the following cells
X_train_std, X_test_std = standardise_data(X_train, X_test)

In [None]:
# Fit (train) Logistic Regression model on the standardised training
# features and the training labels; no arguments are passed to the
# constructor, so the library defaults are used
model = LogisticRegression()
model.fit(X_train_std, y_train)

In [None]:
# Use the fitted model to predict labels for both the training and test sets
y_pred_train = model.predict(X_train_std)
y_pred_test = model.predict(X_test_std)

# Accuracy = fraction of predictions that equal the true label
accuracy_train = (y_pred_train == y_train).mean()
accuracy_test = (y_pred_test == y_test).mean()

# Report both so that training and test accuracy can be compared
print(f'Accuracy of predicting training data = {accuracy_train}')
print(f'Accuracy of predicting test data = {accuracy_test}')

In [None]:
# Pull out the fitted weight for each feature and rank the features by the
# absolute size of that weight (largest first)
co_eff = model.coef_[0]

# One row per feature: its name, its signed weight, and the weight's magnitude
co_eff_df = pd.DataFrame({
    'feature': list(X),
    'co_eff': co_eff,
    'abs_co_eff': np.abs(co_eff),
})
co_eff_df = co_eff_df.sort_values(by='abs_co_eff', ascending=False)

co_eff_df