# HSMA Exercise

The data loaded in this exercise covers patients at seven acute stroke units, and records whether each patient receives clot-busting treatment for stroke.

How accurately can you predict treatment?

In [1]:
import os

import pandas as pd

# Download data
# (not required if running locally and have previously downloaded data)

download_required = True

# Single source of truth for where the local copy lives, so that the save step
# and the load step below always refer to the same file.
data_directory = './data/'
data_path = os.path.join(data_directory, 'hsma_stroke.csv')

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv'
    downloaded = pd.read_csv(address)

    # Create the data subfolder if one does not already exist
    # (exist_ok avoids a separate existence check)
    os.makedirs(data_directory, exist_ok=True)

    # Save data to data subfolder
    downloaded.to_csv(data_path, index=False)

# Load data from the local copy and make all data 'float' type
data = pd.read_csv(data_path).astype(float)
# Show data
data.head()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,...,2.0,0.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,1.0


In [2]:
import numpy as np
import pandas as pd
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Add code here to predict whether a patient receives clot-busting treatment
# y = the label to predict: the 'Clotbuster given' column
y = data['Clotbuster given']
# X = the features: every column of 'data' except the label
X = data.drop(columns='Clotbuster given')

In [18]:
# split into train and test (20% of rows held back for testing)
# A fixed random_state makes the split, and therefore the accuracies reported
# further down, reproducible when the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [19]:
# standardise the data
def standardise_data(X_train, X_test):
    """Standardise the training and test features with one shared scaler.

    A StandardScaler is fitted on `X_train` only and then applied to both
    `X_train` and `X_test`, so the test set plays no part in choosing the
    scaling parameters.

    Returns a tuple `(train_std, test_std)` holding the transformed training
    and test sets, in that order.
    """
    # fit() returns the fitted scaler itself, so it can be chained
    scaler = StandardScaler().fit(X_train)

    return scaler.transform(X_train), scaler.transform(X_test)

In [20]:
# Standardise both feature sets (the scaler is fitted on the training set only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

In [21]:
# fit the model
# Logistic regression with scikit-learn's default settings, trained on the
# standardised training features and the training labels
model = LogisticRegression()
model.fit(X_train_std,y_train)

In [None]:
# NOTE(review): this bare attribute reference only displays the bound method;
# it does not make any predictions. It looks like a leftover exploratory cell
# (it has no execution count) - consider deleting it.
model.predict

In [22]:
# Predict training and test set labels
# (predicting on the training set as well lets the two accuracies be compared)
y_pred_train = model.predict(X_train_std)
y_pred_test = model.predict(X_test_std)

In [23]:
# Accuracy = fraction of predicted labels that match the observed labels
# (the mean of a True/False comparison is the proportion that are True)
accuracy_train = np.mean(y_train == y_pred_train)
accuracy_test = np.mean(y_test == y_pred_test)

print('Accuracy of predicting training data =', accuracy_train)
print('Accuracy of predicting test data =', accuracy_test)

Accuracy of predicting training data = 0.8126259234385493
Accuracy of predicting test data = 0.8310991957104558


In [13]:
# coefficients
# Take the first row of model.coef_: one fitted weight per input feature
# (presumably the only row for this two-class problem - verify against the
# scikit-learn docs). The model was fitted on standardised features.
co_eff = model.coef_[0]
co_eff

array([ 0.13889216,  0.10085726, -0.00804651, -0.14575432, -0.13538799,
        0.11793372, -0.08500797,  0.01780714, -0.23869174,  0.00238511,
       -0.28112809,  0.07214423,  0.25848688, -0.01075539, -0.13837288,
       -0.02835033,  0.29657525, -0.19553772,  0.02636493, -0.18584483,
       -0.06317013,  0.06334229, -0.15409454,  0.04737861,  0.37162635,
       -0.27576936, -0.07489539, -0.37452401, -0.66532603,  0.55825227,
        0.23749554, -0.02164619,  1.12492107, -1.12492107, -0.56975486,
       -0.0728938 ,  0.24558167, -0.21371371,  0.06595929, -0.02540564,
        0.176242  ,  0.18546785,  0.10023123,  0.02316549, -0.11176259,
        0.02596318,  0.03380543,  0.35176566,  0.04704562,  0.2022728 ])

In [14]:
# Tabulate each feature's coefficient next to its absolute size, then order
# the table so the largest-magnitude coefficients come first
co_eff_df = pd.DataFrame({
    'feature': list(X),  # Get feature names from X
    'co_eff': co_eff,
    'abs_co_eff': np.abs(co_eff),
})
co_eff_df = co_eff_df.sort_values(by='abs_co_eff', ascending=False)

In [15]:
# Display the coefficient table
co_eff_df

Unnamed: 0,feature,co_eff,abs_co_eff
32,Stroke Type_I,1.124921,1.124921
33,Stroke Type_PIH,-1.124921,1.124921
28,Stroke severity group_2. Minor,-0.665326,0.665326
34,S2RankinBeforeStroke,-0.569755,0.569755
29,Stroke severity group_3. Moderate,0.558252,0.558252
27,Stroke severity group_1. No stroke symtpoms,-0.374524,0.374524
24,Anticoag before stroke_0,0.371626,0.371626
47,S2NihssArrivalBestLanguage,0.351766,0.351766
16,Hypertension,0.296575,0.296575
10,Onset Time Known Type_BE,-0.281128,0.281128


In [16]:
# Predicted class for every test-set patient; display the first ten
classes = model.predict(X_test_std)
classes[:10]

array([0., 0., 1., 0., 0., 1., 1., 0., 0., 0.])

In [17]:
# Predicted probabilities for every test-set patient; display the first ten.
# Each row holds one probability per class, so compare these rows with the
# predicted classes shown above.
probabilities = model.predict_proba(X_test_std)
probabilities[:10]

array([[0.88624106, 0.11375894],
       [0.97983265, 0.02016735],
       [0.2116898 , 0.7883102 ],
       [0.90218909, 0.09781091],
       [0.93700537, 0.06299463],
       [0.08793722, 0.91206278],
       [0.20656732, 0.79343268],
       [0.88698373, 0.11301627],
       [0.90997369, 0.09002631],
       [0.5553472 , 0.4446528 ]])