# HSMA Exercise

The data loaded in this exercise comes from seven acute stroke units, and records whether each patient receives clot-busting treatment for stroke.

How accurately can you predict treatment?

In [6]:
import pandas as pd
import numpy as np
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Download data
# (not required if running locally and have previously downloaded data)
import os

download_required = True

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv'
    data = pd.read_csv(address)

    # Create the data subfolder. exist_ok=True makes this a no-op when the
    # folder is already present, so no separate existence check is needed.
    data_directory = './data/'
    os.makedirs(data_directory, exist_ok=True)

    # Save data to data subfolder
    data.to_csv(os.path.join(data_directory, 'hsma_stroke.csv'), index=False)

In [3]:
# Read the locally saved CSV and cast every column to 'float' in one chain
data = pd.read_csv('data/hsma_stroke.csv').astype(float)
# Preview the first rows
data.head()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,...,2.0,0.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,1.0


In [14]:
# Target: the 'Clotbuster given' column; features: every other column
y = data['Clotbuster given']
X = data.drop(columns='Clotbuster given')

In [15]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every result derived from it) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [16]:
def standardise_data(X_train, X_test):
    """Standardise training and test features using training-set statistics.

    A StandardScaler is fitted on ``X_train`` only and then applied to both
    sets, so the test data plays no part in setting up the scaling.

    Returns the transformed training set and the transformed test set, in
    that order.
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

In [17]:
# Standardise both feature sets with a scaler fitted on the training data
X_train_std, X_test_std = standardise_data(X_train=X_train, X_test=X_test)

In [18]:
# Fit a logistic regression (default settings) to the standardised training data
model = LogisticRegression().fit(X_train_std, y_train)
model

LogisticRegression()

In [19]:
# Predict labels for the training set and the test set with the fitted model
y_pred_train, y_pred_test = map(model.predict, (X_train_std, X_test_std))

In [20]:
# Accuracy = fraction of predicted labels that match the true labels
accuracy_train = (y_pred_train == y_train).mean()
accuracy_test = (y_pred_test == y_test).mean()

print('Accuracy of predicting training data =', accuracy_train)
print('Accuracy of predicting test data =', accuracy_test)

Accuracy of predicting training data = 0.8193418401611821
Accuracy of predicting test data = 0.8203753351206434


In [21]:
# Take the first row of the model's coefficient array (one value per feature)
co_eff = model.coef_[0, :]
co_eff

array([ 0.14171107,  0.07911478, -0.04838052, -0.14490266, -0.0656024 ,
        0.04114454, -0.01747955,  0.04295554, -0.25025477, -0.08888229,
       -0.30533694,  0.04872931,  0.28894839, -0.17617818, -0.02971611,
       -0.02579055,  0.28920765, -0.54842361,  0.05693832, -0.23510511,
        0.00537132,  0.19101201,  0.05236675, -0.19075762,  0.26269543,
       -0.20901247, -0.04814678, -0.36805885, -0.70444566,  0.54011467,
        0.29252822, -0.0105309 ,  1.11899814, -1.11899814, -0.50973059,
       -0.47451149,  0.1985917 , -0.24124451,  0.08524606,  0.07489683,
        0.31638884,  0.0944927 ,  0.1059985 ,  0.22319369, -0.05249798,
       -0.02777819,  0.07068403,  0.51818261,  0.09984129,  0.1851082 ])

In [22]:
# Build the feature/coefficient table in one step, then order the rows by
# coefficient magnitude (largest first)
co_eff_df = pd.DataFrame({
    'feature': list(X),  # feature names are the column names of X
    'co_eff': co_eff,
    'abs_co_eff': np.abs(co_eff),
}).sort_values(by='abs_co_eff', ascending=False)

In [23]:
# Display the coefficient table (rows sorted by absolute coefficient, largest first)
co_eff_df

Unnamed: 0,feature,co_eff,abs_co_eff
33,Stroke Type_PIH,-1.118998,1.118998
32,Stroke Type_I,1.118998,1.118998
28,Stroke severity group_2. Minor,-0.704446,0.704446
17,Atrial Fib,-0.548424,0.548424
29,Stroke severity group_3. Moderate,0.540115,0.540115
47,S2NihssArrivalBestLanguage,0.518183,0.518183
34,S2RankinBeforeStroke,-0.509731,0.509731
35,S2NihssArrival,-0.474511,0.474511
27,Stroke severity group_1. No stroke symtpoms,-0.368059,0.368059
40,S2NihssArrivalFacialPalsy,0.316389,0.316389
