# HSMA Exercise

The data loaded in this exercise is for seven acute stroke units, and whether a patient receives clost-busting treatment for stroke.

How accurately can you predict treatment?

In [35]:
# Import modules
import numpy as np
import pandas as pd
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import explained_variance_score

In [4]:
# Download data 
# (not required if running locally and have previously downloaded data)

download_required = True

if download_required:
    
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv'        
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory ='./data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data to data subfolder
    data.to_csv(data_directory + 'hsma_stroke.csv', index=False)
    
# Load data    
data = pd.read_csv('data/hsma_stroke.csv')
# Make all data 'float' type
data = data.astype(float)


In [9]:
# Show data
data.head()


Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,...,2.0,0.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,1.0


In [6]:
data.describe()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
count,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,...,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0
mean,0.40333,0.159506,0.14232,0.154672,0.165414,0.055854,0.113319,0.208915,0.515575,74.553706,...,1.11493,1.002148,0.96348,0.96348,0.910849,0.216971,0.610097,0.944146,0.739527,0.566595
std,0.490698,0.366246,0.349472,0.361689,0.371653,0.229701,0.317068,0.406643,0.499892,12.280576,...,0.930527,1.479211,1.441594,1.406501,1.380606,0.522643,0.771932,1.121379,0.731083,0.794
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,76.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,83.0,...,2.0,2.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,...,3.0,4.0,4.0,4.0,4.0,2.0,2.0,3.0,2.0,2.0


In [14]:
mask = data['Clotbuster given'] == 1 # Mask for passengers who survive
treatment = data[mask] # filter using mask

mask = data['Clotbuster given'] == 0 # Mask for passengers who died
no_treatment = data[mask] # filter using mask

In [17]:
summary = pd.DataFrame() # New empty DataFrame
summary['treatment'] = treatment.mean()
summary['no treatment'] = no_treatment.mean()
summary

Unnamed: 0,treatment,no treatment
Clotbuster given,1.0,0.0
Hosp_1,0.203728,0.129613
Hosp_2,0.122503,0.155716
Hosp_3,0.182423,0.135914
Hosp_4,0.13715,0.184518
Hosp_5,0.067909,0.047705
Hosp_6,0.123835,0.106211
Hosp_7,0.16245,0.240324
Male,0.515313,0.515752
Age,73.303595,75.39874


In [18]:
X = data.drop('Clotbuster given',axis=1) # X = all 'data' except the 'survived' column
y = data['Clotbuster given'] # y = 'survived' column from 'data'

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [21]:
summary2 = pd.DataFrame() # New empty DataFrame
summary2['X_train_std'] = X_train.std()
summary2['X_train_mean'] = X_train.mean()
summary2

Unnamed: 0,X_train_std,X_train_mean
Hosp_1,0.370445,0.16404
Hosp_2,0.344536,0.137536
Hosp_3,0.362457,0.155444
Hosp_4,0.376195,0.170487
Hosp_5,0.221249,0.051576
Hosp_6,0.320395,0.116046
Hosp_7,0.403752,0.204871
Male,0.499476,0.526504
Age,12.314106,74.472063
80+,0.482877,0.369628


In [36]:
def standardise_data(X_train, X_test):
    
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_std=sc.transform(X_train)
    test_std=sc.transform(X_test)
    
    return train_std, test_std

def minmax_normalisation_data(X_train, X_test):
    # Initialise a new scaling object for normalising input data
    scaler = MinMaxScaler()
    
    # Set up the scaler just on the training set
    scaler.fit(X_train)
    
    train_minmax = scaler.transform(X_train)
    test_minmax = scaler.transform(X_test)
    
    return train_minmax, test_minmax

# Min Max Normalised

In [37]:
X_train_minmaxnorm, X_test_minmaxnorm = minmax_normalisation_data(X_train, X_test)

In [38]:
model_norm = LogisticRegression()
model_norm.fit(X_train_minmaxnorm,y_train)

LogisticRegression()

In [39]:
y_pred_train_minmax = model.predict(X_train_minmaxnorm)
y_pred_test_minmax = model.predict(X_test_minmaxnorm)

In [41]:
accuracy_train_minmax = np.mean(y_pred_train_minmax == y_train)
accuracy_test_minmax = np.mean(y_pred_test_minmax == y_test)

print ('Accuracy of predicting training data =', accuracy_train_minmax)
print ('Accuracy of predicting test data =', accuracy_test_minmax)

Accuracy of predicting training data = 0.8015759312320917
Accuracy of predicting test data = 0.7811158798283262


In [42]:
co_eff_norm = model_norm.coef_[0]
co_eff_norm

array([ 0.55218581, -0.01991011, -0.17879103, -0.44696859, -0.20362789,
        0.3295876 , -0.02761973,  0.14825339, -0.81431322, -0.28598982,
       -0.936483  ,  0.24118902,  0.70015003, -0.14191035, -0.21912452,
       -0.14650685,  0.4549562 , -0.53569579,  0.04737641, -0.38777135,
       -0.22520324,  0.4564013 , -0.29939341, -0.15215183,  0.89252828,
       -0.76158393, -0.12608829, -1.06995505, -1.16278626,  1.44407862,
        0.8707002 , -0.07718144,  2.57208115, -2.5672251 , -1.93478392,
        0.03130902,  0.22825142, -0.2943785 ,  0.08924189,  0.07706406,
        0.8993364 ,  0.43436525,  0.16872381,  0.0371874 , -0.24281921,
       -0.39603824,  0.1504876 ,  1.05798199,  0.22499495,  0.40120716])

In [44]:
co_eff_df_norm = pd.DataFrame() # create empty DataFrame
co_eff_df_norm['feature'] = list(X) # Get feature names from X
co_eff_df_norm['co_eff'] = co_eff_norm
co_eff_df_norm['abs_co_eff'] = np.abs(co_eff_norm)
co_eff_df_norm.sort_values(by='abs_co_eff', ascending=False, inplace=True)
co_eff_df_norm

Unnamed: 0,feature,co_eff,abs_co_eff
32,Stroke Type_I,2.572081,2.572081
33,Stroke Type_PIH,-2.567225,2.567225
34,S2RankinBeforeStroke,-1.934784,1.934784
29,Stroke severity group_3. Moderate,1.444079,1.444079
28,Stroke severity group_2. Minor,-1.162786,1.162786
27,Stroke severity group_1. No stroke symtpoms,-1.069955,1.069955
47,S2NihssArrivalBestLanguage,1.057982,1.057982
10,Onset Time Known Type_BE,-0.936483,0.936483
40,S2NihssArrivalFacialPalsy,0.899336,0.899336
24,Anticoag before stroke_0,0.892528,0.892528


# Standardised

In [23]:
X_train_std, X_test_std = standardise_data(X_train, X_test)

In [24]:
model = LogisticRegression()
model.fit(X_train_std,y_train)

LogisticRegression()

In [25]:
y_pred_train = model.predict(X_train_std)
y_pred_test = model.predict(X_test_std)

In [26]:
accuracy_train = np.mean(y_pred_train == y_train)
accuracy_test = np.mean(y_pred_test == y_test)

print ('Accuracy of predicting training data =', accuracy_train)
print ('Accuracy of predicting test data =', accuracy_test)

Accuracy of predicting training data = 0.8316618911174785
Accuracy of predicting test data = 0.7746781115879828


In [27]:
co_eff = model.coef_[0]
co_eff

array([ 0.21296577, -0.00996322, -0.08777833, -0.19248883, -0.05792815,
        0.11826717,  0.00914973,  0.07391729, -0.22331967, -0.10500647,
       -0.39130191,  0.05431669,  0.37337509, -0.06179449, -0.11254966,
       -0.02478354,  0.27086484, -0.31523156,  0.0336932 , -0.16119408,
       -0.11906053,  0.19895726, -0.08657845, -0.10111335,  0.31794947,
       -0.24041189, -0.06619108, -0.31617678, -0.68876702,  0.59710755,
        0.24588784, -0.08079699,  1.16910406, -1.16910406, -0.52065317,
       -0.39848766,  0.14309137, -0.08021247,  0.05957886,  0.066153  ,
        0.33942369,  0.2476726 ,  0.13221446,  0.06556658, -0.03309872,
       -0.09649676,  0.0864252 ,  0.48313931,  0.10685416,  0.19905903])

In [31]:
explained_variance_score(y_train, y_pred_train)

0.30187024757418235

In [28]:
co_eff_df = pd.DataFrame() # create empty DataFrame
co_eff_df['feature'] = list(X) # Get feature names from X
co_eff_df['co_eff'] = co_eff
co_eff_df['abs_co_eff'] = np.abs(co_eff)
co_eff_df.sort_values(by='abs_co_eff', ascending=False, inplace=True)

In [29]:
co_eff_df

Unnamed: 0,feature,co_eff,abs_co_eff
33,Stroke Type_PIH,-1.169104,1.169104
32,Stroke Type_I,1.169104,1.169104
28,Stroke severity group_2. Minor,-0.688767,0.688767
29,Stroke severity group_3. Moderate,0.597108,0.597108
34,S2RankinBeforeStroke,-0.520653,0.520653
47,S2NihssArrivalBestLanguage,0.483139,0.483139
35,S2NihssArrival,-0.398488,0.398488
10,Onset Time Known Type_BE,-0.391302,0.391302
12,Onset Time Known Type_P,0.373375,0.373375
40,S2NihssArrivalFacialPalsy,0.339424,0.339424


In [45]:
# Show first ten predicted classes
classes = model.predict(X_test_std)
classes[0:10]

array([0., 0., 0., 0., 0., 0., 0., 1., 1., 1.])

In [46]:
# Show first ten predicted probabilities
# (note how the values relate to the classes predicted above)
probabilities = model.predict_proba(X_test_std)
probabilities[0:10]

array([[9.84203034e-01, 1.57969659e-02],
       [9.33619430e-01, 6.63805697e-02],
       [9.21285899e-01, 7.87141009e-02],
       [9.99714802e-01, 2.85198077e-04],
       [8.93359133e-01, 1.06640867e-01],
       [9.88940022e-01, 1.10599785e-02],
       [9.94576837e-01, 5.42316330e-03],
       [2.42978479e-01, 7.57021521e-01],
       [2.19826910e-01, 7.80173090e-01],
       [4.35018347e-02, 9.56498165e-01]])