# HSMA Exercise

The data loaded in this exercise covers patients from seven acute stroke units, and records whether each patient received clot-busting treatment for stroke.

How accurately can you predict treatment?

In [16]:
import numpy as np
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [1]:
import os

import pandas as pd

# Download data 
# (not required if running locally and have previously downloaded data)

download_required = True

# Single definition of where the CSV lives locally, so the save step and the
# load step below always refer to the same file.
data_directory = './data/'
data_path = os.path.join(data_directory, 'hsma_stroke.csv')

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '2004_titanic/master/jupyter_notebooks/data/hsma_stroke.csv'
    data = pd.read_csv(address)

    # Create the data subfolder if one does not already exist
    # (exist_ok=True replaces the separate exists-check-then-create step)
    os.makedirs(data_directory, exist_ok=True)

    # Save data to data subfolder
    data.to_csv(data_path, index=False)

# Load data from the local copy (whether just downloaded or already present)
data = pd.read_csv(data_path)
# Make all data 'float' type
data = data.astype(float)
# Show data
data.head()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,...,2.0,0.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,1.0


In [3]:
data.columns

Index(['Clotbuster given', 'Hosp_1', 'Hosp_2', 'Hosp_3', 'Hosp_4', 'Hosp_5',
       'Hosp_6', 'Hosp_7', 'Male', 'Age', '80+', 'Onset Time Known Type_BE',
       'Onset Time Known Type_NK', 'Onset Time Known Type_P',
       '# Comorbidities', '2+ comorbidotes', 'Congestive HF', 'Hypertension',
       'Atrial Fib', 'Diabetes', 'TIA', 'Co-mordity', 'Antiplatelet_0',
       'Antiplatelet_1', 'Antiplatelet_NK', 'Anticoag before stroke_0',
       'Anticoag before stroke_1', 'Anticoag before stroke_NK',
       'Stroke severity group_1. No stroke symtpoms',
       'Stroke severity group_2. Minor', 'Stroke severity group_3. Moderate',
       'Stroke severity group_4. Moderate to severe',
       'Stroke severity group_5. Severe', 'Stroke Type_I', 'Stroke Type_PIH',
       'S2RankinBeforeStroke', 'S2NihssArrival', 'S2NihssArrivalLocQuestions',
       'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
       'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
       'S2NihssArrivalMotorArmLef

In [4]:
data.describe()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
count,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,...,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0,1862.0
mean,0.40333,0.159506,0.14232,0.154672,0.165414,0.055854,0.113319,0.208915,0.515575,74.553706,...,1.11493,1.002148,0.96348,0.96348,0.910849,0.216971,0.610097,0.944146,0.739527,0.566595
std,0.490698,0.366246,0.349472,0.361689,0.371653,0.229701,0.317068,0.406643,0.499892,12.280576,...,0.930527,1.479211,1.441594,1.406501,1.380606,0.522643,0.771932,1.121379,0.731083,0.794
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,76.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,83.0,...,2.0,2.0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,...,3.0,4.0,4.0,4.0,4.0,2.0,2.0,3.0,2.0,2.0


In [5]:
# Narrower copy of the data: the outcome column followed by the hospital,
# demographic, onset-time, comorbidity, prior-medication and
# stroke-severity-group columns (listed one group per line).
little_data = data.loc[:, [
    'Clotbuster given',
    'Hosp_1', 'Hosp_2', 'Hosp_3', 'Hosp_4', 'Hosp_5', 'Hosp_6', 'Hosp_7',
    'Male', 'Age', '80+',
    'Onset Time Known Type_BE', 'Onset Time Known Type_NK',
    'Onset Time Known Type_P',
    '# Comorbidities', '2+ comorbidotes', 'Congestive HF', 'Hypertension',
    'Atrial Fib', 'Diabetes', 'TIA', 'Co-mordity',
    'Antiplatelet_0', 'Antiplatelet_1', 'Antiplatelet_NK',
    'Anticoag before stroke_0', 'Anticoag before stroke_1',
    'Anticoag before stroke_NK',
    'Stroke severity group_1. No stroke symtpoms',
    'Stroke severity group_2. Minor',
    'Stroke severity group_3. Moderate',
    'Stroke severity group_4. Moderate to severe',
    'Stroke severity group_5. Severe',
]]

In [6]:
little_data.head()

Unnamed: 0,Clotbuster given,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,...,Antiplatelet_1,Antiplatelet_NK,Anticoag before stroke_0,Anticoag before stroke_1,Anticoag before stroke_NK,Stroke severity group_1. No stroke symtpoms,Stroke severity group_2. Minor,Stroke severity group_3. Moderate,Stroke severity group_4. Moderate to severe,Stroke severity group_5. Severe
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
little_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1862 entries, 0 to 1861
Data columns (total 33 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Clotbuster given                             1862 non-null   float64
 1   Hosp_1                                       1862 non-null   float64
 2   Hosp_2                                       1862 non-null   float64
 3   Hosp_3                                       1862 non-null   float64
 4   Hosp_4                                       1862 non-null   float64
 5   Hosp_5                                       1862 non-null   float64
 6   Hosp_6                                       1862 non-null   float64
 7   Hosp_7                                       1862 non-null   float64
 8   Male                                         1862 non-null   float64
 9   Age                                          1862 non-null   float64
 10  

In [None]:
# Add code here to predict whether a patient receives clot-busting treatment

In [10]:
# Split the rows by the 'Clotbuster given' flag.
# Note: `mask` is reused, so after this cell it holds the flag == 0 selection.
mask = data['Clotbuster given'].eq(1) # Mask for patients with 'Clotbuster given' equal to 1
treated = data.loc[mask] # keep only the rows selected by the mask

mask = data['Clotbuster given'].eq(0) # Mask for patients with 'Clotbuster given' equal to 0
not_treated = data.loc[mask] # keep only the rows selected by the mask

In [11]:
# Column-wise means of each group, side by side: one row per column of `data`,
# one column per group.
summary = pd.DataFrame({
    'Clotbuster given': treated.mean(),
    'Clotbuster not given': not_treated.mean(),
})

In [12]:
summary

Unnamed: 0,Clotbuster given,Clotbuster not given
Clotbuster given,1.0,0.0
Hosp_1,0.203728,0.129613
Hosp_2,0.122503,0.155716
Hosp_3,0.182423,0.135914
Hosp_4,0.13715,0.184518
Hosp_5,0.067909,0.047705
Hosp_6,0.123835,0.106211
Hosp_7,0.16245,0.240324
Male,0.515313,0.515752
Age,73.303595,75.39874


## Train Test Split

In [13]:
# X = every column of 'data' except the 'Clotbuster given' column
X = data.drop(columns='Clotbuster given')
# y = the 'Clotbuster given' column on its own (the value to predict)
y = data['Clotbuster given']

In [14]:
X.head()

Unnamed: 0,Hosp_1,Hosp_2,Hosp_3,Hosp_4,Hosp_5,Hosp_6,Hosp_7,Male,Age,80+,...,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0,...,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,0.0,...,2.0,0.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,1.0


In [17]:
# Hold out 25% of the rows as a test set.
# random_state is fixed so the same rows land in train/test on every run;
# without it the split (and every result computed from it) changes each time
# the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [18]:
X_train.std(), X_train.mean()

(Hosp_1                                          0.371740
 Hosp_2                                          0.350470
 Hosp_3                                          0.358320
 Hosp_4                                          0.374300
 Hosp_5                                          0.231141
 Hosp_6                                          0.317800
 Hosp_7                                          0.401107
 Male                                            0.499974
 Age                                            12.336375
 80+                                             0.485118
 Onset Time Known Type_BE                        0.444888
 Onset Time Known Type_NK                        0.109718
 Onset Time Known Type_P                         0.450939
 # Comorbidities                                 0.984472
 2+ comorbidotes                                 0.477005
 Congestive HF                                   0.197969
 Hypertension                                    0.499316
 Atrial Fib   

## Standardisation

In [19]:
def standardise_data(X_train, X_test):
    """Standardise training and test features using a scaler fitted on the training set.

    A new StandardScaler is fitted on ``X_train`` only; that same fitted
    scaler is then applied to both ``X_train`` and ``X_test``.

    Returns:
        (train_std, test_std): the transformed training set and the
        transformed test set, in that order.
    """
    scaler = StandardScaler()

    # Fit on the training set and transform it in one call
    scaled_train = scaler.fit_transform(X_train)
    # The test set is only transformed, never used for fitting
    scaled_test = scaler.transform(X_test)

    return scaled_train, scaled_test

In [20]:
# Standardise both feature sets with standardise_data (scaler fitted on X_train only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

In [21]:
X_train_std

array([[-0.44529009, -0.40893041, -0.42197046, ...,  0.03412327,
        -1.01954604, -0.72056702],
       [-0.44529009, -0.40893041, -0.42197046, ..., -0.84802633,
         0.34637169,  1.77239844],
       [-0.44529009, -0.40893041,  2.36983412, ..., -0.84802633,
         1.71228942,  1.77239844],
       ...,
       [-0.44529009, -0.40893041, -0.42197046, ...,  0.03412327,
         0.34637169,  0.52591571],
       [-0.44529009, -0.40893041, -0.42197046, ..., -0.84802633,
        -1.01954604, -0.72056702],
       [-0.44529009, -0.40893041, -0.42197046, ..., -0.84802633,
         0.34637169,  0.52591571]])

## Fit Logistic Regression

In [22]:
# Create a logistic regression model with default settings and fit it to the
# standardised training features and the training labels.
# fit() is the last expression in the cell, so its return value is displayed.
model = LogisticRegression()
model.fit(X_train_std,y_train)

LogisticRegression()

## Predict

In [23]:
# Predict training and test set labels
y_pred_train = model.predict(X_train_std)
y_pred_test = model.predict(X_test_std)

## Calculate accuracy

In [24]:
# Comparing the predicted labels with the actual labels gives one True/False
# value per row (True where the prediction matches the actual value).
# The mean of those Boolean values is the proportion of correct predictions
# (it would be 1.0 if every prediction matched).
# To inspect the individual comparisons, display `y_pred_train == y_train`.
accuracy_train = (y_pred_train == y_train).mean()
accuracy_test = (y_pred_test == y_test).mean()

print('Accuracy of predicting training data =', accuracy_train)
print('Accuracy of predicting test data =', accuracy_test)

1430     True
1034     True
1241     True
1820     True
33       True
        ...  
945      True
97       True
1387     True
1577     True
1089    False
Name: Clotbuster given, Length: 1396, dtype: bool
Accuracy of predicting training data = 0.8230659025787965
Accuracy of predicting test data = 0.7832618025751072


## Examining Weights

In [25]:
# Take the first row of the fitted model's coefficient array and display it
co_eff = model.coef_[0]
co_eff

array([ 0.10417836,  0.07899944, -0.01360597, -0.15323323, -0.04174171,
        0.12874606, -0.08838254,  0.12366111, -0.29178961, -0.09188567,
       -0.34043177,  0.10868063,  0.30942029, -0.02860158, -0.08986108,
        0.06812322,  0.25949458, -0.23347166,  0.02541468, -0.18792801,
       -0.08417152,  0.11485553, -0.0539418 , -0.0592824 ,  0.27964211,
       -0.21879714, -0.04740841, -0.22293997, -0.58525907,  0.59507239,
        0.17543742, -0.20361421,  1.17742134, -1.17742134, -0.47514405,
       -0.56643419,  0.34146365, -0.17613591,  0.15706388,  0.09993351,
        0.23980595,  0.13462278,  0.07744585,  0.33878971,  0.10095579,
       -0.08266187,  0.0999766 ,  0.55172367,  0.16206527,  0.21689433])

In [26]:
# One row per feature: its name (column names of X), its coefficient and the
# coefficient's absolute value; rows ordered from largest to smallest
# absolute coefficient.
co_eff_df = pd.DataFrame({
    'feature': list(X),
    'co_eff': co_eff,
    'abs_co_eff': np.abs(co_eff),
}).sort_values(by='abs_co_eff', ascending=False)

In [29]:
data[['Stroke Type_I', 'Stroke Type_PIH']]

Unnamed: 0,Stroke Type_I,Stroke Type_PIH
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
1857,0.0,1.0
1858,1.0,0.0
1859,1.0,0.0
1860,1.0,0.0


In [27]:
co_eff_df

Unnamed: 0,feature,co_eff,abs_co_eff
32,Stroke Type_I,1.177421,1.177421
33,Stroke Type_PIH,-1.177421,1.177421
29,Stroke severity group_3. Moderate,0.595072,0.595072
28,Stroke severity group_2. Minor,-0.585259,0.585259
35,S2NihssArrival,-0.566434,0.566434
47,S2NihssArrivalBestLanguage,0.551724,0.551724
34,S2RankinBeforeStroke,-0.475144,0.475144
36,S2NihssArrivalLocQuestions,0.341464,0.341464
10,Onset Time Known Type_BE,-0.340432,0.340432
43,S2NihssArrivalMotorLegLeft,0.33879,0.33879
