# In this project we will build a Machine Learning model to predict whether an individual will have a stroke.  The data used in this project can be found on Kaggle at the following link: https://www.kaggle.com/asaumya/healthcare-data#train_2v.csv

# In this notebook, we build our Machine Learning model.  In our initial data analysis, we noticed that the individuals who had a stroke make up approximately 1.8% of the data.  We will use the Synthetic Minority Oversampling Technique (SMOTE) to account for this.

# To view our initial data analysis, please see the notebook titled "Data_Analysis."

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

# Define file path to our data (relative to the notebook's working directory)
# NOTE: the variable name below is spelled "stoke" (sic); it is only a local spelling slip
stoke_data_relevant_features_and_label_file_path = os.path.join("..", "Data", "stroke_data_relevant_features_and_label.csv")

# Create dataframe from local csv file
stroke_data_relevant_features_and_label = pd.read_csv(stoke_data_relevant_features_and_label_file_path)

# Preview the first rows of the dataframe
stroke_data_relevant_features_and_label.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,stroke
0,0,0,0,children,,3.0,95.12,0
1,1,0,1,employer_employed,never smoked,58.0,87.96,0
2,0,0,0,employer_employed,,8.0,110.89,0
3,0,0,1,employer_employed,smokes,70.0,69.04,0
4,0,0,0,employer_employed,,14.0,161.28,0


In [3]:
# One-hot encode the categorical features.
# Columns not listed here (age, average_glucose_level and the stroke label)
# are carried over unchanged.
categorical_feature_names = [
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "smoking_status",
]
machine_ready_stroke_data = pd.get_dummies(
    stroke_data_relevant_features_and_label,
    columns=categorical_feature_names,
)

# Preview the encoded dataframe
machine_ready_stroke_data.head()

Unnamed: 0,age,average_glucose_level,stroke,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_0,ever_married_1,work_type_Self-employed,work_type_children,work_type_employer_employed,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3.0,95.12,0,1,0,1,0,1,0,0,1,0,0,0,0
1,58.0,87.96,0,0,1,1,0,0,1,0,0,1,0,1,0
2,8.0,110.89,0,1,0,1,0,1,0,0,0,1,0,0,0
3,70.0,69.04,0,1,0,1,0,0,1,0,0,1,0,0,1
4,14.0,161.28,0,1,0,1,0,1,0,0,0,1,0,0,0


In [4]:
# Import Machine Learning algorithm LogisticRegression
from sklearn.linear_model import LogisticRegression

# Import other essential Machine Learning functions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import SMOTE to handle the imbalanced data issue
from imblearn.over_sampling import SMOTE

In [5]:
# Build the model inputs from the encoded dataframe:
#   X - every column except the "stroke" label, as a 2-D array
#   y - the "stroke" label as a single-column 2-D array
X = np.array(machine_ready_stroke_data.drop(columns=["stroke"]))
y = np.array(machine_ready_stroke_data["stroke"]).reshape(-1, 1)

### In the following section, we will run a for-loop to examine what order of SMOTE, split, scale (<em>SSS order</em>) yields the best results.  We will ignore any SSS order that scales before it splits, as this could bias the model.

#### Logistic Regression

In [6]:
import warnings
# Silence every warning for the rest of the session
# NOTE(review): this is a blanket filter, so deprecation warnings and
# data-conversion warnings raised by the libraries used below are hidden too
warnings.filterwarnings('ignore')

# We want to determine which order of SMOTE, split, scale (SSS order) is best for this model
# We will use the mean and stdev functions of the statistics library
# to find out which SSS order yields the highest average accuracy,
# and which order is the most stable (lowest standard deviation)
from statistics import mean, stdev

# Define variables holding the value for each argument,
# in order to easily change it in multiple places

## SMOTE() parameters
sampling_strategy_argument = 0.2
k_neighbors_argument = 2

## train_test_split() parameters
test_size_argument = 0.15
# A fixed random_state makes train_test_split return the same split on every call
random_state_argument = 50

## LogisticRegression() parameters
solver_argument = "liblinear"
# NOTE(review): the final model cell further down hard-codes C=.00005,
# which differs from the value compared in this experiment
C_argument = 0.00001

# For every iteration in the loop,
# we will append the accuracy of the current SSS order to its own distinct list
# After the loop has finished, we will calculate the average of each list
# The list with the highest average we will call "the most accurate (on average)"
# We will also calculate the standard deviation of each list
# The list with the lowest standard deviation we will call "the most stable"
SSS_order_1_list = []
SSS_order_2_list = []
SSS_order_3_list = []

def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Train a fresh Logistic Regression classifier and return its accuracy.

    The classifier is built with ``solver_argument`` and ``C_argument``,
    fitted on (X_fit, y_fit) and scored on (X_eval, y_eval).
    """
    model = LogisticRegression(solver=solver_argument, C=C_argument)
    model.fit(X_fit, y_fit)
    return model.score(X_eval, y_eval)


# Orders 2 and 3 both begin by splitting the original data. Because
# random_state is fixed, that split is identical on every iteration and for
# both orders, so it is computed once here instead of twice per iteration.
# SMOTE is not seeded, so it is the source of run-to-run variation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_argument, random_state=random_state_argument)

for i in range(3):

    # Print the current iteration of the loop,
    # in case we use a large number of iterations
    # (the for statement advances i by itself; no manual increment is needed)
    print(f"Iteration {i+1}", "\n")

    ################################################################################################

    # Print the SSS order so we can analyze which one is "best"
    print("1. SMOTE, split, scale")

    # Use SMOTE to handle class imbalance
    # (fit_resample replaces the deprecated/removed fit_sample alias; it
    # returns 1-D labels, which is the shape the classifier expects)
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_SMOTE, y_SMOTE = smote.fit_resample(X, y.ravel())

    # Split the resampled data into training and testing sets
    # NOTE: since SMOTE ran first, this test set contains synthetic samples and
    # is a different test set from the one used by orders 2 and 3
    X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, test_size=test_size_argument, random_state=random_state_argument)

    # Create scaler for features (fitted on the training portion only)
    X_scaler = StandardScaler().fit(X_SMOTE_train)

    # Scale features
    X_SMOTE_train_scaled = X_scaler.transform(X_SMOTE_train)
    X_SMOTE_test_scaled = X_scaler.transform(X_SMOTE_test)

    # Create, fit, and score the Logistic Regression classifier
    score = _fit_and_score(X_SMOTE_train_scaled, y_SMOTE_train, X_SMOTE_test_scaled, y_SMOTE_test)

    # Append the score to the SSS_order_1 list,
    # so that we can determine the average accuracy and standard deviation of SSS order 1
    SSS_order_1_list.append(score)

    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")

    ################################################################################################

    # Print the SSS order so we can analyze which one is "best"
    print("2. split, SMOTE, scale")

    # Use SMOTE to handle class imbalance (training data only)
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train.ravel())

    # Create scaler for features (fitted on the resampled training data)
    X_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)

    # Scale features
    X_train_SMOTE_scaled = X_SMOTE_scaler.transform(X_train_SMOTE)
    X_test_scaled = X_SMOTE_scaler.transform(X_test)

    # Create, fit, and score the Logistic Regression classifier
    score = _fit_and_score(X_train_SMOTE_scaled, y_train_SMOTE, X_test_scaled, y_test.ravel())

    # Append the score to the SSS_order_2 list,
    # so that we can determine the average accuracy and standard deviation of SSS order 2
    SSS_order_2_list.append(score)

    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")

    ################################################################################################

    # Print the SSS order so we can analyze which one is "best"
    print("3. split, scale, SMOTE")

    # Create scaler for features (fitted on the original training data)
    X_scaler = StandardScaler().fit(X_train)

    # Scale features
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Use SMOTE to handle class imbalance (on the already-scaled training data)
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_resample(X_train_scaled, y_train.ravel())

    # Create, fit, and score the Logistic Regression classifier
    score = _fit_and_score(X_train_scaled_SMOTE, y_train_SMOTE, X_test_scaled, y_test.ravel())

    # Append the score to the SSS_order_3 list,
    # so that we can determine the average accuracy and standard deviation of SSS order 3
    SSS_order_3_list.append(score)

    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")

    ################################################################################################

    # Print a long line with blank lines above and below,
    # to easily see where one iteration of the loop ends, and the next starts
    print()
    print(100*"-")
    print()

####################################################################################################

def _order_of(values, pick):
    """Return ``(order_number, value)`` for the entry of *values* chosen by *pick*.

    *pick* is ``max`` or ``min``. Order numbers are 1-based. When several
    orders tie, the lowest-numbered one is reported.
    """
    chosen = pick(values)
    return values.index(chosen) + 1, chosen


# Find the average accuracy of each SSS order,
# and add each average to a list
average_1 = mean(SSS_order_1_list)
average_2 = mean(SSS_order_2_list)
average_3 = mean(SSS_order_3_list)
averages_list = [average_1, average_2, average_3]

# Determine which SSS order has the highest average accuracy
most_accurate_order, average_accuracy_greatest = _order_of(averages_list, max)

# Print a message showing which SSS order has the highest average accuracy, along with its accuracy
print(f"The most accurate order (highest average accuracy) is order {most_accurate_order}, with an average accuracy of {average_accuracy_greatest}")

####################################################################################################

# Find the standard deviation of the accuracy of each SSS order,
# and add each standard deviation to a list
# (stdev needs at least two scores per list, i.e. at least two loop iterations)
standard_deviation_1 = stdev(SSS_order_1_list)
standard_deviation_2 = stdev(SSS_order_2_list)
standard_deviation_3 = stdev(SSS_order_3_list)
standard_deviations_list = [standard_deviation_1, standard_deviation_2, standard_deviation_3]

# Determine which SSS order has the lowest standard deviation
most_stable_order, lowest_standard_deviation = _order_of(standard_deviations_list, min)

# Print a message showing which SSS order has the lowest standard deviation, along with its value
print(f"The most stable order (lowest standard deviation) is order {most_stable_order}, with a standard deviation of {lowest_standard_deviation}")

####################################################################################################

# Print a blank line to separate the lines showing us the "best" orders from the lines showing us the "worst" orders
print()

# Determine which SSS order has the lowest average accuracy
least_accurate_order, average_accuracy_least = _order_of(averages_list, min)

# Print a message showing which SSS order has the lowest average accuracy, along with its accuracy
print(f"The least accurate order (least average accuracy) is order {least_accurate_order}, with an average accuracy of {average_accuracy_least}")

####################################################################################################

# Determine which SSS order has the highest standard deviation
least_stable_order, greatest_standard_deviation = _order_of(standard_deviations_list, max)

# Print a message showing which SSS order has the highest standard deviation, along with its value
print(f"The least stable order (highest standard deviation) is order {least_stable_order}, with a standard deviation of {greatest_standard_deviation}")
    

Iteration 1 

1. SMOTE, split, scale
Accuracy: 0.8131925433450659
2. split, SMOTE, scale
Accuracy: 0.8569892473118279
3. split, scale, SMOTE
Accuracy: 0.8841781874039939

----------------------------------------------------------------------------------------------------

Iteration 2 

1. SMOTE, split, scale
Accuracy: 0.8091513492373875
2. split, SMOTE, scale
Accuracy: 0.8597542242703533
3. split, scale, SMOTE
Accuracy: 0.8841781874039939

----------------------------------------------------------------------------------------------------

Iteration 3 

1. SMOTE, split, scale
Accuracy: 0.8146265154477904
2. split, SMOTE, scale
Accuracy: 0.8563748079877113
3. split, scale, SMOTE
Accuracy: 0.8811059907834101

----------------------------------------------------------------------------------------------------

The most accurate order (highest average accuracy) is order 3, with an average accuracy of 0.8831541218637993
The most stable order (lowest standard deviation) is order 3, with a st

#### Each time you run the loop above, you could get different results.  We notice that even though SSS order 3 often yields the highest average accuracy, it can also yield the least stability (highest standard deviation).  We also notice that even though SSS order 1 sometimes yields the most stability (lowest standard deviation), it can also yield the lowest average accuracy.  We therefore opt to employ SSS order 2, or "split, SMOTE, scale," which will sometimes yield the highest stability (lowest standard deviation).

In [7]:
# Build the final model using SSS order 2: split, SMOTE, scale

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=50)

# Use SMOTE to handle class imbalance (training data only)
# fit_resample replaces the deprecated/removed fit_sample alias
# NOTE(review): SMOTE is not given a random_state here, so the resampled
# training set, and therefore the fitted model, can differ between runs
smote = SMOTE(sampling_strategy=0.2, k_neighbors=2)
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train.ravel())
y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

# Create scaler for features (fitted on the resampled training data only)
X_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)

# Scale features
X_train_SMOTE_scaled = X_SMOTE_scaler.transform(X_train_SMOTE)
X_test_scaled = X_SMOTE_scaler.transform(X_test)

# Create, fit, and score the Logistic Regression classifier
# When we tested the Machine Learning model,
# we saw that it gave too much weight to the "age" feature
# No matter what value the other features had,
# it seemed to return "1" if the age were above a certain value,
# and "0" if the age were below a certain value
# We therefore use a very low C value to stop the coefficient
# of the age feature from becoming too large
# NOTE(review): C=.00005 here differs from the C_argument value (0.00001)
# used in the SSS order comparison above
classifier = LogisticRegression(solver="liblinear", penalty="l2" , C=.00005)
# The classifier expects a 1-D label array, so flatten the column vector
classifier = classifier.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE.ravel())
score = classifier.score(X_test_scaled, y_test)

# Print the accuracy
print(score)

0.9086021505376344


In [8]:
# Display the coefficients learned by the fitted Logistic Regression classifier
classifier.coef_

array([[ 0.16943796,  0.06752388, -0.05198894,  0.05198894, -0.07246565,
         0.07246565, -0.0394961 ,  0.0394961 ,  0.04547211, -0.02821631,
        -0.01745703, -0.00862261, -0.00256948,  0.02868548]])

In [9]:
# Export our final model and scaler
# Older scikit-learn releases bundled joblib as sklearn.externals.joblib;
# current releases removed that copy and depend on the standalone joblib
# package instead. Try the standalone package first, then fall back.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Export the very scaler that transformed the training and test features for
# the final model, rather than fitting a second scaler on the same data
standard_scaler = X_SMOTE_scaler

standard_scaler_export_file_path = os.path.join("..", "web_development", "standard_scaler.model")
joblib.dump(standard_scaler, standard_scaler_export_file_path)

classifier_export_file_path = os.path.join("..", "web_development", "stroke_predictor.model")
joblib.dump(classifier, classifier_export_file_path)

['..\\web_development\\stroke_predictor.model']