In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
import os

spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [3]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [4]:
# Read in the healthcare-dataset-stroke-data.csv via AWS into Spark DataFrame
from pyspark import SparkFiles
url = "https://project4-06052023.s3.us-east-2.amazonaws.com/healthcare-dataset-stroke-data.csv"
spark.sparkContext.addFile(url)
stroke_data = spark.read.csv(SparkFiles.get("healthcare-dataset-stroke-data.csv"), sep=",", header=True, inferSchema=True)
stroke_data.show()

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# **Preprocessing**

In [5]:
# Print Spark dataframe schema (Note: all schema except 'bmi' inferred correctly)
stroke_data.printSchema

<bound method DataFrame.printSchema of DataFrame[id: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: string, smoking_status: string, stroke: int]>

In [6]:
# Convert Spark dataframe to Pandas df
stroke_data_df = stroke_data.toPandas()

In [7]:
# Drop the non-beneficial ID column.
stroke_df = stroke_data_df.drop(columns={'id'})
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int32  
 3   heart_disease      5110 non-null   int32  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   object 
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int32  
dtypes: float64(2), int32(3), object(6)
memory usage: 379.4+ KB


In [8]:
# Convert 'bmi' to float (Note: 'coerce' converts 'N/A' values to NaN)
stroke_df['bmi'] = pd.to_numeric(stroke_df['bmi'], errors ='coerce')
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int32  
 3   heart_disease      5110 non-null   int32  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int32  
dtypes: float64(3), int32(3), object(5)
memory usage: 379.4+ KB


In [9]:
# Drop rows containing NaN
stroke_df = stroke_df.dropna()
stroke_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_stroke_data = pd.get_dummies(stroke_df)
encoded_stroke_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [11]:
# Split our preprocessed data into our features and target arrays
y = encoded_stroke_data["stroke"]
X = encoded_stroke_data.drop(["stroke"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# Check the balance of our target values
y.value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# **Round 1: Initial Testing**
Preliminary testing of a variety of machine learning models used for predicting outcomes of our labeled categorical data. These first attempts utilize our encoded data with no additional modifications.

### **Logistic Regression Model 1**
Attempt 1 using orignial data.

In [22]:
# Fit the model
# Instantiate the Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=200)

# Fit the model using training data
model = logistic_regression_model.fit(X_train_scaled, y_train)

In [23]:
# Make a prediction using the testing data
LR_pred = logistic_regression_model.predict(X_test_scaled)

In [24]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, LR_pred)

0.5

In [25]:
# Generate a confusion matrix for the model
matrix = confusion_matrix(y_test, LR_pred)
print(matrix)

[[1174    0]
 [  54    0]]


In [26]:
# Print the classification report for the model
report = classification_report(y_test, LR_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1174
           1       0.00      0.00      0.00        54

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.96      0.93      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Logistic Regression Model**: Due to our unbalanced data set this model was unable to identify our positive targets.

### **Neural Network Model 1**
Attempt 1 using original data.

In [27]:
# Define the model
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 9
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 198       
                                                                 
 dense_1 (Dense)             (None, 9)                 90        
                                                                 
 dense_2 (Dense)             (None, 9)                 90        
                                                                 
 dense_3 (Dense)             (None, 1)                 10        
                                                                 
Total params: 388
Trainable params: 388
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [29]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.18544407188892365, Accuracy: 0.9560260772705078


In [31]:
# Export our model to HDF5 file
# nn.save('Models/stroke_model_NN_1.h5')

In [32]:
# Compare predictions to actual outcomes
y_values = nn.predict(X_test_scaled)
y_predict = y_values.round()

df = pd.DataFrame(list(zip(y_test, y_predict)), columns =['true_value', 'predictions'])
df = df.round()
df



Unnamed: 0,true_value,predictions
0,0,[0.0]
1,0,[0.0]
2,0,[0.0]
3,0,[0.0]
4,0,[0.0]
...,...,...
1223,0,[0.0]
1224,0,[0.0]
1225,0,[0.0]
1226,0,[0.0]


In [33]:
# Print classification report
nn_report = classification_report(y_test, y_predict)
print(nn_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1174
           1       0.00      0.00      0.00        54

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.96      0.93      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Neural Network Model:** This model failed to recognize any of our stroke patients.

### **K Nearest Neighbors 1**
Attempt 1 using original data

In [34]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [35]:
# Train the model
model.fit(X_train_scaled, y_train)

In [36]:
# Create predictions
KN_pred = model.predict(X_test_scaled)

In [37]:
# Print confusion matrix
confusion_matrix(KN_pred,y_test)

array([[1164,   53],
       [  10,    1]])

In [38]:
# Print classification report
print(classification_report(KN_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1217
           1       0.02      0.09      0.03        11

    accuracy                           0.95      1228
   macro avg       0.51      0.52      0.50      1228
weighted avg       0.98      0.95      0.97      1228



**K Nearest Neighbors:** This model also struggled to recognize our positive targets due to our unbalanced dataset, however it did attempt to classify a handful of data points as positive and correctly identified 4 unlike the logistic regression model.

### **Random Forest 1**
Attempt 1 using orignial data

In [39]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500)

In [40]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [41]:
# Making predictions using the testing data
rf_pred = rf_model.predict(X_test_scaled)

In [42]:
# Print confusion matrix
confusion_matrix(rf_pred,y_test)

array([[1172,   54],
       [   2,    0]])

In [43]:
# Print classification report
print(classification_report(rf_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1226
           1       0.00      0.00      0.00         2

    accuracy                           0.95      1228
   macro avg       0.50      0.48      0.49      1228
weighted avg       1.00      0.95      0.98      1228



**Random Forest:** This model also struggled due to the unbalanced data, performing slighly worse than the K Nearest Neighbors model.

# **Optimization**

## **Round 2: Resampling**
In our intial round of testing we found our unbalanced dataset, which contained only `4.26%` positive cases, was insufficient to train an accurate machine learning model. The second round of testing will utilize the `RandomOverSampler` from `imblearn` correct for this imbalance.


### **Resample Data with RandomOverSampler**

In [14]:
!pip install imblearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [15]:
# Instantiate the random oversampler model
ros = RandomOverSampler()

# Fit the original training data to the random_oversampler model
X_R, y_R = ros.fit_resample(X_train, y_train)

In [16]:
# Count the distinct values of the resampled labels data
y_R.value_counts()

1    3526
0    3526
Name: stroke, dtype: int64

In [17]:
X_train_scaled_R = X_scaler.transform(X_R)

### **Neural Network Model 2**
Attempt 2 using RandomOverSampler and the same number of neurons/layers as the initial model.

In [44]:
# Neural network model with RandomOverSampler
# Define the model
number_input_features = len(X_train_scaled_R[0])
hidden_nodes_layer1 = 9
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 9)                 198       
                                                                 
 dense_5 (Dense)             (None, 9)                 90        
                                                                 
 dense_6 (Dense)             (None, 9)                 90        
                                                                 
 dense_7 (Dense)             (None, 1)                 10        
                                                                 
Total params: 388
Trainable params: 388
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [46]:
# Train the model
fit_model = nn.fit(X_train_scaled_R,y_R,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [47]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.4179 - accuracy: 0.8347 - 201ms/epoch - 5ms/step
Loss: 0.4179333746433258, Accuracy: 0.8346905708312988


In [48]:
# Compare predictions to actual outcomes
y_values2 = nn.predict(X_test_scaled)
y_predict2 = y_values.round()



In [49]:
# Print classification report
nn_report = classification_report(y_test, y_predict2)
print(nn_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1174
           1       0.00      0.00      0.00        54

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.96      0.93      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# Export our model to HDF5 file
# nn.save('Models/stroke_model_NN_2.h5')

**Neural Network Model 2 with RandomOverSampler:** While the overall accuracy score of the model went down, we can infer that the model is no longer classifying every point as `0` or no stroke which is an improvement on the initial model.

### **Logistic Regression Model 2**
Attempt 2 using RandomOverSampler

In [18]:
# Logistic Regression model with RandomOverSampler
# Instantiate the Logistic Regression model
ros_logistic_regression_model = LogisticRegression(max_iter=200)

# Fit the model using the resampled training data
ros_model = ros_logistic_regression_model.fit(X_train_scaled_R, y_R)

# Make a prediction using the testing data
ros_LR_pred = ros_logistic_regression_model.predict(X_test_scaled)

In [19]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, ros_LR_pred)

0.7547321597577135

In [20]:
# Generate a confusion matrix for the model
ros_matrix = confusion_matrix(y_test, ros_LR_pred)
print(ros_matrix)

[[859 315]
 [ 12  42]]


In [21]:
# Print the classification report for the model
ros_report = classification_report(y_test, ros_LR_pred)
print(ros_report)

              precision    recall  f1-score   support

           0       0.99      0.73      0.84      1174
           1       0.12      0.78      0.20        54

    accuracy                           0.73      1228
   macro avg       0.55      0.75      0.52      1228
weighted avg       0.95      0.73      0.81      1228



**Logistic Regression Model 2 with RandomOverSampler:** Previously this model classified all data points as `0`. By oversampling our data the model is now attempting to differentiate between the `0`s and `1`s, with `76%` of the stoke values identified. correctly.

### **Neural Network Model 3**
Attempt 3 using Keras Tuner and RandomOverSampler

In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [None]:
import keras_tuner as kt

In [None]:
# Set value for input_dim
number_input_features = len(X_train_scaled_R[0])

# Create method for new Sequential model with hyperparameter option
def create_model (hp):
  nn_model = tf.keras.models.Sequential()

  # Activation function options
  activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

  # Number of neurons in first layer
  nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units', min_value=1, max_value=10, step=2),
                                     activation=activation, input_dim=number_input_features))
  
  # Number of hidden layers and neurons per layer
  for i in range(hp.Int('num_layers', 1, 3)):
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i), min_value=1, max_value=10, step=2), activation=activation))

  nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  # Compile the model
  nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

  return nn_model


In [None]:
# Create tuner with keras_tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
# Run tuner search for best hyperparameters
tuner.search(X_train_scaled_R,y_R,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 14s]
val_accuracy: 0.7557003498077393

Best val_accuracy So Far: 0.9600977301597595
Total elapsed time: 00h 08m 15s


In [None]:
# Find most accurate model
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 7,
 'num_layers': 3,
 'units_0': 7,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0,
 'units_1': 1,
 'units_2': 1}

In [None]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.6237 - accuracy: 0.9601 - 467ms/epoch - 12ms/step
Loss: 0.6237127780914307, Accuracy: 0.9600977301597595


In [None]:
# Compare predictions to actual outcomes
y_values3 = nn.predict(X_test_scaled)
y_predict3 = y_values3.round()



In [None]:
# Print classification report
nn_report = classification_report(y_test, y_predict3)
print(nn_report)

              precision    recall  f1-score   support

           0       0.97      0.80      0.88      1179
           1       0.09      0.47      0.15        49

    accuracy                           0.79      1228
   macro avg       0.53      0.64      0.52      1228
weighted avg       0.94      0.79      0.85      1228



In [None]:
# Export our model to HDF5 file
# nn.save('Models/stroke_model_NN_3.h5')

**Neural Network Model with Keras Tuner and RandomOverSampling:** While the accuracy score shows that our model successfully classifies `96%` of the data, much of this is caused skew in our dataset. The resampling and keras tuner did improve the model's ability to identify stroke patients, but the classification report shows that this model is still less accurate than the logistic regression model.

### **K Nearest Neighbors 2**
Attempt 2 using RandomOverSampler

In [51]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [52]:
# Train the model
model.fit(X_train_scaled_R, y_R)

In [53]:
# Create predictions
ros_KN_pred = model.predict(X_test_scaled)

In [54]:
# Print confusion matrix
confusion_matrix(ros_KN_pred,y_test)

array([[1091,   50],
       [  83,    4]])

In [55]:
# Print classification report
print(classification_report(ros_KN_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      1141
           1       0.07      0.05      0.06        87

    accuracy                           0.89      1228
   macro avg       0.50      0.50      0.50      1228
weighted avg       0.87      0.89      0.88      1228



**K Nearest Neighbors with RandomOverSampler:** The oversampling did little to improve this model's classification of true positive outcomes, while increasing the false positives.

### **Random Forest 2**
Attempt 2 using RandomOverSampler

In [56]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500)

In [57]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled_R, y_R)

In [58]:
# Making predictions using the testing data
ros_rf_pred = rf_model.predict(X_test_scaled)

In [59]:
# Print confusion matrix
confusion_matrix(ros_rf_pred,y_test)

array([[1162,   54],
       [  12,    0]])

In [60]:
# Print classification report
print(classification_report(ros_rf_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1216
           1       0.00      0.00      0.00        12

    accuracy                           0.95      1228
   macro avg       0.49      0.48      0.49      1228
weighted avg       0.98      0.95      0.96      1228



**Random Forest 2 with RandomOverSampler: ** The RandomOverSampler did little to improve this model's ability to classify positive outcomes, similiar to K Nearest Neighbors.