In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
import os

spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Waiting for headers] [1 I0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Waiting for headers] [Wai                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
                                                                               0% [Waiting for headers] [Waiting for headers] [Waiting for headers]                                                                    Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers]                                                                    Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InR

In [4]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [5]:
# Read in the healthcare-dataset-stroke-data.csv via AWS into Spark DataFrame
from pyspark import SparkFiles
url = "https://project4-06052023.s3.us-east-2.amazonaws.com/healthcare-dataset-stroke-data.csv"
spark.sparkContext.addFile(url)
stroke_data = spark.read.csv(SparkFiles.get("healthcare-dataset-stroke-data.csv"), sep=",", header=True, inferSchema=True)
stroke_data.show()

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Preprocessing**

In [6]:
# Print Spark dataframe schema (Note: all schema except 'bmi' inferred correctly)
stroke_data.printSchema

<bound method DataFrame.printSchema of DataFrame[id: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: string, smoking_status: string, stroke: int]>

In [16]:
# Convert Spark dataframe to Pandas df
stroke_data_df = stroke_data.toPandas()

In [17]:
# Drop the non-beneficial ID column.
stroke_df = stroke_data_df.drop(columns={'id'})
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int32  
 3   heart_disease      5110 non-null   int32  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   object 
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int32  
dtypes: float64(2), int32(3), object(6)
memory usage: 379.4+ KB


In [18]:
# Convert 'bmi' to float (Note: 'coerce' converts 'N/A' values to NaN)
stroke_df['bmi'] = pd.to_numeric(stroke_df['bmi'], errors ='coerce')
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int32  
 3   heart_disease      5110 non-null   int32  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int32  
dtypes: float64(3), int32(3), object(5)
memory usage: 379.4+ KB


In [19]:
# Drop rows containing NaN
stroke_df = stroke_df.dropna()
stroke_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [20]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_stroke_data = pd.get_dummies(stroke_df)
encoded_stroke_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [21]:
# Split our preprocessed data into our features and target arrays
y = encoded_stroke_data["stroke"]
X = encoded_stroke_data.drop(["stroke"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
# Check the balance of our target values
y.value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [23]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# **Round 1: Initial Testing**
Preliminary testing of a variety of machine learning models used for predicting outcomes of our labeled categorical data. These first attempts utilize our encoded data with no additional modifications.

### **Logistic Regression Model 1**
Attempt 1 using orignial data.

In [24]:
# Fit the model
# Instantiate the Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=200)

# Fit the model using training data
model = logistic_regression_model.fit(X_train_scaled, y_train)

In [25]:
# Make a prediction using the testing data
LR_pred = logistic_regression_model.predict(X_test_scaled)

In [26]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, LR_pred)

0.5

In [27]:
# Generate a confusion matrix for the model
matrix = confusion_matrix(y_test, LR_pred)
print(matrix)

[[1179    0]
 [  49    0]]


In [28]:
# Print the classification report for the model
report = classification_report(y_test, LR_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1179
           1       0.00      0.00      0.00        49

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.92      0.96      0.94      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Logistic Regression Model**: Due to our unbalanced data set this model was unable to identify our positive targets.

### **Neural Network Model 1**
Attempt 1 using original data.

In [29]:
# Define the model
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 9
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 198       
                                                                 
 dense_1 (Dense)             (None, 9)                 90        
                                                                 
 dense_2 (Dense)             (None, 9)                 90        
                                                                 
 dense_3 (Dense)             (None, 1)                 10        
                                                                 
Total params: 388
Trainable params: 388
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1654 - accuracy: 0.9585 - 254ms/epoch - 7ms/step
Loss: 0.16544944047927856, Accuracy: 0.958469033241272


In [None]:
# Export our model to HDF5 file
# nn.save('Models/stroke_model_NN_1.h5')

### **K Nearest Neighbors 1**
Attempt 1 using original data

In [None]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the model
model.fit(X_train_scaled, y_train)

In [None]:
# Create predictions
KN_pred = model.predict(X_test_scaled)

In [None]:
# Print confusion matrix
confusion_matrix(KN_pred,y_test)

array([[1168,   47],
       [   9,    4]])

In [None]:
# Print classification report
print(classification_report(KN_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1215
           1       0.08      0.31      0.12        13

    accuracy                           0.95      1228
   macro avg       0.54      0.63      0.55      1228
weighted avg       0.98      0.95      0.97      1228



**K Nearest Neighbors:** This model also struggled to recognize our positive targets due to our unbalanced dataset, however it did attempt to classify a handful of data points as positive and correctly identified 4 unlike the logistic regression model.

### **Random Forest 1**
Attempt 1 using orignial data

In [35]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500)

In [36]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [37]:
# Making predictions using the testing data
rf_pred = rf_model.predict(X_test_scaled)

In [38]:
# Print confusion matrix
confusion_matrix(rf_pred,y_test)

array([[1177,   48],
       [   2,    1]])

In [39]:
# Print classification report
print(classification_report(rf_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1225
           1       0.02      0.33      0.04         3

    accuracy                           0.96      1228
   macro avg       0.51      0.65      0.51      1228
weighted avg       1.00      0.96      0.98      1228



**Random Forest:** This model also struggled due to the unbalanced data, performing slighly worse than the K Nearest Neighbors model.

# **Optimization**

## **Round 2: Resampling**
In our intial round of testing we found our unbalanced dataset, which contained only `4.26%` positive cases, was insufficient to train an accurate machine learning model. The second round of testing will utilize the `RandomOverSampler` from `imblearn` correct for this imbalance.


### **Resample Data with RandomOverSampler**

In [42]:
!pip install imblearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [43]:
# Instantiate the random oversampler model
ros = RandomOverSampler()

# Fit the original training data to the random_oversampler model
X_R, y_R = ros.fit_resample(X_train, y_train)

In [44]:
# Count the distinct values of the resampled labels data
y_R.value_counts()

0    3521
1    3521
Name: stroke, dtype: int64

In [45]:
X_train_scaled_R = X_scaler.transform(X_R)

### **Neural Network Model 2**
Attempt 2 using RandomOverSampler and the same number of neurons/layers as the initial model.

In [None]:
# Neural network model with RandomOverSampler
# Define the model
number_input_features = len(X_train_scaled_R[0])
hidden_nodes_layer1 = 9
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 9)                 198       
                                                                 
 dense_5 (Dense)             (None, 9)                 90        
                                                                 
 dense_6 (Dense)             (None, 9)                 90        
                                                                 
 dense_7 (Dense)             (None, 1)                 10        
                                                                 
Total params: 388
Trainable params: 388
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn.fit(X_train_scaled_R,y_R,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.4671 - accuracy: 0.8143 - 338ms/epoch - 9ms/step
Loss: 0.4670688509941101, Accuracy: 0.8143322467803955


In [None]:
# Export our model to HDF5 file
# nn.save('Models/stroke_model_NN_2.h5')

**Neural Network Model 2 with RandomOverSampler:** While the overall accuracy score of the model went down, we can infer that the model is no longer classifying every point as `0` or no stroke which is an improvement on the initial model.

### **Logistic Regression Model**
Attempt 2 using RandomOverSampler

In [None]:
# Logistic Regression model with RandomOverSampler
# Instantiate the Logistic Regression model
ros_logistic_regression_model = LogisticRegression(max_iter=200)

# Fit the model using the resampled training data
ros_model = logistic_regression_model.fit(X_train_scaled_R, y_R)

# Make a prediction using the testing data
ros_LR_pred = logistic_regression_model.predict(X_test_scaled)

In [None]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, ros_LR_pred)

0.7595831875655954

In [None]:
# Generate a confusion matrix for the model
ros_matrix = confusion_matrix(y_test, ros_LR_pred)
print(ros_matrix)

[[888 289]
 [ 12  39]]


In [None]:
# Print the classification report for the model
ros_report = classification_report(y_test, ros_LR_pred)
print(ros_report)

              precision    recall  f1-score   support

           0       0.99      0.75      0.86      1177
           1       0.12      0.76      0.21        51

    accuracy                           0.75      1228
   macro avg       0.55      0.76      0.53      1228
weighted avg       0.95      0.75      0.83      1228



**Logistic Regression Model 2 with RandomOverSampler:** Previously this model classified all data points as `0`. By oversampling our data the model is now attempting to differentiate between the `0`s and `1`s, with `76%` of the stoke values identified. correctly.

### **Neural Network Model 3**
Attempt 3 using Keras Tuner and RandomOverSampler

In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import keras_tuner as kt

In [None]:
# Set value for input_dim
number_input_features = len(X_train_scaled_R[0])

# Create method for new Sequential model with hyperparameter option
def create_model (hp):
  nn_model = tf.keras.models.Sequential()

  # Activation function options
  activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

  # Number of neurons in first layer
  nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units', min_value=1, max_value=10, step=2),
                                     activation=activation, input_dim=number_input_features))
  
  # Number of hidden layers and neurons per layer
  for i in range(hp.Int('num_layers', 1, 3)):
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i), min_value=1, max_value=10, step=2), activation=activation))

  nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  # Compile the model
  nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

  return nn_model


In [None]:
# Create tuner with keras_tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
# Run tuner search for best hyperparameters
tuner.search(X_train_scaled_R,y_R,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# Find most accurate model
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 1,
 'num_layers': 5,
 'units_0': 9,
 'units_1': 1,
 'units_2': 1,
 'units_3': 13,
 'units_4': 5,
 'units_5': 17,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0}

In [None]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.6922 - accuracy: 0.9585 - 327ms/epoch - 8ms/step
Loss: 0.6921735405921936, Accuracy: 0.958469033241272


## **K Nearest Neighbors 2**
Attempt 2 using RandomOverSampler

In [46]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [47]:
# Train the model
model.fit(X_train_scaled_R, y_R)

In [48]:
# Create predictions
ros_KN_pred = model.predict(X_test_scaled)

In [49]:
# Print confusion matrix
confusion_matrix(ros_KN_pred,y_test)

array([[1097,   42],
       [  82,    7]])

In [50]:
# Print classification report
print(classification_report(ros_KN_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1139
           1       0.14      0.08      0.10        89

    accuracy                           0.90      1228
   macro avg       0.54      0.52      0.52      1228
weighted avg       0.87      0.90      0.89      1228



**K Nearest Neighbors with RandomOverSampler:** The oversampling did little to improve this model's classification of true positive outcomes, while increasing the false positives.

## **Random Forest 2**
Attempt 2 using RandomOverSampler

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled_R, y_R)

In [None]:
# Making predictions using the testing data
ros_rf_pred = rf_model.predict(X_test_scaled)

In [None]:
# Print confusion matrix
confusion_matrix(ros_rf_pred,y_test)

array([[1166,   49],
       [  11,    2]])

In [None]:
# Print classification report
print(classification_report(ros_rf_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1215
           1       0.04      0.15      0.06        13

    accuracy                           0.95      1228
   macro avg       0.51      0.56      0.52      1228
weighted avg       0.98      0.95      0.97      1228



**Random Forest 2 with RandomOverSampler: ** The RandomOverSampler did little to improve this model's ability to classify positive outcomes, similiar to K Nearest Neighbors.