# Classification Analysis of Patient Parameters on Stroke Predictions

In [None]:
# Load the packages used throughout the analysis.
library(tidyverse)
library(repr)
library(dplyr)
library(tidymodels)

# Install themis only when it is missing, so re-running the notebook does not
# reinstall the package (and does not require network access) every time.
if (!requireNamespace("themis", quietly = TRUE)) {
    install.packages("themis")
}
library(recipes)
library(modeldata)
library(themis)

# Limit how many rows of a data frame are printed in notebook output.
options(repr.matrix.max.rows = 6)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.2
✔ tidyr   1.1.2     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

“package ‘tidymodels’ was built under R version 4.0.2”


## Introduction

## Methods and Results

In [3]:
# Fix the RNG seed so the train/test split below is reproducible.
set.seed(1768)

# Load the stroke dataset. Both "N/A" and "NA" are read as missing values
# (the second entry must be a string; a bare `na` is an undefined object).
# The outcome `stroke` is converted to a factor for classification.
raw_data_url <- "https://raw.githubusercontent.com/jordanjzhao/dsci-project-proposal/main/data/healthcare-dataset-stroke-data.csv"
stroke_data <- read.csv(raw_data_url, na.strings = c("N/A", "NA")) %>%
    mutate(stroke = as_factor(stroke))

# Drop every row that contains at least one missing value.
stroke_data_clean <- na.omit(stroke_data)
head(stroke_data_clean)

# Select data: keep the outcome and the two predictors used by the model.
stroke_data_select <- stroke_data_clean %>%
    select(stroke, bmi, avg_glucose_level)
stroke_data_select

# Create training and testing set (75% / 25%), stratified on the outcome.
stroke_split <- initial_split(stroke_data_select, prop = 0.75, strata = stroke)
stroke_train <- training(stroke_split)
stroke_test <- testing(stroke_split)

ERROR: Error in read.csv(raw_data_url, na.strings = c("N/A", na)) %>% mutate(stroke = as_factor(stroke)): could not find function "%>%"


In [2]:
# Scale and standardize predictors.
# Due to the imbalance in count (more non-stroke than stroke cases), the
# stroke class is upsampled to a 1:1 ratio.
#
# The recipe is deliberately left untrained: a workflow expects an unprepped
# recipe and trains it itself on each resample. The upsampling step keeps its
# default `skip = TRUE`, so it is applied only when the recipe is trained and
# is skipped when new data (e.g. an assessment fold) is processed.
stroke_recipe <- recipe(stroke ~ bmi + avg_glucose_level, data = stroke_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors()) %>%
    step_upsample(stroke, over_ratio = 1)
stroke_recipe

# Train a copy of the recipe only to inspect the upsampled training data.
# `new_data = NULL` returns the processed training set, including skipped steps.
upsampled_stroke <- stroke_recipe %>%
    prep() %>%
    bake(new_data = NULL)

# Class counts after upsampling.
upsampled_stroke %>%
    count(stroke)


# 5-fold cross-validation on training set
stroke_vfold <- vfold_cv(stroke_train, v = 5, strata = stroke)


# KNN classifier with the number of neighbors left to be tuned
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("classification")

# Candidate values of k: 1, 6, 11, ..., 96
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

# Create workflow analysis with recipe and model specs, tune over k_vals,
# and collect the cross-validated metrics
stroke_knn_results <- workflow() %>%
    add_recipe(stroke_recipe) %>%
    add_model(knn_tune) %>%
    tune_grid(resamples = stroke_vfold, grid = k_vals) %>%
    collect_metrics()
stroke_knn_results

# Keep only the accuracy metric for plotting k vs accuracy
stroke_knn_accuracies <- stroke_knn_results %>%
    filter(.metric == "accuracy")
stroke_knn_accuracies

# Plot the mean cross-validated accuracy against the number of neighbors
cross_val_plot <- stroke_knn_accuracies %>%
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))
cross_val_plot


ERROR: Error in recipe(stroke ~ bmi + avg_glucose_level, data = stroke_train) %>% : could not find function "%>%"


## Discussion

## Works Cited

Boehme, Amelia K., et al. “Stroke Risk Factors, Genetics, and Prevention.” Circulation Research, 3 Feb. 2017, https://www.ahajournals.org/doi/full/10.1161/CIRCRESAHA.116.308398. 

Hägg-Holmberg, Stefanie, et al. “The Role of Blood Pressure in Risk of Ischemic and Hemorrhagic Stroke in Type 1 Diabetes - Cardiovascular Diabetology.” BioMed Central, BioMed Central, 9 July 2019, https://cardiab.biomedcentral.com/articles/10.1186/s12933-019-0891-4. 

Dataset source: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset
Author credentials: fedesoriano