Title 

In [1]:
library(tidyverse) 
library(tidymodels)
library(gridExtra)
options(repr.matrix.max.rows = 6)
url <-"https://raw.githubusercontent.com/katelynmilan/DSCI_project/main/processed.cleveland.data"
data_set <- read_csv(url, col_names = c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"))
data_set

“package ‘ggplot2’ was built under R version 4.3.2”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.5.0     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom    

age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1
38,1,3,138,175,0,0,173,0,0.0,1,?,3.0,0


In [98]:
set.seed(1000)

clean_data <- data_set|> 
select(age, chol, num) |> 
rename(heart_disease = num, cholesterol = chol, age = age)|>
mutate(heart_disease = as.factor(heart_disease))   

data_split <- initial_split(clean_data, prop = 0.75, strata = heart_disease)
data_train <- training(data_split)
data_test <- testing(data_split)

heart_recipe <- recipe(heart_disease ~ age + cholesterol, data = clean_data)|>
                    step_scale(all_predictors())|>
                    step_center(all_predictors())

heart_model <- nearest_neighbor(weight_func = "rectangular", neighbors = tune())|>
            set_engine("kknn")|>
            set_mode("classification")

heart_workflow <- workflow()|>
                    add_model(heart_model)|>
                    add_recipe(heart_recipe)

heart_vfold <- vfold_cv(data_train, v = 5, strata = heart_disease)

grid_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

best_k <- heart_workflow |>
            tune_grid(resamples = heart_vfold, grid = grid_vals)|>
            collect_metrics()|>
            filter(.metric == "accuracy")|>
            slice_min(mean, n = 1)

heart_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 2)|>
            set_engine("kknn")|>
            set_mode("classification")

knn_fit <- workflow()|>
            add_model(heart_spec)|>
            add_recipe(heart_recipe)|>
            fit(data = data_train)

heart_pred <- knn_fit |> 
           predict(data_test) |>
           bind_cols(data_test)
           

heart_pred

.pred_class,age,cholesterol,heart_disease
<fct>,<dbl>,<dbl>,<fct>
0,67,229,1
1,56,236,0
0,52,199,0
⋮,⋮,⋮,⋮
3,44,169,2
0,68,193,2
3,38,175,0
