Heart Diease Report 

In [100]:
library(tidyverse) 
library(tidymodels)
library(gridExtra)
options(repr.matrix.max.rows = 6)
url <-"https://raw.githubusercontent.com/katelynmilan/DSCI_project/main/processed.cleveland.data"
data_set <- read_csv(url, col_names = c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"))
data_set

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): ca, thal
[32mdbl[39m (12): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1
38,1,3,138,175,0,0,173,0,0.0,1,?,3.0,0


In [103]:
clean_data <- data_set|> 
select(age, sex, chol, num) |> 
rename(Heart_Disease = num, Cholesterol = chol, Age = age, Sex = sex)|>
mutate(Heart_Disease = as.character(Heart_Disease))   

data_split <- initial_split(clean_data, prop = 0.75, strata = Heart_Disease)
data_train <- training(data_split)
data_test <- testing(data_split)

data_train 

Age,Sex,Cholesterol,Heart_Disease
<dbl>,<dbl>,<dbl>,<chr>
63,1,233,0
37,1,250,0
41,0,204,0
⋮,⋮,⋮,⋮
54,1,286,3
55,0,205,3
59,1,176,3


In [104]:
set.seed(42069)

clean_data <- data_set|> 
select(age, chol, num) |> 
rename(heart_disease = num, cholesterol = chol, age = age)|>
mutate(heart_disease = as.factor(heart_disease))   

data_split <- initial_split(clean_data, prop = 0.75, strata = heart_disease)
data_train <- training(data_split)
data_test <- testing(data_split)

heart_recipe <- recipe(heart_disease ~ age + cholesterol, data = clean_data)|>
                    step_scale(all_predictors())|>
                    step_center(all_predictors())

heart_model <- nearest_neighbor(weight_func = "rectangular", neighbors = tune())|>
            set_engine("kknn")|>
            set_mode("classification")

heart_workflow <- workflow()|>
                    add_model(heart_model)|>
                    add_recipe(heart_recipe)

heart_vfold <- vfold_cv(data_train, v = 5, strata = heart_disease)

grid_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

best_k <- heart_workflow |>
            tune_grid(resamples = heart_vfold, grid = grid_vals)|>
            collect_metrics()|>
            filter(.metric == "accuracy")|>
            slice_min(mean, n = 1)

heart_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 2)|>
            set_engine("kknn")|>
            set_mode("classification")

knn_fit <- workflow()|>
            add_model(heart_spec)|>
            add_recipe(heart_recipe)|>
            fit(data = data_train)

heart_pred <- knn_fit |> 
           predict(data_test) |>
           bind_cols(data_test)
           

heart_pred

.pred_class,age,cholesterol,heart_disease
<fct>,<dbl>,<dbl>,<fct>
0,41,204,0
1,62,268,3
0,57,354,0
⋮,⋮,⋮,⋮
0,63,187,2
0,68,193,2
3,38,175,0
