In [1]:
library(tidyverse)
library(repr)
library(tidymodels)
library(janitor)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mdplyr  [39m 1.1.1
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 0.5.2
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.2     [32m✔[39m [34mrsample     [39m 1.1.1
[32m✔[39m [34mdials       [39m 1.1.0     [32m✔[39m [34mtune        [39m 1.0.1
[32m✔[39m [34minfer       [39m 1.0.4     [32m✔[39m [34mworkflows   [39m 1.1.2
[32m✔[39

ERROR: Error in library(janitor): there is no package called ‘janitor’


#  Presence of heart disease of patients in Cleveland, Ohio

 ## Introduction

Heart disease is the leading cause of death in the United States, and one of every five deaths in the United States can be attributed to heart disease (Multiple Cause of Death Data on CDC WONDER, n.d.). This makes the quick and accurate diagnosis of heart disease an extremely important topic of study. In 1989, a probability algorithm was created for the diagnosis of coronary artery disease (Detrano et al., 1989). In their report, they tested their algorithm on the test results of 303 patients from the Cleveland Clinic. We will be trying to predict the presence of heard disease of patients in Cleaveland, Ohio using cholesterol levels and resting blood pressure.

In [2]:
url <- "https://archive.ics.uci.edu/dataset/45/heart+disease"
main_data_column_2 <- read_table(url, col_names = c('Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure',
                                                            'Cholesterol', 'Fasting Blood Sugar <120',
                                                        'Resting ECG Reading', 'Max Heart Rate',
                                                       'Exercise Induced Angina (TRUE or FALSE)',
                                                            'Old Peak', 'Slope', 'Number Of Vessels Coloured', 'thal','Health'), skip = 20)

heart_data<-clean_names(main_data_column_2)
heart_data <- heart_data |>
        mutate(health = as_factor(health))
heart_data



[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
cols(
  Age = [31mcol_character()[39m,
  Sex = [31mcol_character()[39m,
  `Chest Pain Type` = [31mcol_character()[39m,
  `Resting Blood Pressure` = [31mcol_character()[39m,
  Cholesterol = [31mcol_character()[39m,
  `Fasting Blood Sugar <120` = [31mcol_character()[39m,
  `Resting ECG Reading` = [31mcol_character()[39m,
  `Max Heart Rate` = [31mcol_character()[39m,
  `Exercise Induced Angina (TRUE or FALSE)` = [31mcol_character()[39m,
  `Old Peak` = [31mcol_character()[39m,
  Slope = [31mcol_character()[39m,
  `Number Of Vessels Coloured` = [31mcol_character()[39m,
  thal = [31mcol_character()[39m,
  Health = [31mcol_character()[39m
)

“312 parsing failures.
row col   expected     actual                                                   file
  1  -- 14 columns 3 columns  'https://archive.ics.uci.edu/dataset/45/heart+disease'
  2  -- 14 columns 3 colu

ERROR: Error in clean_names(main_data_column_2): could not find function "clean_names"


In [None]:

#Heart K nearest neighbors
heart_data<-heart_data|>
    select(sex,resting_blood_pressure,cholesterol,health)
heart_split<-initial_split(heart_data,prop=0.75,strata=health)  
heart_train<-training(heart_split)   
heart_test<-testing(heart_split)

heart_recipe<-recipe(health~cholesterol+resting_blood_pressure,data=heart_train)|>
   step_scale(all_predictors())|>
   step_center(all_predictors())

knn_spec<-nearest_neighbor(weight_func="rectangular",neighbors=12)|>
      set_engine("kknn")|>
      set_mode("classification")

heart_fit<-workflow()|>
      add_recipe(heart_recipe)|>
      add_model(knn_spec)|>
      fit(data=heart_train)
#Heart training data
head(heart_train)

#Heart training data plot
heart_plot<-heart_train|>
    ggplot(aes(x=resting_blood_pressure,y=cholesterol,color=health))+
    geom_point()+
    labs(x="Resting Blood Pressure ",y="Cholesterol", color="Health")+
    theme(text=element_text(size=20))+
    ggtitle("Cholesterol vs Resting Blood Pressure")+
    facet_grid(.~sex)
heart_plot


#Predictions
heart_test_predictions<-predict(heart_fit,heart_test)|>
      bind_cols(heart_test)
head(heart_test_predictions)

#Accuracy
heart_prediction_accuracy<-heart_test_predictions|>
        metrics(truth=health,estimate=.pred_class)   
heart_prediction_accuracy




#Which K to use 
predictionSpec<-nearest_neighbor(weight_func="rectangular",neighbors=tune())|>
                              set_engine("kknn")|>
                              set_mode("classification")

k_vals<-tibble(neighbors=seq(2,20))

yVfold<-vfold_cv(heart_train,v=10,strata = health)

knn_results<-workflow()|>
  add_recipe(heart_recipe)|>
  add_model(predictionSpec)|>
  tune_grid(resamples=yVfold,grid=k_vals)|>
  collect_metrics() 

accuracy<-knn_results|>
  filter(.metric=="accuracy")

cross_val_plot <- accuracy|>
    ggplot(aes(x = neighbors, y = mean))+
        geom_point()+
        geom_line()+
        labs(x="Neighbors", y = "Accuracy Estimate")+ 
        theme(text=element_text(size=20))
#cross_val_plot