# DSCI 100 Group 05 Project - Heart Disease

## Introduction

### Heart disease describes a range of conditions that affect the heart.


In [1]:
## Load libraries:
library(tidyverse)
library(readr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

# Set seed
set.seed(1)

## Load Heart Disease Data (name the object heart_disease)
cleveland_data <- read_delim("data/cleve.mod", delim = "\t", skip = 19, col_names = FALSE)


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrsample     [39

In [6]:
## Clean and wrangle data 

cleveland_clean <- cleveland_data |> 
    separate(X1, into = c("age", "sex", "cp", "trestbps", "chol","fbs", 
                   "restecg", "thalach", "exang", "oldpeak", "slope", 
                   "ca", "thal", "num"), sep = "\\s+")

## Set columns to correct type
cleveland_clean <- mutate_at(cleveland_clean, 
                             c("age", "trestbps", "chol", "thalach", "oldpeak", "ca", "num"),
                            as.numeric) |>
    mutate(cp = as_factor(exang)) |>
    select(exang, age, trestbps, chol, thalach)

cleveland_clean

## Scaling the data
cleveland_scaled <- cleveland_clean |>
    mutate(scaled_age = scale(age, center = TRUE),
          scaled_trestbps = scale(trestbps, center = TRUE),
          scaled_chol = scale(chol, center = TRUE),
          scaled_thalach = scale(thalach, center = TRUE))

## Splitting the data into training and testing

cleveland_split <- initial_split(cleveland_clean, prop = 0.75, strata = exang)
cleveland_train <- training(cleveland_split)
cleveland_test <- testing(cleveland_split)

    


“[1m[22mExpected 14 pieces. Additional pieces discarded in 303 rows [1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”
[1m[22m[36mℹ[39m In argument: `ca = .Primitive("as.double")(ca)`.
[33m![39m NAs introduced by coercion


exang,age,trestbps,chol,thalach
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fal,63,145,233,150
true,67,160,286,108
true,67,120,229,129
⋮,⋮,⋮,⋮,⋮
fal,49,118,149,126
true,74,120,269,121
fal,54,160,201,163


In [7]:
## Group and Summarize Data to see number of occurences of different chest pain types

num_cp <- cleveland_train |>
    group_by(exang) |>
    summarize(count = n(),
             avg_restbps = mean(trestbps),
             avg_age = mean(age),
             avg_chol = mean(chol),
             avg_thalach = mean(thalach))

num_cp

exang,count,avg_restbps,avg_age,avg_chol,avg_thalach
<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
fal,153,131.0327,53.33987,246.0784,156.9739
true,74,132.5946,55.40541,247.0946,137.6622


In [None]:
## Create plots to visualize data


