# DSCI 100 Group 05 Project - Heart Disease

## Introduction

### Heart disease describes a range of conditions that affect the heart.


In [25]:
## Load libraries:
library(tidyverse)
library(readr)
library(dplyr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

# Set seed
set.seed(1)

## Load Heart Disease Data (name the object heart_disease)
cleveland_data <- read_delim("data/cleve.mod", delim = "\t", skip = 19, col_names = FALSE)

cleveland_data

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


X1
<chr>
63.0 male angina 145.0 233.0 true hyp 150.0 fal 2.3 down 0.0 fix buff H
67.0 male asympt 160.0 286.0 fal hyp 108.0 true 1.5 flat 3.0 norm sick S2
67.0 male asympt 120.0 229.0 fal hyp 129.0 true 2.6 flat 2.0 rev sick S1
⋮
49.0 male notang 118.0 149.0 fal hyp 126.0 fal 0.8 up 3.0 norm sick S1
74.0 fem abnang 120.0 269.0 fal hyp 121.0 true 0.2 up 1.0 norm buff H
54.0 fem notang 160.0 201.0 fal norm 163.0 fal 0.0 up 1.0 norm buff H


In [29]:
## Clean and wrangle data 

cleveland_clean <- cleveland_data |> 
    separate(X1, into = c("age", "sex", "cp", "trestbps", "chol","fbs", 
                   "restecg", "thalach", "exang", "oldpeak", "slope", 
                   "ca", "thal", "num"), sep = "\\s+")

## Set columns to correct type
cleveland_clean <- mutate_at(cleveland_clean, 
                             c("age", "trestbps", "chol", "thalach", "oldpeak", "ca", "num"),
                            as.numeric) |>
    mutate(cp = as_factor(cp)) |>
    select(age, cp, trestbps, chol, thalach)

cleveland_clean

## Scaling the data
cleveland_scaled <- cleveland_clean |>
    mutate(scaled_age = scale(age, center = TRUE),
          scaled_trestbps = scale(trestbps, center = TRUE),
          scaled_chol = scale(chol, center = TRUE),
          scaled_thalach = scale(thalach, center = TRUE))

## Splitting the data into training and testing

cleveland_split <- initial_split(cleveland_clean, prop = 0.75, strata = cp)
cleveland_train <- training(cleveland_split)
cleveland_test <- testing(cleveland_split)

    


“[1m[22mExpected 14 pieces. Additional pieces discarded in 303 rows [1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”
[1m[22m[36mℹ[39m In argument: `ca = .Primitive("as.double")(ca)`.
[33m![39m NAs introduced by coercion


age,cp,trestbps,chol,thalach
<dbl>,<fct>,<dbl>,<dbl>,<dbl>
63,angina,145,233,150
67,asympt,160,286,108
67,asympt,120,229,129
⋮,⋮,⋮,⋮,⋮
49,notang,118,149,126
74,abnang,120,269,121
54,notang,160,201,163


In [None]:
## 