## Group proposal
Title: Heart Disease Diagnostic Tool

In [41]:
library(rvest)
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

Introduction: 

In [42]:
set.seed(293)

dataset_test <- download.file(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", 
    "heart_disease.csv")

heart_data <- read_csv("heart_disease.csv", col_names = FALSE)


heart_data <- rename(heart_data, 
                     age =X1,
                     sex =X2,
                     cp =X3,
                     trestbps =X4,
                     chol = X5,
                     fbs =X6,
                     restecg=X7,
                     thalach=X8,
                     exang=X9,
                     oldpeak=X10,
                     slope=X11,
                     ca=X12,
                     thal=X13,
                     diagnosis=X14)

idx <- heart_data == "?"
is.na(heart_data) <- idx                                 #eliminates ? and replaces with NA

heart_data[] <- sapply(heart_data, as.numeric) #converts all the data to numeric

names = c('sex', 'cp', 'exang', 'slope', 'thal', 'diagnosis') #unsure if ca is a factor
heart_data[,names] <- lapply(heart_data[,names] , factor)

heart_data <- mutate(heart_data, diagnosis = ifelse(diagnosis == "1"|diagnosis == "2"|diagnosis == "3"|diagnosis == "4", "Positive", "Negative"))
heart_data <- mutate(heart_data, diagnosis = as.factor(diagnosis))          #changes the diagnosis values to strings

heart_data



heart_split <- initial_split(heart_data, prop = 0.75, strata = diagnosis)          #splits data
heart_train <- training(heart_split)
heart_test <- testing(heart_split) 

heart_train

train_summary <- heart_train |>
    summarize(across(age:diagnosis, mean, na.rm=TRUE))
train_summary

count_train_summary <- group_by(heart_train, diagnosis) |>
    summarize(count=n())
count_train_summary

table(heart_train$diagnosis, useNA = "ifany")                 # potential way to get the count of each class 
# heart_table = matrix(c(1:70), ncol = 14, byrow=TRUE)
# colnames(heart_table) = c("age","sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", 
#                           "exang", "oldpeak", "slope", "ca", "thal", "diagnosis")
# rownames(heart_table) <- c("mean", "sd", "observations", "min", "max")

# heart_table








# heart_test_graph <- heart_train |>
#     ggplot(aes(x = diagnosis, y = thalach)) + #color = diagnosis)) +
#     geom_point()

# heart_test_graph

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis
<dbl>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<fct>,<dbl>,<fct>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0,6,Negative
67,1,4,160,286,0,2,108,1,1.5,2,3,3,Positive
67,1,4,120,229,0,2,129,1,2.6,2,2,7,Positive
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,1,4,130,131,0,0,115,1,1.2,2,1,7,Positive
57,0,2,130,236,0,2,174,0,0.0,2,1,3,Positive
38,1,3,138,175,0,0,173,0,0.0,1,,3,Negative


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis
<dbl>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<fct>,<dbl>,<fct>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0,6,Negative
37,1,3,130,250,0,0,187,0,3.5,3,0,3,Negative
41,0,2,130,204,0,2,172,0,1.4,1,0,3,Negative
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
68,1,4,144,193,1,0,141,0,3.4,2,2,7,Positive
57,1,4,130,131,0,0,115,1,1.2,2,1,7,Positive
57,0,2,130,236,0,2,174,0,0.0,2,1,3,Positive


“argument is not numeric or logical: returning NA”
“argument is not numeric or logical: returning NA”
“argument is not numeric or logical: returning NA”
“argument is not numeric or logical: returning NA”
“argument is not numeric or logical: returning NA”
“argument is not numeric or logical: returning NA”


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
54.00441,,,130.0617,243.511,0.1365639,0.9779736,149.8767,,1.032159,,0.7098214,,


diagnosis,count
<fct>,<int>
Negative,123
Positive,104



Negative Positive 
     123      104 