In [1]:
#Loaded the appropriate packages for analysis
library(tidyverse)
library(tidymodels)
library(dplyr)
library(purrr)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
#Read the data, and added colummn names/headings
cleveland<-read.delim("processed.cleveland.data", header=FALSE, sep=",")
cleveland<- rename(cleveland, 
                   age= V1,
                   sex= V2,
                   cp= V3,
                   trestbp= V4,
                   chol= V5,
                   fbs= V6,
                   restecg= V7,
                thalach= V8,
                   exang = V9,
                   oldpeak = V10,
                   slope =V11,
                   ca = V12,
                   thal = V13,
                   num= V14) 
#Converted the num column(which tells us the severity and if the patient has heart disease) to a factor
#note that 0 means no heart disease, and 1-4 mean increasing levels of heart disease
cleveland|> mutate(num=as_factor(num))

age,sex,cp,trestbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0
62,0,4,140,268,0,2,160,0,3.6,3,2.0,3.0,3
57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0,0
63,1,4,130,254,0,2,147,0,1.4,2,1.0,7.0,2
53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0,1


In [3]:
#selected only the columns that we will be using for our analysis

cleveland<-cleveland|> select(age, trestbp, chol, fbs, num)|>mutate(num=as_factor(num))|> filter(num=="0"|num=="4")
cleveland

age,trestbp,chol,fbs,num
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
63,145,233,1,0
37,130,250,0,0
41,130,204,0,0
56,120,236,0,0
57,120,354,0,0
57,140,192,0,0
56,140,294,0,0
44,120,263,0,0
52,172,199,1,0
57,150,168,0,0


In [4]:
#setting the seed
set.seed(1)
#created training(75%) and testing data 
cleveland_split<- initial_split(cleveland, prop=0.75, strata= num)
cleveland_train<- training(cleveland_split)
cleveland_test<- testing(cleveland_split)

In [5]:
cleveland_train

Unnamed: 0_level_0,age,trestbp,chol,fbs,num
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,63,145,233,1,0
2,37,130,250,0,0
7,56,140,294,0,0
10,57,150,168,0,0
13,49,130,266,0,0
14,64,110,211,0,0
15,58,150,283,1,0
16,60,130,206,0,4
17,50,120,219,0,0
19,66,150,226,0,0


In [6]:
#table containing the average values of all predictors of each severity of heart disease
average_predictors<-cleveland_train|> group_by(num)|>
summarize(across(age:fbs, mean))
average_predictors

num,age,trestbp,chol,fbs
<fct>,<dbl>,<dbl>,<dbl>,<dbl>
0,52.27642,129.0081,240.1789,0.1382114
4,60.77778,142.0,232.8889,0.1111111


In [8]:
#graphs showing trends in predictor variables segregated by each class of heart disease severity. Note that each y-axis variable is the average value for each class of num 

cleveland_agenum_trend<- cleveland_train|> ggplot(aes(x=num, y=age)+geom_bar(stat="identity")+labs(x="Severity of Heart Disease", y=" Average Age")+ ggtitle ("Training Data: Trend of Average Age versus Heart Disease Severity")
cleveland_agenum_trend

cleveland_trestbpnum_trend<- cleveland_train|> ggplot(aes(x=num, y=trestbp))+geom_bar(stat="identity")+ labs(x="Severity of Heart Disease", y="Average Resting Heart Rate")+ ggtitle ("Training Data: Trend of Average Resting Heart Rate versus Heart Disease Severity")
cleveland_trestbpnum_trend

cleveland_cholnum_trend<- cleveland_train|> ggplot(aes(x=num, y=chol))+geom_bar(stat="identity")+ labs(x="Severity of Heart Disease", y="Average Serum Cholesterol Levels (mg/dl)")+ ggtitle ("Training Data: Trend of Average Serum Cholesterol Levels versus Heart Disease Severity")
cleveland_cholnum_trend

cleveland_fbsnum_trend<- cleveland_train|> ggplot(aes(x=num, y=fbs))+geom_bar(stat="identity")+ labs(x="Severity of Heart Disease", y="Average Fasting Blood Sugar above or below 120mg/dl")+ ggtitle ("Training Data: Trend of Average Fasting Blood Sugar versus Heart Disease Severity")
cleveland_fbsnum_trend

#From these plots, we can see that there are changes in the predictor variable values for each class of the num variable 


ERROR: Error in parse(text = x, srcfile = src): <text>:4:1: unexpected symbol
3: cleveland_agenum_trend<- cleveland_train|> ggplot(aes(x=num, y=age)+geom_bar(stat="identity")+labs(x="Severity of Heart Disease", y=" Average Age")+ ggtitle ("Training Data: Trend of Average A
4: cleveland_agenum_trend
   ^
