In [2]:
# Assignment: FINAL PROJECT
# Name: Kouba, Michelle
# Date: 2022-11-06

## Load the ggplot2 package
library(ggplot2)
library(caTools)
library(tidyverse)
library(sjmisc)
theme_set(theme_minimal())

## Set the working directory to the root of your DSC 520 directory
setwd("C:/Users/kouba/Desktop/GitHub/dsc520/completed/Final Project")

## Load the 2020 BRFSS data
brfss_df <- read.csv("brfss2020.csv")
health_df <- select(brfss_df, DIABETE4, GENHLTH, CHECKUP1, X_TOTINDA, SLEPTIM1, X_RFBING5, X_SMOKER3, EMPLOY1, X_BMI5CAT, SEXVAR, X_IMPRACE, INCOME2, ADDEPEV3, MARITAL)

## Cleaning/Recoding variables for analyses
## Recoding health status into Excellent versus all other
health_df$health<-rec(health_df$GENHLTH, rec = "1=1; 2,3,4,5=0")
## Diabetes 1
health_df$married<-rec(health_df$MARITAL, rec = "1=1; 2,3,4,5,6=0")
## Diabetes 1
health_df$diabetes<-rec(health_df$DIABETE4, rec = "1,2,4=1; 3=0")
## Depression 1
health_df$depression<-rec(health_df$ADDEPEV3, rec = "1=1; 2=0")
## Routine Checkup in Past Year 1
health_df$checkup<-rec(health_df$CHECKUP1, rec = "1=1; 2,3,4,8=0")
## Leisure Time Physical Activity in Past Month 1
health_df$leisure_pa<-rec(health_df$X_TOTINDA, rec = "1=1; 2=0")
##  Binge drinking in past month, 4 drinks for women, 5 for men
health_df$binge<-rec(health_df$X_RFBING5, rec = "2=1; 1=0")
## Regular smoking either every day or some days 1
health_df$smoker<-rec(health_df$X_SMOKER3, rec = "1,2=1; 3,4=2")
## Employed or self-employed 1
health_df$employed<-rec(health_df$EMPLOY1, rec = "1,2=1; 3,4,5,6,7,8=2")
## Overweight or obese 1
health_df$overweight<-rec(health_df$X_BMI5CAT, rec = "3,4=1; 1,2=2")
## Gender, male is 1
health_df$gender<-ifelse(health_df$SEXVAR==1,1,0)
## NH White is 1
health_df$caucasian<-ifelse(health_df$X_IMPRACE==1,1,0)
## Income, 50K or more is 1
health_df$income<-rec(health_df$INCOME2, rec = "8=1; 1,2,3,4,5,6,7=2")
## Slept 8 hours or more daily 1
## Removing 77 and 99 which are RF or DK/Missing
health_df$sleep<-rec(health_df$SLEPTIM1, rec = "8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24=1; 1,2,3,4,5,6,7,77,99=0")
frq(health_df$health)
##Split the Data into Training and Test Samples
split <- sample.split(health_df, SplitRatio = 0.8)
train <- subset(health_df, split == "TRUE")
test <- subset(health_df, split == "FALSE")

##Munge the Data
health_df$health <- as.factor(health_df$health)

##Fit a multiple regression model with all identifiable MODIFIABLE health predictors
mymodel <- glm(health ~ diabetes + checkup + leisure_pa + depression
               + smoker + employed + overweight
               + income + sleep,
               data = train, family = 'binomial')
summary(mymodel)

## The variables with the greatest effect on excellent health status appear to be : diabetics and those who aren't overweight.
##  This nonsensical effect could be due to the fact that overweight persons tend to underrespond to whether they are overweight
## or not, many obese persons just state they are overweight or average weight and overweight persons tend to reply that they are
## at a healthy weight.

## Run test data through the model.
res <- predict(mymodel, test, type="response")

res <- predict(mymodel, train, type="response")

## Validate the model - confusion matrix
confmatrix <- table(Actual_Value=train$health, Predicted_Value = res>0.5)
confmatrix

##Accuracy
(confmatrix[[1,1]] + confmatrix [[2,2]])/sum(confmatrix)

##  This model predicts with nearly 80% accuracy but most of the accuracy is due to
## correctly guessing those with less than excellent health as so.  This model doesn't
## do a good job for assessing excellent health which was the goal of the project (it only
## correctly guessed excellent health 729 of 56201 times).  Most of the lack of accuracy is
## due to pretty much always guessing unhealthy and most of the time, it's a correct guess.


knitr::stitch('C:/Users/kouba/Desktop/GitHub/dsc520/completed/Week 10/Week10A_KoubaMichelle.r')

ERROR: ignored