In [None]:
# Download the analysis extract from the workspace bucket to the local
# working directory so that later cells can read it from disk.

# Install the Cloud Storage client only if it is not already available.
if (!requireNamespace("googleCloudStorageR", quietly = TRUE)) {
  install.packages("googleCloudStorageR")
}

# Load the required packages
library(googleCloudStorageR)

# Bucket name is supplied by the environment.
bucket_name <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(bucket_name)) {
  # Sys.getenv() returns "" for an unset variable; fail with a clear message
  # instead of an opaque request error.
  stop("Environment variable WORKSPACE_BUCKET is not set.", call. = FALSE)
}

# Object path inside the bucket
name_of_file_in_bucket <- "data/final_dataset.csv"

# Local file the object is written to
local_destination_path <- "final_dataset.csv"

# Authenticate with Google Cloud Storage
#gcs_auth(path = "/path/to/your/keyfile.json")  # Replace with the path to your JSON keyfile

# Copy the CSV from the bucket to disk. Without saveToDisk the object is only
# returned in memory and no local file is created; overwrite = TRUE keeps the
# cell re-runnable once the file exists.
gcs_get_object(
  object_name = name_of_file_in_bucket,
  bucket = bucket_name,
  saveToDisk = local_destination_path,
  overwrite = TRUE
)

In [None]:
library(tidyverse)

# Read the downloaded extract from the working directory.
colitis_df <- readr::read_csv(file = "final_dataset.csv")

In [None]:
# Show the first rows of the raw extract.
head(colitis_df)

In [None]:
#### Drop drug-use frequency columns with very high missingness

columns_to_drop <- c(
    'Past 3 Month Use Frequency: Street Opioid 3 Month Use',
    'Past 3 Month Use Frequency: Sedative 3 Month Use',
    'Past 3 Month Use Frequency: Prescription Stimulant 3 Month Use',
    'Past 3 Month Use Frequency: Prescription Opioid 3 Month Use',
    'Past 3 Month Use Frequency: Other Stimulant 3 Month Use',
    'Past 3 Month Use Frequency: Other 3 Month Use',
    'Past 3 Month Use Frequency: Inhalant 3 Month Use',
    'Past 3 Month Use Frequency: Hallucinogen 3 Month Use',
    'Past 3 Month Use Frequency: Cocaine 3 Month Use'
)

# any_of() replaces the superseded one_of(). It skips names that are already
# absent, so re-running this cell on an already-trimmed frame is harmless.
colitis_df <- colitis_df %>%
  select(-any_of(columns_to_drop))
colitis_df %>% head()

In [None]:
# List the remaining column names.
names(colitis_df)

# Count rows per distinct answer to the Crohn's family-history question.
colitis_df %>%
  group_by(as.factor(`Including yourself, who in your family has had Crohn\'s disease? Select all that apply.`)) %>%
  summarise(n = n())

In [None]:
####reconstructing existing features for analysis:
# Builds colitis_recode from colitis_df: collapses gender and race, and derives
# alcohol, smoking, marijuana, self-rated health and family-history indicators
# from the raw survey columns with case_when().
#colitis_df$uc1_crohns2 <- as.factor(as.character(uc1_crohns2))
# NOTE(review): doctor_listen is copied from the same "not smart" question as
# doctor_not_smart on the next line, so the two derived scores are identical.
# This looks like a copy-paste slip -- confirm which survey column
# doctor_listen should read from.
colitis_df$doctor_listen <- colitis_df$`How often does a doctor or nurse act as if he or she thinks you are not smart when you go to a doctor\'s office or other health care provider?`
colitis_df$doctor_not_smart <- colitis_df$`How often does a doctor or nurse act as if he or she thinks you are not smart when you go to a doctor\'s office or other health care provider?`
# Names of the family-history indicators created in the mutate() below.
family_history_cols <- c('family_crohns', 'family_colitis', 'family_cancer', 'family_polyps')
colitis_recode <- colitis_df %>% mutate(gender = case_when(
                # gender, race and alcohol_excess: any value not listed explicitly (including
                # missing values) falls through to the literal string 'NA', which is an
                # ordinary category label rather than a true missing value.
                gender == 'Female' ~ 'Female',
                gender == 'Male' ~ 'Male',
                str_starts(gender, 'Gender Identity') ~ 'LGBTQ+',
                TRUE ~ 'NA'), race = case_when(
                race %in% c('Asian', 'Middle Eastern or North African', 'More than one population' ) ~ 'Other',
                race == 'White' ~ 'White',
                race == 'Black or African American' ~ 'Black',
                TRUE ~ 'NA'), 
                alcohol_excess = case_when(
                `Alcohol: Average Daily Drink Count` %in% c('5 or 6', '7 to 9', '10 or More') ~ '5+',
                `Alcohol: Average Daily Drink Count` %in% c('1 or 2', '3 or 4') ~ '<5',
                TRUE ~ 'NA'),
                # Frequency answers mapped to 1 (Never) .. 5 (Always); there is no default
                # branch, so any other answer or a missing value becomes NA.
                doctor_listen = case_when(
                doctor_listen == 'Never' ~ 1,
                doctor_listen == 'Rarely' ~ 2,
                doctor_listen == 'Sometimes' ~ 3,
                doctor_listen == 'Most of the time' ~ 4,
                doctor_listen == 'Always' ~ 5),
                doctor_not_smart = case_when(
                doctor_not_smart == 'Never' ~ 1,
                doctor_not_smart == 'Rarely' ~ 2,
                doctor_not_smart == 'Sometimes' ~ 3,
                doctor_not_smart == 'Most of the time' ~ 4,
                doctor_not_smart == 'Always' ~ 5),
                # 1 for 'Daily', 0 for the listed lower frequencies, NA otherwise.
                # NOTE(review): 'One Or Twice' may be a typo for 'Once Or Twice' -- verify
                # against the values actually present in the data.
                daily_mar_use = case_when(
                `Past 3 Month Use Frequency: Marijuana 3 Month Use` == 'Daily' ~ 1,
                `Past 3 Month Use Frequency: Marijuana 3 Month Use` %in% c('Monthly', 'One Or Twice', 'Weekly', 'Never') ~ 0),
                # 1 for 'Every Day' / 'Some Days', 0 for 'Not At All', NA otherwise.
                smoker_yes = case_when(
                `Smoking: Smoke Frequency` %in% c('Every Day', 'Some Days') ~ 1,
                `Smoking: Smoke Frequency` %in% c('Not At All') ~ 0),
                # Self-rated health items mapped to 1 (Poor) .. 5 (Excellent); the
                # misspelling 'Excllent' is accepted as well. Unmatched answers become NA.
                mental_health = case_when(
                `Overall Health: General Mental Health` == 'Poor' ~ 1,
                `Overall Health: General Mental Health` == 'Fair' ~ 2,
                `Overall Health: General Mental Health`== 'Good' ~ 3,
                `Overall Health: General Mental Health` == 'Very Good' ~ 4,
                `Overall Health: General Mental Health` %in% c('Excllent', 'Excellent') ~ 5,
                ),
                physical_health = case_when(
                `Overall Health: General Physical Health` == 'Poor' ~ 1,
                `Overall Health: General Physical Health` == 'Fair' ~ 2,
                `Overall Health: General Physical Health`== 'Good' ~ 3,
                `Overall Health: General Physical Health` == 'Very Good' ~ 4,
                `Overall Health: General Physical Health` %in% c('Excllent', 'Excellent') ~ 5,
                ),
                social_health = case_when(
                `Overall Health: Social Satisfaction` == 'Poor' ~ 1,
                `Overall Health: Social Satisfaction` == 'Fair' ~ 2,
                `Overall Health: Social Satisfaction`== 'Good' ~ 3,
                `Overall Health: Social Satisfaction` == 'Very Good' ~ 4,
                `Overall Health: Social Satisfaction` %in% c('Excllent', 'Excellent') ~ 5,
                ),
                # Family-history indicators: 'Yes' when the answer text contains
                # 'Including yourself', 'No' when the answer is missing; any other
                # non-missing answer is left as NA.
                family_crohns = case_when(
                str_detect(`Including yourself, who in your family has had Crohn\'s disease? Select all that apply.`, 'Including yourself') ~ 'Yes',
                          is.na(`Including yourself, who in your family has had Crohn\'s disease? Select all that apply.`) ~ 'No'),
                family_cancer = case_when(
                str_detect(`Including yourself, who in your family has had colon cancer/rectal cancer? Select all that apply.`, 'Including yourself') ~ 'Yes',
                          is.na(`Including yourself, who in your family has had colon cancer/rectal cancer? Select all that apply.`) ~ 'No'),
                family_polyps = case_when(
                str_detect(`Including yourself, who in your family has had colon polyps? Select all that apply.`, 'Including yourself') ~ 'Yes',
                          is.na(`Including yourself, who in your family has had colon polyps? Select all that apply.`) ~ 'No'),
                family_colitis = case_when(
                str_detect(`Including yourself, who in your family has had ulcerative colitis? Select all that apply.`, 'Including yourself') ~ 'Yes',
                          is.na(`Including yourself, who in your family has had ulcerative colitis? Select all that apply.`) ~ 'No')
  )
                

# Store the outcome as a factor; it is read from colitis_df, whose rows are in
# the same order as colitis_recode because mutate() does not reorder rows.
colitis_recode$cancer_yes <- as.factor(colitis_df$cancer_yes)
#colitis_recode %>% group_by(family_any, cancer_yes) %>% summarize(n = n())

In [None]:
# List the columns available after recoding.
names(colitis_recode)

In [None]:
colnames(colitis_df)

# Group sizes by recoded race and by diagnosis type.
colitis_recode %>% count(race)
colitis_recode %>% count(uc1_crohns2)

# Cohort-level mean and SD. na.rm = TRUE so that a single missing value does
# not turn the whole statistic into NA.
mean(colitis_recode$age_at_colitis_dx, na.rm = TRUE)
sd(colitis_recode$age_at_colitis_dx, na.rm = TRUE)
mean(colitis_recode$median_income, na.rm = TRUE)
sd(colitis_recode$median_income, na.rm = TRUE)

In [None]:
# Columns carried forward into the descriptive tables and the model.
analysis_columns <- c(
  'age_at_colitis_dx', 'uc1_crohns2', 'cancer_yes', 'gender', 'race',
  'median_income', 'no_health_insurance', 'deprivation_index',
  'Can\'t afford full medical care', 'Delayed medical care',
  'alcohol_excess', 'smoker_yes', 'mental_health', 'physical_health',
  'social_health', 'family_crohns', 'family_cancer', 'family_polyps',
  'family_colitis'
)

# Keep only the analysis columns and fix their types.
colitis_recode <- colitis_recode[, names(colitis_recode) %in% analysis_columns]
colitis_recode$uc1_crohns2 <- as.factor(colitis_recode$uc1_crohns2)
colitis_recode$smoker_yes <- as.factor(as.character(colitis_recode$smoker_yes))
colitis_recode$family_crohns <- as.factor(colitis_recode$family_crohns)
colitis_recode$family_cancer <- as.factor(colitis_recode$family_cancer)
colitis_recode$family_colitis <- as.factor(colitis_recode$family_colitis)
colitis_recode$family_polyps <- as.factor(colitis_recode$family_polyps)
colitis_recode$mental_health <- as.numeric(colitis_recode$mental_health)
colitis_recode$physical_health <- as.numeric(colitis_recode$physical_health)
colitis_recode$social_health <- as.numeric(colitis_recode$social_health)

# Classify every column; vapply() guarantees a character vector even when the
# frame has no columns.
column_types <- vapply(colitis_recode, function(col) {
  if (is.factor(col) || is.character(col)) {
    "categorical"
  } else if (is.numeric(col)) {
    "numeric"
  } else {
    "other"
  }
}, character(1))

# Compare each predictor with the outcome. The outcome column itself is
# skipped: cross-tabulating cancer_yes against cancer_yes is not informative.
for (col_name in setdiff(names(colitis_recode), "cancer_yes")) {
  col_type <- column_types[[col_name]]
  col_values <- colitis_recode[[col_name]]

  if (col_type == "categorical") {
    # Counts (with NA shown), column percentages within each outcome group,
    # and a chi-square test computed on the table without NA cells.
    table_result <- table(col_values, colitis_recode$cancer_yes, useNA = 'always')
    chi_table_result <- table(col_values, colitis_recode$cancer_yes)
    prop_table_result <- prop.table(table_result, margin = 2) * 100
    print(paste("Table for", col_name))
    print(table_result)
    # Previously the column percentages were computed but the overall
    # proportions were printed instead.
    print(prop_table_result)
    print(chisq.test(chi_table_result))
  } else if (col_type == "numeric") {
    # Per-group summary, overall SD (ignoring missing values) and a t-test.
    summary_result <- tapply(col_values, colitis_recode$cancer_yes, summary, na.rm = TRUE)
    print(paste("Summary for", col_name))
    print(summary_result)
    print(sd(col_values, na.rm = TRUE))
    print(t.test(col_values ~ colitis_recode$cancer_yes))
  } else {
    print(paste("Skipping", col_name, "as it is neither categorical nor numeric."))
  }
}

In [None]:
# Modelling copy of the data: gender and race become factors with explicit
# level order, so the first level is the reference and values outside the
# listed levels (such as the 'NA' label) turn into missing values.
colitis_model <- colitis_recode %>%
  mutate(
    gender = factor(gender, levels = c('Male', 'Female', 'LGBTQ+')),
    race = factor(race, levels = c('White', 'Black', 'Other'))
  )

# Logistic regression of the cancer outcome on the listed predictors.
model <- glm(
  formula = as.numeric(as.character(cancer_yes)) ~ age_at_colitis_dx +
    uc1_crohns2 + gender + race + physical_health + smoker_yes +
    median_income + no_health_insurance,
  family = binomial,
  data = colitis_model
)

summary(model)
# Exponentiated coefficients
exp(coef(model))



In [None]:
#install.packages('ggeffects')
#install.packages('effects')
#install.packages('patchwork')
library(ggeffects)
library(patchwork)

# Model predictions across age at diagnosis.
age_effect <- ggpredict(model, terms = 'age_at_colitis_dx')
plot(age_effect)

# Model predictions by smoking status.
smoker_effect <- ggpredict(model, terms = 'smoker_yes')
plot(smoker_effect)

# Disabled: one panel per model term, combined with patchwork.
#plts = lapply(names(coefficients(model))[-1],function(i){
#       return(plot(ggpredict(model,i)))
#       })

#wrap_plots(plts)

In [None]:
##### Boxplots of selected features by cancer status
library(ggplot2)
library(cowplot)

boxplot_columns <- c(
  'age_at_colitis_dx', 'deprivation_index', 'median_income',
  'no_health_insurance', 'poverty'
)

# colitis_recode was earlier restricted to analysis_columns, which does not
# list 'poverty'; report any missing column instead of failing on it.
missing_columns <- setdiff(boxplot_columns, names(colitis_recode))
if (length(missing_columns) > 0) {
  warning(
    "Skipping columns not present in colitis_recode: ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

for (plot_column in intersect(boxplot_columns, names(colitis_recode))) {
  # print() is required for ggplot objects created inside a loop.
  print(
    ggplot(colitis_recode, aes(cancer_yes, .data[[plot_column]])) +
      geom_boxplot() +
      geom_point() +
      labs(y = plot_column) +
      theme_cowplot()
  )
}

In [None]:
#### Deprivation index by cancer status
# Per-group SD and mean. sd must be computed first, because the second
# expression overwrites deprivation_index with its group mean. Both ignore
# missing values so one NA does not blank out a group's point range.
dep_summary <- colitis_recode %>%
  group_by(cancer_yes) %>%
  summarise(
    sd = sd(deprivation_index, na.rm = TRUE),
    deprivation_index = mean(deprivation_index, na.rm = TRUE)
  )

# na.rm is not a t.test() argument and had no effect; t.test() drops missing
# values itself.
t.test(deprivation_index ~ cancer_yes, data = colitis_recode)

# Violin of the raw distribution with mean +/- 1 SD overlaid in red.
ggplot(colitis_recode, aes(cancer_yes, deprivation_index)) +
  geom_violin(col = 'darkgray', trim = FALSE) +
  geom_pointrange(
    aes(ymin = deprivation_index - sd, ymax = deprivation_index + sd),
    data = dep_summary,
    col = 'red'
  ) +
  theme_cowplot()

In [None]:
#### No-health-insurance measure by cancer status
# Per-group SD and mean. sd must be computed first, because the second
# expression overwrites no_health_insurance with its group mean. Both ignore
# missing values so one NA does not blank out a group's point range.
health_summary <- colitis_recode %>%
  group_by(cancer_yes) %>%
  summarise(
    sd = sd(no_health_insurance, na.rm = TRUE),
    no_health_insurance = mean(no_health_insurance, na.rm = TRUE)
  )

# na.rm is not a t.test() argument and had no effect; t.test() drops missing
# values itself.
t.test(no_health_insurance ~ cancer_yes, data = colitis_recode)

# Violin of the raw distribution with mean +/- 1 SD overlaid in red.
ggplot(colitis_recode, aes(cancer_yes, no_health_insurance)) +
  geom_violin(col = 'darkgray', trim = FALSE) +
  geom_pointrange(
    aes(ymin = no_health_insurance - sd, ymax = no_health_insurance + sd),
    data = health_summary,
    col = 'red'
  ) +
  theme_cowplot()

In [None]:
#### Self-rated physical health by cancer status
# Per-group SD and mean. sd must be computed first, because the second
# expression overwrites physical_health with its group mean. physical_health
# is NA for unmatched survey answers, so both statistics must ignore missing
# values or the whole group summary becomes NA.
health_summary <- colitis_recode %>%
  group_by(cancer_yes) %>%
  summarise(
    sd = sd(physical_health, na.rm = TRUE),
    physical_health = mean(physical_health, na.rm = TRUE)
  )

# na.rm is not a t.test() argument and had no effect; t.test() drops missing
# values itself.
t.test(physical_health ~ cancer_yes, data = colitis_recode)

# Violin of the raw distribution with mean +/- 1 SD overlaid in red.
ggplot(colitis_recode, aes(cancer_yes, physical_health)) +
  geom_violin(col = 'darkgray', trim = FALSE) +
  geom_pointrange(
    aes(ymin = physical_health - sd, ymax = physical_health + sd),
    data = health_summary,
    col = 'red'
  ) +
  theme_cowplot()