0999_functions.Rmd

---
title: "0999_functions"
author: "Puvvula"
date: "2023-08-02"
output: pdf_document
---

#function to create imputed datasets - leftcenslognorm imputation - 50 sets (m = 50)
```{r}
# Impute each supplied data frame with mice and publish the results globally.
#
# For every data frame passed through `...`, mice() is run with m = 50,
# maxit = 20 and seed = 2023. Only the LAST column is imputed, using the
# "leftcenslognorm" method with a limit of detection (lod) of 0.1; every other
# column gets the empty method "" and lod NA. The mice result is then passed
# through clean_names().
#
# Side effect: each result is also assigned in the global environment under
# "imp_<argument expression>", e.g. generate_imputed_datasets(dat_dphp)
# creates `imp_dat_dphp`.
#
# Args:
#   ...: One or more data frames whose last column is the variable to impute.
#
# Returns:
#   An unnamed list holding one imputed object per input, in input order.
generate_imputed_datasets <- function(...) {
  datasets <- list(...)

  # The caller's argument expressions must be captured from `...` itself.
  # Calling substitute() on a loop-local variable only ever yields the name of
  # that local variable, which made every result overwrite the same global.
  dataset_names <- vapply(
    as.list(substitute(list(...)))[-1L],
    deparse1,
    character(1),
    USE.NAMES = FALSE
  )

  imputed_datasets <- vector("list", length(datasets))

  for (i in seq_along(datasets)) {
    dataset <- datasets[[i]]

    # Number of leading columns that are left untouched; the method and lod
    # vectors need one entry per column.
    n_other <- ncol(dataset) - 1L

    # NOTE(review): clean_names() is applied to the object returned by mice(),
    # not to a plain data frame - confirm this is intended.
    imputed_dataset <- dataset |>
      mice(
        m = 50,
        method = c(rep("", n_other), "leftcenslognorm"),
        maxit = 20,
        seed = 2023,
        lod = c(rep(NA, n_other), 0.1)
      ) |>
      clean_names()

    imputed_datasets[[i]] <- imputed_dataset
    assign(
      paste0("imp_", dataset_names[[i]]),
      imputed_dataset,
      envir = .GlobalEnv
    )
  }

  imputed_datasets
}
```

# Built GEE and pooling using examples from UCLA: 
#https://stats.oarc.ucla.edu/r/faq/how-do-i-perform-multiple-imputation-using-predictive-mean-matching-in-r/

#function to run GEE - using dataframes from MICE multiple imputation
```{r}
# Fit a geeglm model for every dataset/outcome pair and save each fit to disk.
#
# Args:
#   datasets:     Character vector of object NAMES (looked up with get()).
#                 The text after the last "_" in each name is used as the
#                 exposure variable, e.g. "imp_dat_dphp" -> "dphp".
#   outcomes:     Character vector of outcome variable names.
#   outputFolder: Folder for the .rda files; created if it does not exist.
#
# Each fit is saved as "<dataset>_<outcome>_<exposure>.rda" and holds a single
# object named `fit`.
#
# Returns:
#   NULL, invisibly. Called for its side effect of writing files.
runGeeglm <- function(datasets, outcomes, outputFolder) {
  stopifnot(
    is.character(datasets),
    is.character(outcomes),
    is.character(outputFolder),
    length(outputFolder) == 1L
  )

  # Make sure the destination exists BEFORE fitting anything, so a missing
  # folder cannot surface only at save() time after a model has been fitted.
  if (!dir.exists(outputFolder) &&
      !dir.create(outputFolder, recursive = TRUE)) {
    stop("Could not create output folder: ", outputFolder, call. = FALSE)
  }

  for (dataset in datasets) {
    # Exposure name = last "_"-separated token of the dataset name
    exposure <- tail(strsplit(dataset, "_")[[1]], 1)

    # Resolve the dataset object from its name
    data <- get(dataset)

    for (outcome in outcomes) {
      # `outcome` and `exposure` are resolved through get() inside the formula,
      # so the fitted term for the exposure is literally "log(get(exposure))".
      fit <- with(data, geeglm(get(outcome) ~ log(get(exposure))*visit +
                                gender + child_race + mat_age + mari_st_p4 + mom_edu_p4_cat +
                                mid_income_p4 + cotinine,
                              id = participant_id,
                              family = "gaussian", corstr = "independence"))

      # The object must be saved under the name `fit`: the extract functions
      # load the file and refer to that name.
      outputName <- file.path(
        outputFolder,
        paste0(dataset, "_", outcome, "_", exposure, ".rda")
      )
      save(fit, file = outputName)
    }
  }

  invisible(NULL)
}
```

#for testing only
```{r}
# Ad-hoc run (testing only): calls runGeeglm() for the four exposure datasets
# and the 27 outcomes listed below, writing results to the folder given.
runGeeglm(
  datasets = c(
    "imp_dat_dphp",
    "imp_dat_bcep",
    "imp_dat_bdcipp",
    "imp_dat_dnbp"
  ),
  outcomes = c(
    # "_c" outcomes
    "ss_std_score_c", "com_raw_c", "coop_raw_c", "assert_raw_c",
    "res_raw_c", "emp_raw_c", "eng_raw_c", "self_raw_c",
    "pb_std_score_c", "ext_raw_c", "bul_raw_c", "hyp_raw_c",
    "int_raw_c",
    # "_p" outcomes
    "ss_std_score_p", "com_raw_p", "coop_raw_p", "assert_raw_p",
    "res_raw_p", "emp_raw_p", "eng_raw_p", "self_raw_p",
    "pb_std_score_p", "ext_raw_p", "bul_raw_p", "hyp_raw_p",
    "int_raw_p", "as_raw_p"
  ),
  outputFolder = "~/Documents/ope_ssis/result/gee_mi_res_feb/"
)
```

#function to extract GEE estimates from rda objects
```{r}
# Pool the saved model fits in a folder and export selected rows to one CSV.
#
# Every "*.rda" file in `input_folder` is expected to contain an object named
# `fit`. The fit is pooled with pool(), tidied with confidence intervals, and
# rows 2, 12 and 13 of the tidied table are kept.
# NOTE(review): the row positions are hard-coded and depend on the term order
# of the fitted model - confirm they still select the intended terms.
#
# The exposure and outcome are parsed from the file name, which is expected to
# look like "imp_dat_<exposure>_<outcome>_<exposure>.rda": the first token
# after the "imp_dat_" prefix is the exposure, and everything between the
# first and the last token is the outcome.
#
# Args:
#   input_folder:  Folder containing the .rda files.
#   output_folder: Folder that receives "output.csv"; created if missing.
#
# Returns:
#   The value of write_csv(), i.e. the combined table (invisibly).
extractGEE <- function(input_folder, output_folder) {
  rda_files <- list.files(input_folder, pattern = "\\.rda$", full.names = TRUE)

  if (!dir.exists(output_folder) &&
      !dir.create(output_folder, recursive = TRUE)) {
    stop("Could not create output folder: ", output_folder, call. = FALSE)
  }

  # Extract the rows of interest from a single .rda file (NULL if unusable).
  extract_one <- function(file) {
    # Load into a private environment so that a file without `fit` cannot
    # silently reuse the `fit` object left over from a previously loaded file.
    file_env <- new.env(parent = emptyenv())
    load(file, envir = file_env)
    if (!exists("fit", envir = file_env, inherits = FALSE)) {
      warning("Skipping ", file, ": no object named 'fit' found",
              call. = FALSE)
      return(NULL)
    }
    fit <- file_env$fit

    extracted_data <- as_tibble(tidy(pool(fit), conf.int = TRUE)) |>
      slice(2, 12, 13)

    # File name without path or extension, then without the "imp_dat_" prefix
    filename <- sub("\\.[^.]+$", "", basename(file))
    trimmed_filename <- sub("^imp_dat_", "", filename)

    # First token = exposure; the tokens between first and last = outcome.
    # Negative indexing stays correct for short names, where `2:(n - 1)`
    # would count backwards.
    parts <- strsplit(trimmed_filename, "_", fixed = TRUE)[[1]]
    extracted_data$exposure <- parts[1]
    extracted_data$outcome <- paste(
      parts[-c(1L, length(parts))],
      collapse = "_"
    )

    extracted_data
  }

  # Build all pieces first and bind once instead of growing a data frame
  # inside a loop. The leading empty data.frame keeps the result type the
  # same as before.
  pieces <- Filter(Negate(is.null), lapply(rda_files, extract_one))
  data <- bind_rows(data.frame(), pieces)

  # Export the combined table as CSV
  output_file <- file.path(output_folder, "output.csv")
  write_csv(data, output_file)
}
```

#geeglm parallel
```{r}
# Fit geeglm models for every dataset/outcome pair, one forked job per dataset.
#
# Args:
#   datasets:     Character vector of object NAMES (looked up with get()).
#                 The text after the last "_" in each name is used as the
#                 exposure variable, e.g. "imp_dat_dphp" -> "dphp".
#   outcomes:     Character vector of outcome variable names.
#   covariates:   Character vector of covariate terms, joined with " + " and
#                 appended to the model formula.
#   outputFolder: Folder for the .rda files; created if it does not exist.
#
# Each fit is saved as "<dataset>_<outcome>_<exposure>.rda" and holds a single
# object named `fit`.
#
# Returns:
#   NULL, invisibly. Stops with an error if any dataset's job failed.
runGeeglmParallel <- function(datasets, outcomes, covariates, outputFolder) {
  # Make sure the destination exists before any model is fitted.
  if (!dir.exists(outputFolder) &&
      !dir.create(outputFolder, recursive = TRUE)) {
    stop("Could not create output folder: ", outputFolder, call. = FALSE)
  }

  # mclapply() forks its own workers, so no makeCluster()/stopCluster() pair
  # is needed; the previously created cluster was never used and leaked its
  # worker processes whenever an error occurred before stopCluster().
  num_cores <- detectCores()
  # detectCores() can return NA, and mclapply() does not support
  # mc.cores > 1 on Windows: fall back to a serial run in both cases.
  if (is.na(num_cores) || .Platform$OS.type == "windows") {
    num_cores <- 1L
  }

  results <- mclapply(datasets, function(dataset) {
    # Exposure name = last "_"-separated token of the dataset name
    exposure <- tail(strsplit(dataset, "_")[[1]], 1)

    # Resolve the dataset object from its name
    data <- get(dataset)

    for (outcome in outcomes) {
      # The exposure is referenced through get(exposure), so its fitted term
      # is literally named "log(get(exposure))".
      formula_str <- paste(outcome, "~ log(get(exposure))*visit +", paste(covariates, collapse = " + "))

      fit <- with(data, geeglm(formula(formula_str),
                               id = participant_id,
                               family = "gaussian",
                               corstr = "independence"))

      # The object must be saved under the name `fit` for the extract step.
      outputName <- file.path(
        outputFolder,
        paste0(dataset, "_", outcome, "_", exposure, ".rda")
      )
      save(fit, file = outputName)
    }

    NULL
  }, mc.cores = num_cores)

  # A failing forked job is returned as a "try-error" object rather than
  # raised; surface it instead of finishing as if everything had succeeded.
  failed <- vapply(results, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      "geeglm failed for dataset(s): ",
      paste(datasets[failed], collapse = ", "),
      "\nFirst error: ", as.character(results[failed][[1]]),
      call. = FALSE
    )
  }

  invisible(NULL)
}
```

#extract output from gee parallel
```{r}
# Pool saved model fits in parallel and export the exposure terms to one CSV.
#
# Every "*.rda" file in `input_folder` is expected to contain an object named
# `fit`. The fit is pooled with pool(), tidied with confidence intervals, and
# only rows whose `term` contains "log(get(exposure))" are kept (the exposure
# term and its interactions).
#
# The exposure and outcome are parsed from the file name, which is expected to
# look like "imp_dat_<exposure>_<outcome>_<exposure>.rda": the first token
# after the "imp_dat_" prefix is the exposure, and everything between the
# first and the last token is the outcome.
#
# Args:
#   input_folder:  Folder containing the .rda files.
#   output_folder: Folder that receives "output.csv"; created if missing.
#
# Returns:
#   The value of write_csv(), i.e. the combined table (invisibly).
extractGEEParallel <- function(input_folder, output_folder) {
  rda_files <- list.files(input_folder, pattern = "\\.rda$", full.names = TRUE)

  if (!dir.exists(output_folder) &&
      !dir.create(output_folder, recursive = TRUE)) {
    stop("Could not create output folder: ", output_folder, call. = FALSE)
  }

  # mclapply() forks its own workers, so no makeCluster()/stopCluster() pair
  # is needed; the previously created cluster was never used and leaked its
  # worker processes whenever an error occurred before stopCluster().
  num_cores <- detectCores()
  # detectCores() can return NA, and mclapply() does not support
  # mc.cores > 1 on Windows: fall back to a serial run in both cases.
  if (is.na(num_cores) || .Platform$OS.type == "windows") {
    num_cores <- 1L
  }

  results <- mclapply(rda_files, function(file) {
    # Load into a private environment so a file without `fit` cannot pick up
    # an unrelated `fit` object from an enclosing environment.
    file_env <- new.env(parent = emptyenv())
    load(file, envir = file_env)
    if (!exists("fit", envir = file_env, inherits = FALSE)) {
      return(NULL)
    }
    fit <- file_env$fit

    # Keep the exposure term and the exposure-by-visit interaction rows
    pooled <- as_tibble(tidy(pool(fit), conf.int = TRUE))
    is_exposure_term <- grepl("log\\(get\\(exposure\\)\\)", pooled$term)
    extracted_data <- pooled[is_exposure_term, , drop = FALSE]

    # File name without path or extension, then without the "imp_dat_" prefix
    filename <- sub("\\.[^.]+$", "", basename(file))
    trimmed_filename <- sub("^imp_dat_", "", filename)

    # First token = exposure; the tokens between first and last = outcome.
    # Negative indexing stays correct for short names, where `2:(n - 1)`
    # would count backwards.
    parts <- strsplit(trimmed_filename, "_", fixed = TRUE)[[1]]
    extracted_data$exposure <- parts[1]
    extracted_data$outcome <- paste(
      parts[-c(1L, length(parts))],
      collapse = "_"
    )

    extracted_data
  }, mc.cores = num_cores)

  # A failing forked job is returned as a "try-error" object rather than
  # raised; surface it here instead of passing it on to bind_rows().
  failed <- vapply(results, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      "Extraction failed for file(s): ",
      paste(rda_files[failed], collapse = ", "),
      "\nFirst error: ", as.character(results[failed][[1]]),
      call. = FALSE
    )
  }

  # Warnings raised inside forked workers are not relayed, so files without a
  # `fit` object are reported from the parent process.
  skipped <- vapply(results, is.null, logical(1))
  if (any(skipped)) {
    warning(
      "Skipped file(s) with no object named 'fit': ",
      paste(rda_files[skipped], collapse = ", "),
      call. = FALSE
    )
  }

  # Bind the per-file tables and export them as CSV
  data <- bind_rows(results[!skipped])
  output_file <- file.path(output_folder, "output.csv")
  write_csv(data, output_file)
}

```


Drop NR due to interference
```{r}
# Remove every participant that has at least one flagged measurement.
#
# A row is flagged when the column named by `last_variable_name` equals 9999.
# All rows sharing a participant_id with a flagged row are removed, not only
# the flagged rows themselves.
#
# Args:
#   data:               Data frame with a `participant_id` column.
#   last_variable_name: Name (string) of the column holding the 9999 code.
#
# Returns:
#   `data` without the rows of the flagged participants.
drop_NR_interference <- function(data, last_variable_name) {
  # which() discards NA comparisons, so missing values never flag a row
  flagged_rows <- which(data[[last_variable_name]] == 9999.000)
  flagged_ids <- data$participant_id[flagged_rows]

  keep_row <- !(data$participant_id %in% flagged_ids)
  data[keep_row, , drop = FALSE]
}

```