In [None]:
# Set the knitr default chunk option `echo` to TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)

### The selector function

In [None]:

#' Backward predictor elimination with LightGBM over rolling test windows
#'
#' Each run trains one LightGBM regression of `log_conc` per train/test split,
#' scores the back-transformed predictions per tributary, ranks the predictors
#' and drops the lowest-ranked one before the next run. Runs continue until no
#' predictor would remain after the drop.
#'
#' The splits are fixed: test windows are the five water years ending in 2003,
#' 2008, 2013 and 2018; training uses water years from 1990 up to the year
#' before the test window. A split is skipped when either side has no rows.
#'
#' @param constituent_df Data frame with the predictor columns plus the
#'   columns `log_conc` (response), `wateryear`, `date` and `tributary`.
#' @param selector Ranking used to decide which predictor to drop:
#'   `"shap"` (mean over splits of SD of SHAP values + mean |SHAP|) or
#'   `"gain"` (mean over splits of LightGBM gain).
#' @param conc_scalars Object whose `$sd` and `$mean` elements undo the
#'   scaling of `log_conc`: `conc = 10^(log_conc * sd + mean)`.
#'
#' @return An unnamed list of two elements:
#'   1. the per-run predictor ranking (`Feature`, the ranking score
#'      `mean_sd_plus_imp` or `mean_Gain`, and the run number `model`);
#'   2. per-run performance (`model`, `all_vars`, and the mean and SD across
#'      splits of the per-tributary median KGE, NSE, MAE and PBIAS).
lgbm_selector <- function(constituent_df,
                          selector = "shap",
                          conc_scalars = NULL) {
  # Fail early on inputs that would otherwise break (or silently do nothing)
  # deep inside the loops.
  selector <- match.arg(selector, c("shap", "gain"))

  if (is.null(conc_scalars$sd) || is.null(conc_scalars$mean)) {
    stop("`conc_scalars` must provide `sd` and `mean` elements.",
         call. = FALSE)
  }

  test_end_years <- c(2003, 2008, 2013, 2018)

  ### Declare the hyperparameters
  ### These are just default for now
  hyperparams <- list(objective = "regression",
                      num_leaves = 31L,
                      learning_rate = 0.1,
                      min_data_in_leaf = 20L,
                      num_threads = 10L)

  all_model_stats <- list()
  vars <- list()

  ### Declare the predictors
  predictors <- constituent_df

  # The column count is only an upper bound on the number of runs; the loop
  # normally ends through the `break` below.
  for (i in seq_len(ncol(predictors))) {

    cat(crayon::yellow("\nModeling Run", i, "\n"))

    # Per-split results of this run, keyed by the test window's end year.
    fold_stats <- list()
    fold_shap <- list()
    fold_gain <- list()

    for (j in test_end_years) {

      test_years <- seq(j - 4, j, 1)
      train_years <- seq(1990, j - 5, 1)
      fold <- as.character(j)

      cat(crayon::cyan("\nTraining ", first(train_years), ":",
                       last(train_years),
                       "\n"))
      cat(crayon::magenta("\nTesting ", first(test_years), ":",
                          last(test_years),
                          "\n"))

      #### Training and testing data
      predictor_dataset_train <- predictors %>%
        filter(wateryear %in% train_years)

      predictor_dataset_test <- predictors %>%
        filter(wateryear %in% test_years)

      if (nrow(predictor_dataset_train) < 1 ||
          nrow(predictor_dataset_test) < 1) {
        next
      }

      ### Predictor matrices: everything except the response and id columns
      train_matrix <- data.matrix(
        predictor_dataset_train %>%
          dplyr::select(!c(log_conc, wateryear, date, tributary))
      )

      test_matrix <- data.matrix(
        predictor_dataset_test %>%
          dplyr::select(!c(log_conc, wateryear, date, tributary))
      )

      ### Wrap the training data in the dataset object lightgbm trains on
      train_lgbm <- lgb.Dataset(train_matrix,
                                label = predictor_dataset_train$log_conc)

      ### Train the model
      set.seed(913)

      nutrient_model_lgbm <- lgb.train(hyperparams,
                                       data = train_lgbm,
                                       verbose = 1L,
                                       nrounds = 100L)

      ### Predict with the model on test data
      # The matrix is passed positionally because the name of this argument
      # differs between lightgbm releases (`data` vs `newdata`).
      nutrient_predicted <- predict(nutrient_model_lgbm, test_matrix) %>%
        as_tibble() %>%
        rename(log_predicted_conc = 1)

      ### Rank the predictors for this split (only the ranking in use)
      if (selector == "shap") {
        fold_shap[[fold]] <- SHAPforxgboost::shap.prep(
          xgb_model = nutrient_model_lgbm,
          X_train = test_matrix
        ) %>%
          as_tibble() %>%
          dplyr::group_by(variable) %>%
          summarise(sd_shap = sd(value),
                    feature_importance = mean_value[1]) %>%
          mutate(sd_plus_imp = sd_shap + feature_importance)
      } else {
        fold_gain[[fold]] <- lgb.importance(nutrient_model_lgbm,
                                            percentage = TRUE)
      }

      ### Bind predictions on test data to observations of test data,
      ### then undo the scaling and the log10 transform
      predicted_observed_resc <- bind_cols(
        predictor_dataset_test %>%
          dplyr::rename(log_observed_conc = log_conc),
        nutrient_predicted
      ) %>%
        mutate(
          observed_conc =
            10^(log_observed_conc * conc_scalars$sd + conc_scalars$mean),
          predicted_conc =
            10^(log_predicted_conc * conc_scalars$sd + conc_scalars$mean)
        )

      #### Evaluate for each watershed ...
      model_stats_simple <- predicted_observed_resc %>%
        ungroup() %>%
        dplyr::group_by(tributary) %>%
        summarise(mae = hydroGOF::mae(predicted_conc, observed_conc),
                  nse = hydroGOF::NSE(predicted_conc, observed_conc),
                  kge = hydroGOF::KGE(predicted_conc, observed_conc),
                  pbias = hydroGOF::pbias(predicted_conc, observed_conc)) %>%
        dplyr::ungroup()

      #### ... then take the median across all watersheds
      fold_stats[[fold]] <- model_stats_simple %>%
        reframe(across(where(is.numeric),
                       list(median = ~median(.x)),
                       .names = "{.fn}_{.col}"))
    }

    if (length(fold_stats) == 0L) {
      stop("No train/test split contained data; check `wateryear`.",
           call. = FALSE)
    }

    all_model_stats[[i]] <- bind_rows(fold_stats) %>%
      mutate(model = i)

    ### Average the per-split scores and order from most to least important
    if (selector == "shap") {
      ranking <- bind_rows(fold_shap) %>%
        dplyr::group_by(variable) %>%
        summarise(mean_sd_plus_imp = mean(sd_plus_imp)) %>%
        dplyr::ungroup() %>%
        rename(Feature = variable)

      ranked <- ranking %>%
        arrange(desc(mean_sd_plus_imp))
    } else {
      ranking <- bind_rows(fold_gain) %>%
        dplyr::group_by(Feature) %>%
        summarise(mean_Gain = mean(Gain)) %>%
        dplyr::ungroup()

      ranked <- ranking %>%
        arrange(desc(mean_Gain))
    }

    vars[[i]] <- ranking %>%
      mutate(model = i)

    ### Drop the lowest-ranked predictor and see how many we have left
    kept_features <- as.character(head(ranked$Feature, -1))

    if (length(kept_features) == 0) break

    ### Update variable list
    predictors <- predictors %>%
      dplyr::select(dplyr::all_of(kept_features),
                    log_conc,
                    wateryear,
                    date,
                    tributary)
  }

  #### Assemble outputs
  all_var_imp <- bind_rows(vars)

  summary_model_stats <- bind_rows(all_model_stats) %>%
    ungroup()

  # One row per run: the predictors it used, joined to its per-split stats.
  collapsed_models <- all_var_imp %>%
    group_by(model) %>%
    arrange(Feature, .by_group = TRUE) %>%
    summarise(all_vars = paste(Feature, collapse = ",")) %>%
    full_join(summary_model_stats, by = "model")

  model_stats <- collapsed_models %>%
    dplyr::group_by(model, all_vars) %>%
    summarise(mean_kge = mean(median_kge, na.rm = TRUE),
              mean_nse = mean(median_nse, na.rm = TRUE),
              mean_mae = mean(median_mae, na.rm = TRUE),
              mean_pbias = mean(median_pbias, na.rm = TRUE),
              sd_kge = sd(median_kge),
              sd_nse = sd(median_nse),
              sd_mae = sd(median_mae),
              sd_pbias = sd(median_pbias),
              .groups = "drop_last")

  list(all_var_imp, model_stats)
}



## Plot variable selection stats

This function takes a data frame of model performance statistics generated by
the variable selection process and plots them against the model "number",
where model 1 uses all n variables and model n uses a single variable.

In [None]:

#' Plot variable-selection performance against model iteration
#'
#' Draws one facet per metric, showing the mean as a line with points and
#' mean +/- SD as error bars, with the model number on the x axis.
#'
#' @param model_stats_df Data frame with a `model` column plus
#'   `mean_kge`, `mean_nse`, `mean_mae`, `mean_pbias`, `sd_kge`, `sd_nse`,
#'   `sd_mae` and `sd_pbias`.
#' @return A ggplot object.
plot_stats <- function(model_stats_df) {

  pos_mods <- nrow(model_stats_df)

  # Label every model when there are few of them, otherwise every fifth.
  break_step <- if (pos_mods < 25) 1 else 5

  model_stats_df %>%
    dplyr::select(model,
                  mean_kge, mean_nse, mean_mae, mean_pbias,
                  sd_kge, sd_nse, sd_mae, sd_pbias) %>%
    # "mean_kge" -> column `mean`, metric "kge"; "sd_kge" -> column `sd`.
    pivot_longer(cols = -model,
                 names_to = c(".value", "metric"),
                 names_sep = "_") %>%
    ggplot() +
    geom_line(aes(x = model, y = mean, color = metric)) +
    geom_errorbar(aes(x = model,
                      ymin = mean - sd, ymax = mean + sd,
                      color = metric),
                  alpha = 0.4) +
    geom_point(aes(x = model, y = mean, color = metric),
               shape = 19, size = 1) +
    scale_color_brewer(palette = "Set1",
                       guide = "none") +
    scale_x_continuous(breaks = seq(0, pos_mods, break_step),
                       minor_breaks = seq(0, pos_mods, 1)) +
    # NULL drops the y-axis title; the facet strips on the left name the
    # metric instead.
    labs(y = NULL,
         x = "Model Iteration") +
    theme_few() +
    theme(panel.grid = element_line(color = "gray90"),
          legend.position = "bottom",
          strip.placement = "outside") +
    facet_wrap(~metric, scales = "free", ncol = 1,
               strip.position = "left")
}

### The runner function

In [None]:

#' Train a LightGBM model on a chosen predictor set and score it on test data
#'
#' Fits a LightGBM regression of `log_conc` on the columns named in
#' `chosen_mod$Feature`, predicts `test_df`, back-transforms observed and
#' predicted values with `conc_scalars`, and computes MAE, NSE, KGE and PBIAS
#' per tributary together with their medians across tributaries.
#'
#' @param train_df Training data frame; must contain `log_conc` and the
#'   columns named in `chosen_mod$Feature`.
#' @param test_df Test data frame; must contain `log_conc`, `tributary`,
#'   `date` and the columns named in `chosen_mod$Feature`.
#' @param chosen_mod Object whose `$Feature` element names the predictors.
#' @param is_tuned `FALSE` to use the default hyperparameters, `TRUE` to
#'   take them from `tuned_params`.
#' @param tuned_params Object with `num_leaves`, `min_n`, `sample_size`,
#'   `trees` and `tree_depth` elements; used only when `is_tuned` is `TRUE`.
#' @param conc_scalars Object whose `$sd` and `$mean` elements undo the
#'   scaling of `log_conc`: `conc = 10^(log_conc * sd + mean)`.
#'
#' @return An unnamed list of three elements: per-tributary statistics, the
#'   medians of those statistics, and the test rows with `tributary`, `date`,
#'   `observed_conc` and `predicted_conc`.
lgbm_runner <- function(train_df = NULL,
                        test_df = NULL,
                        chosen_mod = NULL,
                        is_tuned = FALSE,
                        tuned_params = NULL,
                        conc_scalars = NULL) {

  ########## Input checks ######################################################

  use_tuned <- as.logical(is_tuned)

  if (length(use_tuned) != 1L || is.na(use_tuned)) {
    stop("`is_tuned` must be a single TRUE or FALSE.", call. = FALSE)
  }

  if (use_tuned && is.null(tuned_params)) {
    stop("`tuned_params` is required when `is_tuned` is TRUE.",
         call. = FALSE)
  }

  if (is.null(conc_scalars$sd) || is.null(conc_scalars$mean)) {
    stop("`conc_scalars` must provide `sd` and `mean` elements.",
         call. = FALSE)
  }

  ########## Model setup #######################################################

  ### Only the chosen features go into the model matrices; the remaining
  ### columns stay in the data frames for interpretability.
  features <- as.character(chosen_mod$Feature)

  train_matrix <- data.matrix(train_df %>%
                                dplyr::select(dplyr::all_of(features)))

  test_matrix <- data.matrix(test_df %>%
                               dplyr::select(dplyr::all_of(features)))

  ### Wrap the training data in the dataset object lightgbm trains on
  train_lgbm <- lgb.Dataset(train_matrix, label = train_df$log_conc)

  ### Declare the hyperparameters
  if (use_tuned) {
    # NOTE(review): `num_iterations` is set here while `nrounds` is also
    # passed to lgb.train() below - confirm which one the installed lightgbm
    # version honours.
    hyperparams <- list(objective = "regression",
                        num_leaves = tuned_params$num_leaves,
                        min_data_in_leaf = tuned_params$min_n,
                        bagging_fraction = tuned_params$sample_size,
                        bagging_freq = 1,
                        num_iterations = tuned_params$trees,
                        max_depth = tuned_params$tree_depth)
  } else {
    hyperparams <- list(objective = "regression",
                        num_leaves = 31L,
                        learning_rate = 0.1,
                        min_data_in_leaf = 20L,
                        num_threads = 10L)
  }

  ########## Modeling ##########################################################

  ### Inform what we are running:
  cat(crayon::cyan("Running model with Features:"),
      crayon::green(chosen_mod$Feature),
      crayon::green("\n"))

  #### Train the model
  set.seed(913)

  model_lgbm <- lgb.train(hyperparams,
                          data = train_lgbm,
                          verbose = 1L,
                          nrounds = 100L)

  ### Predict with the model on test data
  # The matrix is passed positionally because the name of this argument
  # differs between lightgbm releases (`data` vs `newdata`).
  predicted <- predict(model_lgbm, test_matrix) %>%
    as_tibble() %>%
    rename(log_predicted_conc = 1)

  ### Bind predictions on test data to observations
  pred_obs <- bind_cols(test_df %>%
                          dplyr::rename(log_observed_conc = log_conc),
                        predicted)

  ### Now use the scalars to convert back to linear scale.
  ### Observed and predicted values share the same sd and mean.
  rescale_pred_obs <- pred_obs %>%
    mutate(
      observed_conc =
        10^(log_observed_conc * conc_scalars$sd + conc_scalars$mean),
      predicted_conc =
        10^(log_predicted_conc * conc_scalars$sd + conc_scalars$mean)
    )

  ########## Evaluation ########################################################

  #### For each watershed
  model_stats_each <- rescale_pred_obs %>%
    ungroup() %>%
    dplyr::group_by(tributary) %>%
    summarise(mae = hydroGOF::mae(predicted_conc, observed_conc),
              nse = hydroGOF::NSE(predicted_conc, observed_conc),
              kge = hydroGOF::KGE(predicted_conc, observed_conc),
              pbias = hydroGOF::pbias(predicted_conc, observed_conc)) %>%
    dplyr::ungroup()

  #### Median across all
  model_stats_summary <- model_stats_each %>%
    reframe(across(where(is.numeric),
                   list(median = ~median(.x)),
                   .names = "{.fn}_{.col}"))

  ##### Return stuff
  list(
    model_stats_each,
    model_stats_summary,
    rescale_pred_obs %>%
      dplyr::select(tributary, date, observed_conc, predicted_conc)
  )
}