App v1

kellobri · Nov 6, 2018 · 9591444 · 9591444
commit 9591444
Show file tree

Hide file tree

Showing 8 changed files with 290 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.Rhistory
+.Rproj.user/
diff --git a/app.R b/app.R
@@ -0,0 +1,80 @@
+#
+# Shiny in production course
+#
+
+library(shiny)
+library(shinythemes)
+library(glue)
+library(lime)
+library(billboarder)
+
+pred <- readRDS("lime_prediction_results.RDS")
+
+ui <- fluidPage(
+    theme = shinytheme("simplex"),
+    titlePanel("RStudio Conf 2019 - Shiny in Production Workshop"),
+
+    sidebarLayout(
+        sidebarPanel(
+            selectInput("student", "Student Lookup:", choices = application_data$ID),
+            htmlOutput("risk"),
+            hr(),
+            htmlOutput("major"),
+            htmlOutput("minor"),
+            hr(),
+            HTML('<center><img src="rstudio.png"></center>'),
+            HTML('<center><p>Shiny in Production Workshop 2019</p></center>')
+        ),
+
+        # Show a plot of the generated distribution
+        mainPanel(
+            tabsetPanel(
+                tabPanel("LIME Feature Plot", billboarderOutput("limeStudent")),
+                tabPanel("Cachable Plot", HTML('<center><p>TBD</p></center>')),
+                tabPanel("Data Drill Down", HTML('<center><p>TBD</p></center>'))
+            )
+        )
+    )
+)
+
+server <- function(input, output) {
+
+    obs_row <- reactive({
+        obs_num <- application_data %>% filter(ID == !!input$student)
+        obs_num$ID
+    })
+
+    output$risk <- renderText({
+        risk_pred <- application_data %>% filter(ID == !!input$student)
+        if (risk_pred$predict == 'No'){
+            amt <- 'Low Risk'
+        } else { amt <- 'Elevated Risk'}
+        glue('<h4 style="text-align:center; font-weight:bold; color:#7b8a8b;">Assessment: {amt}</h4>') %>% HTML
+    })
+
+    output$major <- renderText({
+        glue('<h4 style="font-weight:bold;">Major: </h4><p>{application_data[obs_row(),]$major}</p>') %>% HTML
+    })
+
+    output$minor <- renderText({
+        glue('<h4 style="font-weight:bold;">Minor: </h4><p>{application_data[obs_row(),]$minor}</p>') %>% HTML
+    })
+
+    output$limeStudent <- renderBillboarder({
+        #plot_features(pred[[obs_row()]])
+        prediction_data <- pred[[obs_row()]] %>% select(feature_desc, feature_weight, risk_predictor)
+        billboarder() %>%
+            bb_barchart(
+                data = prediction_data,
+                mapping = bbaes(x = feature_desc, y = feature_weight, group = risk_predictor),
+                rotated = TRUE,
+                stacked = TRUE
+            ) %>%
+            bb_colors_manual('Low Risk' = "#417fe2", 'High Risk' = '#7f1c2e') %>%
+            bb_title(text = glue('Feature Contributions to Student Performance Risk'))
+    })
+
+}
+
+# Run the application 
+shinyApp(ui = ui, server = server)
diff --git a/app1.Rproj b/app1.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/data-prep/fake-data-gen-steps.R b/data-prep/fake-data-gen-steps.R
@@ -0,0 +1,49 @@
+
+
+# Categorical Variables
+
+majors <- c('shiny','rmarkdown','plumber','sparklyr','tidyverse')
+minors <- c('leaflet', 'tidyr', 'rgl', 'htmlwidgets', 'Rcpp', 'keras', 'tibbletime', 
+            'devtools', 'dplyr', 'lubridate', 'stringr', 'reticulate', 'ggplot2', 'carat',
+            'recipes', 'DT', 'httr', 'jsonlite', 'testthat', 'roxygen2', 'readxl', 'packrat',
+            'forcats', 'broom', 'purrr')
+
+#high_lookup <- high_lookup[1:32,]
+#low_lookup <- low_lookup[1:32,]
+
+create_low_risk <- function(x){
+  eval(parse(text=low_lookup[x,2]))
+}
+
+low_vals <- lapply(1:nrow(low_lookup), create_low_risk)
+
+list_low <- replicate(1200, lapply(1:nrow(low_lookup), create_low_risk), simplify=FALSE)
+lf <- lapply(list_low, unlist)
+low_frame <- as.data.frame(lf, stringsAsFactors = F)
+low_frame <- as.data.frame(t(low_frame))
+low_frame$risk <- "No"
+names(low_frame) <- t(low_lookup[,1])
+
+create_high_risk <- function(x){
+  eval(parse(text=high_lookup[x,2]))
+}
+
+high_vals <- lapply(1:nrow(high_lookup), create_high_risk)
+
+list_high <- replicate(300, lapply(1:nrow(high_lookup), create_high_risk), simplify=FALSE)
+hf <- lapply(list_high, unlist)
+high_frame <- as.data.frame(hf, stringsAsFactors = F)
+high_frame <- as.data.frame(t(high_frame))
+high_frame$risk <- "Yes"
+names(high_frame) <- t(high_lookup[,1])
+
+
+rstudio_students <- rbind(low_frame, high_frame) 
+rownames(rstudio_students) <- NULL
+rstudio_students$student_id <- c(1:1500)
+
+write.csv(rstudio_students, "~/Downloads/rstudio-student-data.csv")
+#colnames(rstudio_student_data)[colnames(rstudio_student_data)=="X34"] <- "risk"
+
+
+skim(rstudio_students)
diff --git a/data-prep/model-build-steps.R b/data-prep/model-build-steps.R
@@ -0,0 +1,146 @@
+library(h2o)
+library(tidyverse)
+library(readxl)     
+library(lime)       
+library(recipes)    
+
+# Import Data: rstudio_students.csv
+View(rstudio_student_data)
+
+students_tbl <- rstudio_student_data %>%
+  mutate_if(is.character, as.factor) %>%
+  select(student_id, risk, major, minor, everything())
+
+recipe_cleanup <- students_tbl %>%
+  recipe(formula = risk ~ .) %>%
+  step_rm(student_id) %>%
+  step_zv(all_predictors()) %>%
+  step_center(all_numeric()) %>%
+  step_scale(all_numeric()) %>%
+  prep(data = students_tbl)
+
+
+student_bake <- bake(recipe_cleanup, newdata = students_tbl) 
+
+h2o.init()
+
+rstudio_h2o <- as.h2o(student_bake)
+
+h2o_split <- h2o.splitFrame(rstudio_h2o, ratios = c(0.7, 0.15), seed = 222)
+
+train_h2o <- h2o.assign(h2o_split[[1]], "train" ) # 70%
+valid_h2o <- h2o.assign(h2o_split[[2]], "valid" ) # 15%
+test_h2o  <- h2o.assign(h2o_split[[3]], "test" )  # 15%
+
+y <- "risk"
+x <- setdiff(names(train_h2o), c('risk'))
+
+automl_models_h2o <- h2o.automl(
+  x = x, 
+  y = y,
+  training_frame    = train_h2o,
+  leaderboard_frame  = valid_h2o,
+  max_runtime_secs  = 60
+)
+
+automl_leader <- automl_models_h2o@leader
+
+# Make Predictions on Test Data
+risk_predictions <- h2o.predict(
+  object = automl_leader,
+  newdata = test_h2o)
+
+risk_pred <- as.data.frame(risk_predictions)
+risk_pred <- tibble::rowid_to_column(risk_pred, "ID")
+test_data <- as.data.frame(test_h2o)
+test_data <- tibble::rowid_to_column(test_data, "ID")
+
+application_data <- left_join(risk_pred, test_data)
+
+# LIME explainer
+
+explainer <- lime::lime(
+  application_data[,-c(1:5)],
+  model          = automl_leader, 
+  bin_continuous = FALSE
+)
+
+obs <- 1 # This is the observation (employee position in test data set) to explain 
+set.seed(222)
+
+explanation <- lime::explain(
+  x              = application_data[obs,-c(1:5)], 
+  explainer      = explainer, 
+  n_labels       = 1, 
+  n_features     = 6,
+  n_permutations = 1000,
+  kernel_width   = 1
+)
+
+plot_features(explanation)
+
+
+# ------ #
+
+create_pred_vis <- function(obs){
+  gen_lime_exp <- function(obs){
+    single_explanation <- as.data.frame(test_h2o) %>% 
+      slice(obs) %>% 
+      select(-risk) %>%
+      lime::explain(
+        explainer  = explainer,
+        n_labels   = 1,
+        n_features = 6,
+        n_permutations = 1000,
+        kernel_width   = 1
+      ) %>%
+      as.tibble()
+    return(single_explanation)
+  }
+
+  explanation <- gen_lime_exp(obs)
+
+  type_pal <- c('Supports', 'Contradicts')
+  explanation$type <- factor(ifelse(sign(explanation$feature_weight) == 
+                                      1, type_pal[1], type_pal[2]), levels = type_pal)
+  description <- paste0(explanation$case, "_", explanation$label)
+  desc_width <- max(nchar(description)) + 1
+  description <- paste0(format(description, width = desc_width), 
+                        explanation$feature_desc)
+  explanation$description <- factor(description, levels = description[order(abs(explanation$feature_weight))])
+  explanation$case <- factor(explanation$case, unique(explanation$case))
+
+  explanation_plot_df <- explanation %>%
+    mutate(risk_predictor = case_when(
+      (label == 'Yes' & type == 'Supports') | (label == 'No' & type == 'Contradicts') ~ 'High Risk',
+      (label == 'Yes' & type == 'Contradicts') | (label == 'No' & type == 'Supports') ~ 'Low Risk'
+    )) %>%
+    arrange(-abs(feature_weight)) %>% 
+    head(20)
+
+  return(explanation_plot_df)
+}
+
+#create_pred_vis(1)
+
+# Runs for about 45 minutes
+# Produces a Large list 
+pred <- lapply(1:nrow(application_data), create_pred_vis)
+
+saveRDS(pred, "lime_prediction_results.RDS")
+saveRDS(application_data, "application_data.RDS")
+
+
+
+#####
+#####
+
+billboarder() %>%
+  bb_barchart(
+    data = pred[[1]],
+    mapping = bbaes(x = feature_desc, y = feature_weight, group = risk_predictor),
+    rotated = TRUE,
+    stacked = TRUE
+  ) %>%
+  bb_colors_manual('No' = "#95a5a6", 'Yes' = '#2C3E50') %>%
+  bb_title(text = glue('Feature Contributions to Student Dropout Risk'))
diff --git a/data/application_data.RDS b/data/application_data.RDS
diff --git a/data/lime_prediction_results.RDS b/data/lime_prediction_results.RDS
diff --git a/www/rstudio.png b/www/rstudio.png