updates to network.predictability

o a more robust solution for categorical data with categories made to start at 1 but are later converted to their proper value on output o quadratic weighted kappa is used for `"Weighted"` accuracy when ordinal data are used
hfgolino · Feb 26, 2024 · 7d00fb4 · 7d00fb4
1 parent 1843285
commit 7d00fb4
Show file tree

Hide file tree

Showing 4 changed files with 192 additions and 106 deletions.
diff --git a/R/helpers.R b/R/helpers.R
@@ -2903,12 +2903,12 @@ continuous_accuracy <- function(prediction, observed)
 
 #' @noRd
 # Categorical Accuracy (for single variable) ----
-# Updated 19.02.2024
+# Updated 26.02.2024
 categorical_accuracy <- function(prediction, observed)
 {
 
   # Get maximum categories
-  max_category <- max(prediction, observed)
+  max_category <- max(prediction, observed, na.rm = TRUE)
 
   # Set category sequence
   category_sequence <- seq_len(max_category)
@@ -2928,38 +2928,35 @@ categorical_accuracy <- function(prediction, observed)
   # Populate accuracy table
   accuracy_table[table_names$observed, table_names$prediction] <- tabled
 
-  # Get empirical frequencies
-  frequency <- table(factor(observed, levels = category_sequence))
+  # Get frequencies
+  observed_frequency <- table(factor(observed, levels = category_sequence))
+  prediction_frequency <- table(factor(prediction, levels = category_sequence))
 
   # Get total values
-  total_values <- sum(frequency)
-
-  # Get maximum possible distance incorrect
-  max_distance <- pmax(
-    abs(max_category - category_sequence), # distance from maximum category
-    abs(min(category_sequence) - category_sequence) # distance from minimum category
-  )
-
-  # Get minimum possible weight accuracy
-  minimum_weighted <- sum(0.5^max_distance * frequency)
+  total_values <- sum(observed_frequency)
 
   # Get diagonal of table (correct predictions)
   correct <- diag(accuracy_table)
 
+  # Standardize tables
+  observed_standard <- observed_frequency / total_values
+  prediction_standard <- prediction_frequency / total_values
+  confusion_matrix <- accuracy_table / total_values
+
+  # Compute weights
+  weights <- outer(
+    category_sequence, category_sequence,
+    FUN = function(x, y){(x - y)^2}
+  )
+
   # Return accuracy by category
   return(
     c(
-      correct / frequency,
       accuracy = sum(correct, na.rm = TRUE) / total_values,
-      balanced = sum(
-        correct / rowSums(accuracy_table, na.rm = TRUE),
-        na.rm = TRUE
-      ) / max_category,
-      weighted = (sum(0.5^abs(prediction - observed), na.rm = TRUE) - minimum_weighted) /
-                 # weighted total
-                 (total_values - minimum_weighted)
-                 # normalization: all differences of 0 minus
-                 # all differences of maximum absolute difference
+      weighted = 1 - (
+        sum(weights * confusion_matrix) / # observed
+        sum(weights * tcrossprod(observed_standard, prediction_standard)) # expected
+      )
     )
   )
 

diff --git a/R/network.generalizability.R b/R/network.generalizability.R
@@ -238,7 +238,7 @@
 #' @export
 #'
 # Perform generalizability analysis ----
-# Updated 19.02.2024
+# Updated 26.02.2024
 network.generalizability <- function(
     data,
     # generalizability arguments
@@ -453,54 +453,32 @@ network.generalizability <- function(
       rbind, lapply(prediction_results, function(x){x$predictions})
     )
 
-    # Initialize adjusted predictions
-    adjusted_predictions <- prediction_matrix
-
     # Get flags
     flags <- attr(prediction_results[[1]]$results, "flags")
 
-    # Get column sequence
-    column_sequence <- seq_len(dimensions[2])
-
-    # Get ranges
-    ranges <- nvapply(
-      column_sequence, function(i){
-        range(prediction_matrix[,i], data[,i], na.rm = TRUE)
-      }, LENGTH = 2
-    )
-
-    # Ensure 'prediction_matrix' and 'data' start at 1
-    for(i in column_sequence){
+    # Check for categories
+    if(any(flags$categorical)){
 
-      # Check for categories
-      if(flags$categorical[[i]]){
+      # Set up for combined
+      combined <- rbind(data, prediction_matrix)
 
-        # Check for lowest category
-        minimum_value <- ranges[1,i]
+      # Get original sample size
+      original_n <- dimensions[1]
 
-        # Re-adjust minimum category to 1 for new data
-        if(minimum_value <= 0){
+      # Ensure categories start at 1
+      one_start_list <- ensure_one_start(combined, flags, original_n)
 
-          # Value to add
-          add_value <- abs(minimum_value) + 1
-
-          # Add value to 'data' and 'prediction_matrix'
-          data[,i] <- data[,i] + add_value
-          prediction_matrix[,i] <- prediction_matrix[,i] + add_value
-
-        }
-
-      }
+      # Sort out data
+      original.data <- one_start_list$original.data
+      newdata <- one_start_list$newdata
 
     }
 
     # Set up as if at the end of `network.predictability`
     metric_summary <- setup_results(
-      predictions = prediction_matrix,
-      adjusted_predictions = adjusted_predictions,
-      newdata = data, flags = flags,
+      predictions = newdata, newdata = original.data, flags = flags,
       betas = NULL, node_names = dimnames(data)[[2]],
-      dimensions = dimensions, dim_sequence = column_sequence
+      dimensions = dimensions, dim_sequence = seq_len(dimensions[2])
     )
 
     # Attach categories to results