# "Hello World" Example in R

In [1]:
library(glmnet)
library(rjson)
library(tidyverse)
library(MASS)

Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ purrr::accumulate() masks foreach::accumulate()
✖ tidyr::expand()     masks Matrix::expand()
✖ dplyr::filter()     masks stats::filter()
✖ dplyr::lag()        masks stats::lag()
✖ purrr::when()       masks foreach::when()

At

## Researcher Loads in Data

In [2]:
3
4

## Selection Algorithm (function)

Relevant metadata:

```
"capture_selection": [
  {
    "selection_type": "set",
    "encoder": "dataframe",
    "name": "selected_vars"
  }
]
```

Using a map/dictionary metaphor, the "keys" are universal elements of the algorithm. The "values" are the corresponding names of these elements in the notebook code.

After the initial run of the preprocessor on the full dataset, we add an additional entry to the metadata:

```
"original_selection": "some base 64 string"
```

where `"some base 64 string"` is a base64-encoded string of the data frame of selected variables.

In [3]:
selection_algorithm <- function(X, y) {
    # Repeated-subsample lasso selection.
    #
    # A lasso path is first fitted on the full data to obtain a grid of lambda
    # values. Then, `ntries` times, a lasso is refitted on a random half of
    # the rows with standard-normal noise added to the response, and for each
    # (variable, lambda) pair we count how often the coefficient is non-zero.
    #
    # Args:
    #   X: predictor matrix (rows are observations, columns are variables).
    #   y: response vector with one entry per row of X.
    #
    # Returns:
    #   Numeric vector of column positions in the lambda grid for which more
    #   than 2 variables were non-zero in more than `min_success` tries.
    #
    # NOTE(review): the returned positions index lambda-grid columns
    # (MARGIN = 2), not predictors, although the caller stores the result as
    # `selected_vars` -- confirm this is intended.
    n_obs <- nrow(X)
    n_vars <- ncol(X)
    half_n <- floor(n_obs / 2)

    min_success <- 6
    ntries <- 10

    # Lambda grid: keep only the lambdas whose full-data lasso model satisfies
    #   number of selected variables < sqrt(0.8 * p)
    full_path <- cv.glmnet(X, y, alpha = 1)$glmnet.fit
    lambda_grid <- full_path$lambda[full_path$df < sqrt(0.8 * n_vars)]

    # success[j, k]: number of tries in which variable j had a non-zero
    # coefficient at lambda_grid[k].
    success <- matrix(0, nrow = n_vars, ncol = length(lambda_grid))

    for (try_idx in seq_len(ntries)) {
        keep <- sample(1:n_obs, half_n, replace = FALSE)
        Xsub <- X[keep, ]
        noisy_y <- y[keep] + rnorm(half_n)
        fit <- cv.glmnet(Xsub, noisy_y, alpha = 1)

        # Drop the intercept row before recording which coefficients survive.
        nonzero <- coef(fit, lambda_grid)[-1, ] != 0
        success <- success + nonzero
    }

    # One verdict per lambda-grid column.
    passes <- apply(
        X = success,
        MARGIN = 2,
        FUN = function(counts) sum(counts > min_success) > 2
    )
    as.numeric(which(passes != 0))
}

# Run the selection on the loaded data (first column of y as the response)
# and store the result as a one-column data frame named `selection`.
selected_vars <- data.frame(
    selection = selection_algorithm(as.matrix(X), y[, 1])
)
#selected_vars

## Sufficient Statistics, Estimators, Simulation

Relevant metadata:

```
"functions": "stats_computations",
"data_model": {
  "sufficient_statistics": "compute_sufficient_statistics",
  "estimators": "compute_estimators",
  "resample_data": "resample_data"
}
```

The "keys" are universal elements of the algorithm. The "values" are the corresponding names of these elements in the notebook code.

In [4]:
# original
#compute_sufficient_statistics <- function(X, y, fixed_selection) {
#    fixed_selection <- fromJSON(fixed_selection)
#    return(list(t(X) %*% y, sum(y^2)))
#}

# fix attempt
compute_sufficient_statistics <- function(data, fixed_selection) {
    # Sufficient statistics of the data, returned as a data frame.
    #
    # Args:
    #   data: list-like object with elements "X" and "y", each coercible
    #     with as.matrix().
    #   fixed_selection: JSON string. It is parsed with fromJSON() but the
    #     parsed value is not used in the computation.
    #
    # Returns:
    #   Data frame with a single column `combined` holding the entries of
    #   t(X) %*% y followed by sum(y^2).

    # Kept so that behaviour on unparsable input is unchanged.
    fromJSON(fixed_selection)

    design <- as.matrix(data[["X"]])
    response <- as.matrix(data[["y"]])

    combined <- c(t(design) %*% response, sum(response^2))
    data.frame(combined)
}

compute_estimators <- function(suff_stat, fixed_sel, set_sel) {
    # Least-squares coefficient estimates for the set-selected variables.
    #
    # Args:
    #   suff_stat: accepted for interface compatibility; not used here.
    #   fixed_sel: JSON string. Parsed with fromJSON() but otherwise unused.
    #   set_sel: JSON string whose "selected_vars" entry holds the positions
    #     of the coefficients to return.
    #
    # Returns:
    #   Numeric vector with the selected entries of
    #   ginv(t(X) %*% X) %*% (t(X) %*% y).
    #
    # NOTE(review): X and y are not arguments; they are read from the
    # enclosing environment. suff_stat does not carry t(X) %*% X, so the
    # estimator cannot be computed from it alone -- confirm the intended
    # data flow.

    # Kept so that behaviour on unparsable input is unchanged.
    fromJSON(fixed_sel)
    set_sel <- fromJSON(set_sel)

    # `[[` extracts the entry itself (single `[` would return a list, which
    # is not a valid subscript); unlist() also flattens a nested encoding.
    sel_idx <- unlist(set_sel[["selected_vars"]], use.names = FALSE)

    # Pseudo-inverse of the Gram matrix times X'y. The inverse must be taken
    # of t(X) %*% X only, not of the whole product.
    beta_hat <- ginv(t(X) %*% X) %*% (t(X) %*% y)
    beta_hat[sel_idx]
}

# fix attempt
resample_data <- function(data, fixed_sel) {
    # Simulate a new dataset with the same X and a fresh response.
    #
    # The response is currently pure standard-normal noise: the original y
    # is multiplied by 0 (keeping only its matrix shape) before the noise is
    # added.
    #
    # Args:
    #   data: list-like object with elements "X" and "y", each coercible
    #     with as.matrix().
    #   fixed_sel: not used by the current implementation.
    #
    # Returns:
    #   list(X = <matrix>, y = <simulated response matrix>).
    design <- as.matrix(data[["X"]])
    zero_response <- 0 * as.matrix(data[["y"]])

    # Disabled residual-bootstrap variant, kept for reference:
    #fixed_sel <- fromJSON(fixed_sel)
    #n <- nrow(X)
    #p <- ncol(X)
    #resids <- y - X %*% ginv(t(X) %*% X) %*% (t(X) %*% y)
    #fitted <- y - resids
    #resampled <- sample(1:n, n, replace=TRUE)
    #y_tilde <- fitted + resids[resampled]

    noise <- rnorm(n = nrow(zero_response))
    list(X = design, y = zero_response + noise)
}

# Test/display resampling
#data = list(X = X, y = y)
#resample_data(data, selected_vars)

In [5]:
# Simulate data test
#resample_data(data, "{}")
# Debug check: print the name of every element of `data`.
# NOTE(review): `data` is only assigned in commented-out code in the cells
# shown here; if no earlier cell defines it, `data` resolves to the
# `utils::data` function, `names()` returns NULL, and nothing is printed --
# confirm where `data` is meant to come from.
for(name in names(data)) {
    print(name)
}

#X1 = as.matrix(data[["X"]])
#y1 = as.matrix(data[["y"]])
#n <- nrow(X1)
#p <- ncol(X1)
#resids <- y1 - X1 %*% ginv(t(X1) %*% X1) %*% (t(X1) %*% y1)
#fitted <- y - resids

## Selection Indicators

Here, we define two choices of selection indicator functions -- one for fixed selection and one for set selection. The user can pick either, depending on the type of inference being performed.

Relevant metadata:

```
"functions": "sel_indicators",
"data_model": {
  "selection_indicator_function": "get_fixed_sel_indicator"
}
```

In [6]:
# TODO: In preprocessor, inject a cell that saves the original selection in the
# kernel. This allows us to compare original_sel_vars with the simulated
# sel vars later.

get_fixed_sel_indicator <- function(original_selection, simulated_selection) {
    # Generates a single indicator for the fixed selection of the simulated
    # data compared to the original sample.
    #
    # Args:
    #   original_selection: list-like object with a "selected_vars" entry.
    #   simulated_selection: list-like object with a "selected_vars" entry.
    #
    # Returns:
    #   TRUE if the two "selected_vars" entries are equal (as judged by
    #   all.equal()), FALSE otherwise.

    # all.equal() returns a character description of the differences, never
    # FALSE, when its arguments differ; isTRUE() collapses that to a single
    # logical value.
    isTRUE(all.equal(
        original_selection['selected_vars'],
        simulated_selection['selected_vars']
    ))
}

get_set_sel_indicators <- function(original_selection, simulated_selection) {
    # Generates a one-column data frame of selection indicators for the set
    # selection of the simulated data compared to the original sample.
    #
    # Args:
    #   original_selection: list-like object whose "selected_vars" entry is a
    #     two-dimensional object (e.g. a data frame) with the originally
    #     selected variables in its first column.
    #   simulated_selection: same structure, for the simulated data.
    #
    # Returns:
    #   Data frame with one logical column `set_sel_indicators` and one row
    #   per originally selected variable: TRUE if that variable also appears
    #   in the simulated selection. Zero rows if the original selection is
    #   empty.
    #
    # This specific function assumes the original selection is
    # one-dimensional (only the first column is compared).
    original_sel_vars <- original_selection[['selected_vars']]
    simulated_sel_vars <- simulated_selection[['selected_vars']]

    # Vectorized membership test. Unlike a loop over 1:n, this yields an
    # empty result rather than a spurious FALSE row when nothing was
    # originally selected.
    set_sel_indicators <- original_sel_vars[, 1] %in% simulated_sel_vars[, 1]

    data.frame(set_sel_indicators = set_sel_indicators)
}


set_sel_indicators
<lgl>
True
True
True
True
True
True
True
True
True
True


In [18]:
#library(IRdisplay)

#IRdisplay::display_json('[2,3,4]')
#IRdisplay:::display_raw('application/mytype', FALSE, toJSON(selected_vars), NULL)
#IRdisplay:::display_raw('application/mytype', TRUE, as.raw(c(3,4,5,7,9)), NULL)
#?raw