Here we select a subset of the most informative features using elastic net regression:

In [27]:
suppressMessages({
    suppressWarnings({
        library(glmnet, quietly = TRUE)
        library(RhpcBLASctl, quietly = TRUE)
        library(foreach, quietly = TRUE)
        library(doSNOW, quietly = TRUE)
    })
})

In [28]:
# Root directory holding the project's data files.
data_path <- "/nobackup/users/hmbaghda/metastatic_potential/"
# NOTE(review): `seed` is not referenced by any cell shown here -- confirm it
# is applied somewhere (e.g. via set.seed()) if reproducibility is intended.
seed <- 42

In [29]:
#' Select features with non-zero coefficients from a cross-validated glmnet fit
#'
#' Fits `glmnet::cv.glmnet()` and returns the names of the predictors whose
#' coefficients are non-zero at the best-scoring lambda among those lambdas
#' that keep at least one predictor in the model.
#' Documentation: https://glmnet.stanford.edu/reference/glmnet.html,
#' https://glmnet.stanford.edu/reference/cv.glmnet.html
#'
#' @param X Predictor matrix handed to `cv.glmnet()`; rows are counted as samples.
#' @param y Response. A numeric vector/matrix selects the "gaussian" family; a
#'   factor selects "binomial" (two levels) or "multinomial" (more levels).
#' @param options Named list. Entries missing from the list fall back to:
#'   `alpha` = 1 (lasso), `error.measure` = 'mse' (passed as `type.measure`),
#'   `subfolds` = 5 when there are at least 5 samples, otherwise 3. A supplied
#'   `subfolds` is clamped to the range [3, number of samples].
#' @param ... Further arguments forwarded to `glmnet::cv.glmnet()`.
#' @return Character vector with the names of the selected features;
#'   `character(0)` when no lambda produced a non-zero coefficient.
feature_select_reg <- function(X, y, options = list(alpha = 1, subfolds = 3, error.measure = 'mse'), ...) {
    # GLM family is inferred from the type of the response
    if (is.factor(y)) {
        if (nlevels(y) == 1) {
            stop("y is a factor with only one level")
        } else if (nlevels(y) == 2) {
            fam <- "binomial"
        } else {
            fam <- "multinomial"
        }
    } else if (is.numeric(y)) {
        fam <- "gaussian"
    } else {
        stop("y must be of type factor or numeric vector")
    }

    if (!("alpha" %in% names(options))) {
        options$alpha <- 1 # lasso
    }
    if (!("error.measure" %in% names(options))) {
        options$error.measure <- 'mse'
    }

    # cv.glmnet needs at least 3 folds, so we need at least three samples
    n_samples <- nrow(X)
    if (n_samples < 3) {
        stop("feature_select_reg() requires at least three samples for internal cross-validation")
    }

    if (!("subfolds" %in% names(options))) {
        if (n_samples >= 5) {
            options$subfolds <- 5
        } else {
            options$subfolds <- 3
        }
    } else {
        if (options$subfolds > n_samples) {
            message("Warning in feature_select_reg():")
            message("    options$subfolds greater than number of samples")
            message("    setting options$subfolds = number of samples")
            options$subfolds <- n_samples
        }
        if (options$subfolds < 3) {
            message("Warning in feature_select_reg():")
            message("    options$subfolds was less than 3")
            message("    setting options$subfolds = 3")
            options$subfolds <- 3
        }
    }

    # fit the penalized model with internal cross-validation
    mod <- glmnet::cv.glmnet(X, y, type.measure = options$error.measure, alpha = options$alpha,
                             family = fam, type.multinomial = "grouped",
                             nfolds = options$subfolds, ...)

    # lambdas that generated at least one non-zero coefficient
    indices <- which(mod$nzero > 0)
    if (length(indices) == 0) {
        # every lambda shrank all coefficients to zero: nothing to select
        # (previously an empty `s` was passed on to coef())
        return(character(0))
    }

    # best mean CV score among those lambdas; which.min() keeps the first
    # index on a tie (original note: a tie chooses the one with fewer features)
    best <- indices[which.min(mod$cvm[indices])]
    mod_coeffs <- coef(mod, s = mod$lambda[best]) # this can maybe be replaced with s = 'lambda.min'

    if (fam == "multinomial") {
        # with multiple response classes the coefficients come back as a list
        # with one matrix per class. type.multinomial = "grouped" forces a
        # feature to be selected for all classes or for none, so the first
        # element is enough to read off the selected features
        mod_coeffs <- mod_coeffs[[1]]
    }

    mod_coeffs <- mod_coeffs[-1, ] # remove intercept
    mod_coeffs <- mod_coeffs[mod_coeffs != 0] # keep only non-zero coefs

    names(mod_coeffs)
}

#' Repeat the glmnet feature selection and keep consistently selected features
#'
#' Calls `feature_select_reg()` `n_trials` times, counts how often each column
#' of `X` is selected and returns the features whose count exceeds
#' `threshold * n_trials`.
#'
#' @param X Predictor matrix. Columns without names are labelled V1, V2, ...
#' @param y Response passed through to `feature_select_reg()`.
#' @param options Named list. Entries missing from the list fall back to:
#'   `n_trials` = 100 (number of repetitions), `threshold` = 0.8 (fraction of
#'   trials a feature must exceed), `force_select` = TRUE (when nothing passes
#'   the threshold, return the most frequently selected feature(s) instead of
#'   an empty result), `alpha` = 1, `subfolds` = 3, `error.measure` = 'mse'
#'   (the last three are handed to `feature_select_reg()`).
#' @param par If TRUE, run the trials on a snow cluster through foreach/doSNOW.
#' @param n.cores Number of cluster workers used when `par` is TRUE.
#' @param ... Further arguments forwarded to `feature_select_reg()`.
#' @return Named numeric vector: selection counts of the retained features.
feature_select_iter <- function(X, y, options = list(n_trials = 100, threshold = 0.8, force_select = TRUE, 
                                                        alpha = 1, subfolds = 3, error.measure = 'mse'),
                                par = FALSE, n.cores = 1, ...) {

    # ----------------- BEGIN OPTIONS ----------------- #
    # fill in every option the caller did not supply
    defaults <- list(n_trials = 100, threshold = 0.8, force_select = TRUE,
                     alpha = 1, subfolds = 3, error.measure = 'mse')
    missing_opts <- setdiff(names(defaults), names(options))
    options[missing_opts] <- defaults[missing_opts]
    # ----------------- END OPTIONS ----------------- #

    # options understood by the single-run selector
    reg_options <- list(alpha = options$alpha, subfolds = options$subfolds,
                        error.measure = options$error.measure)

    # vector counting how often each feature is selected
    if (is.null(colnames(X))) {
        colnames(X) <- paste0('V', seq_len(ncol(X)))
    }
    feature_count <- rep(0, ncol(X))
    names(feature_count) <- colnames(X)

    # run the feature selector n_trials times and increment the counters
    # for the features that are selected
    if (!par) {
        # seq_len() so that n_trials = 0 runs no trial (1:0 would run two)
        for (trial in seq_len(options$n_trials)) {
            features <- feature_select_reg(X, y, options = reg_options, ...)
            feature_count[features] <- 1 + feature_count[features]
        }
    } else {
        # limit core usage; never request 0 threads (round(1 / 2) is 0)
        RhpcBLASctl::blas_set_num_threads(max(1, round(n.cores / 2)))
        cl <- makeCluster(n.cores)
        # release the workers even when a trial fails
        on.exit(stopCluster(cl), add = TRUE)
        registerDoSNOW(cl)
        pb <- txtProgressBar(max = options$n_trials, style = 3)
        # after = FALSE: close the progress bar before the cluster is stopped
        on.exit(close(pb), add = TRUE, after = FALSE)
        progress <- function(n_) setTxtProgressBar(pb, n_)
        opts <- list(progress = progress)

        # .combine = 'c' concatenates the per-trial name vectors into one
        # character vector, so every occurrence of a name is one selection
        features.all <- foreach(trial = seq_len(options$n_trials), .combine = 'c',
                                .packages = c('glmnet'), .export = c('feature_select_reg'),
                                .verbose = TRUE, .options.snow = opts) %dopar% {
            feature_select_reg(X, y, options = reg_options, ...)
        }

        for (feature in features.all) {
            feature_count[feature] <- 1 + feature_count[feature]
        }
    }

    # keep the features selected in more than threshold * n_trials trials.
    # A logical mask is used on purpose: x[-which(cond)] returns an empty
    # vector when `cond` is FALSE everywhere, which dropped every feature
    # exactly when all of them passed the threshold.
    cutoff <- options$threshold * options$n_trials
    selected <- feature_count[feature_count > cutoff]
    if (length(selected) == 0 && options$force_select) {
        # nothing passed: fall back to the most frequently selected feature(s)
        selected <- feature_count[feature_count == max(feature_count)]
    }
    selected
}

Load files:

In [30]:
# Read the train/validation response and predictors from the interim data
# directory (first CSV column supplies the row names), then bind the
# data-frame columns into plain matrices.
y <- as.matrix(do.call(
    cbind,
    read.csv(file.path(data_path, "interim", "y_train_val.csv"), row.names = 1)
))
X <- as.matrix(do.call(
    cbind,
    read.csv(file.path(data_path, "interim", "X_train_val.csv"), row.names = 1)
))

Run the elastic-net feature selection:

In [31]:
# Settings for the repeated selection: 10 trials, keep features selected in
# more than 80% of them, elastic-net mixing alpha = 0.05, 10 CV folds.
options <- list(
    n_trials = 10,
    threshold = 0.8,
    force_select = TRUE,
    alpha = 0.05,
    subfolds = 10,
    error.measure = "mse"
)

# Run the trials in parallel on 20 workers and store the selection counts.
selected <- feature_select_iter(X, y, options = options, par = TRUE, n.cores = 20)
write.csv(
    as.matrix(selected),
    paste0(data_path, "interim/", "depr_selected_features.csv")
)

  |                                                                      |   0%progress will be called as each result is returned
discovered package(s): 
automatically exporting the following variables from the local environment:
  options, X, y 
explicitly exporting variables(s): feature_select_reg
explicitly exporting package(s): glmnet
numValues: 10, numResults: 0, stopped: TRUE
numValues: 10, numResults: 1, stopped: TRUE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
returning status FALSE
first call to combine function
evaluating call object to combine results:
  fun(result.1, result.2, result.3, result.4, result.5, result.6, 
    result.7, result.8, result.9, result.10)
returning status TRUE


In [32]:
# Number of features returned by feature_select_iter()
length(selected)

In [34]:
# Read the response and predictors from the processed data directory (first
# CSV column supplies the row names), then bind the data-frame columns into
# plain matrices.
# NOTE(review): 'epxpr.csv' may be a misspelling of 'expr.csv' -- confirm the
# file name on disk before changing it.
y <- as.matrix(do.call(
    cbind,
    read.csv(file.path(data_path, "processed", "metastatic_potential.csv"), row.names = 1)
))
X <- as.matrix(do.call(
    cbind,
    read.csv(file.path(data_path, "processed", "epxpr.csv"), row.names = 1)
))

In [35]:
# Print the predictor matrix for a quick visual check
X

mean
-1.17698148
-3.66593752
-2.37415973
-2.31942513
2.10532121
1.10997062
0.72302415
1.20733428
-0.29942455
0.02634277


In [7]:
# Settings for the repeated selection: 10 trials, keep features selected in
# more than 80% of them, elastic-net mixing alpha = 0.05, 10 CV folds.
options <- list(
    n_trials = 10,
    threshold = 0.8,
    force_select = TRUE,
    alpha = 0.05,
    subfolds = 10,
    error.measure = "mse"
)

# Run the trials in parallel on 20 workers using the X and y currently in the
# session, then store the selection counts.
# NOTE(review): the output path is under 'interim/' and is named
# 'depr_selected_features.csv' -- confirm this is the intended destination.
selected <- feature_select_iter(X, y, options = options, par = TRUE, n.cores = 20)
write.csv(
    as.matrix(selected),
    paste0(data_path, "interim/", "depr_selected_features.csv")
)

R version 4.4.2 (2024-10-31)
Platform: powerpc64le-conda-linux-gnu
Running under: Red Hat Enterprise Linux 8.3 (Ootpa)

Matrix products: default
BLAS/LAPACK: /nobackup/users/hmbaghda/Software/miniforge3/envs/metastatic_potential/lib/libopenblasp-r0.3.28.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] doSNOW_1.0.20       snow_0.4-4          iterators_1.0.14   
[4] foreach_1.5.2       RhpcBLASctl_0.23-42 glmnet_4.1-8       
[7] Matrix_1.6-5       

loaded via a namesp