In [1]:
library("phenopath")
library("reticulate")
np <- import("numpy")

In [2]:
# Seed R's random number generator so the notebook starts from a fixed RNG state.
set.seed(42)

In [3]:
# Fit PhenoPath with this notebook's fixed settings (model_mu = TRUE,
# maxiter = 50, thin = 10, verbose = FALSE). Anything supplied through `...`
# is forwarded to phenopath() unchanged.
phenopath_defaults <- function(observations, covariates, ...) {
    # Warnings are silenced on purpose: the author already knows the fit
    # has not converged.
    fit <- suppressWarnings(
        phenopath(
            observations,
            covariates,
            model_mu = TRUE,
            maxiter = 50,
            thin = 10,
            verbose = FALSE,
            ...
        )
    )
    return(fit)
}

In [4]:
# Spearman rank correlation between x and y.
spearman <- function(x, y) {
    rank_correlation <- cor(x, y, method = 'spearman')
    return(rank_correlation)
}

In [5]:
# Run settings. All of them except number_batches are used to build the
# input file name template below.
size <- 11
seed <- 42
number_droplets <- 100000
number_batches <- 5
results_dirname <- 'results'

# Template path of the per-iteration .npz inputs; the '#' is a placeholder
# that is later replaced by the iteration number.
# format(..., scientific = FALSE) keeps scientific notation out of the name.
base_filename <- paste0(
    results_dirname, '/',
    size, '_strains.seed_', seed, '.',
    format(number_droplets, scientific = FALSE), '_droplets.iteration_',
    '#', '.npz'
)

In [6]:
# Summarise a PhenoPath fit as a named list:
#   uncensored_results - interaction_effects() of the fit
#   censored_results   - those effects multiplied by
#                        significant_interactions() of the fit
#   pearson, spearman  - absolute correlation between true_times and the
#                        fit's trajectory()
get_results <- function(phenopath_results, true_times) {
    effects <- interaction_effects(phenopath_results)
    masked_effects <- significant_interactions(phenopath_results) * effects
    pseudotimes <- trajectory(phenopath_results)

    # Absolute values: a flipped pseudotime ordering carries the same
    # level of support.
    abs_pearson <- abs(cor(true_times, pseudotimes))
    abs_spearman <- abs(spearman(true_times, pseudotimes))

    return(list(
        "uncensored_results" = effects,
        "censored_results" = masked_effects,
        "pearson" = abs_pearson,
        "spearman" = abs_spearman
    ))
}

In [7]:
# Build the path of the .npz file holding the results of one iteration:
#   <base_dir>/<scaling>/iteration_<iteration_number>.npz
#
# base_dir:         directory that contains the per-scaling sub-directories
# scaling:          name of the sub-directory (the notebook passes
#                   'scaled' or 'unscaled')
# iteration_number: identifier inserted into the file name
# Returns the path as a character string.
get_results_filename <- function(base_dir, scaling, iteration_number) {
    iteration_filename <- paste0('iteration_', iteration_number, '.npz')
    # file.path() inserts the '/' separators itself, so the components are
    # passed separately instead of being pasted together first.
    results_filename <- file.path(base_dir, scaling, iteration_filename)
    return(results_filename)
}

# Write the four entries of `results` (uncensored_results, censored_results,
# pearson, spearman) to results_filename through numpy's savez_compressed,
# using the same names as keys.
save_results <- function(results_filename, results) {
    uncensored <- results$uncensored_results
    censored <- results$censored_results
    pearson_value <- results$pearson
    spearman_value <- results$spearman

    np$savez_compressed(
        results_filename,
        uncensored_results = uncensored,
        censored_results = censored,
        pearson = pearson_value,
        spearman = spearman_value
    )
}

In [8]:
# Create the output directory tree if it does not exist yet:
#   phenopath_results/log_count_covariates
#   phenopath_results/<each existing sub-directory>/scaled
#   phenopath_results/<each existing sub-directory>/unscaled
all_results_dir <- 'phenopath_results'
if (!dir.exists(all_results_dir)) {dir.create(all_results_dir)}

log_counts_cov_results_dir <- file.path(all_results_dir, 'log_count_covariates')
if (!dir.exists(log_counts_cov_results_dir)) {dir.create(log_counts_cov_results_dir)}

# list.files() also returns regular files; keep only directories, since a
# regular file cannot hold the scaled/unscaled sub-directories.
for (subdirectory in Filter(dir.exists, list.files(path = all_results_dir, full.names = TRUE))) {
    scaled_results_dir <- file.path(subdirectory, 'scaled')
    if (!dir.exists(scaled_results_dir)) {dir.create(scaled_results_dir)}

    unscaled_results_dir <- file.path(subdirectory, 'unscaled')
    if (!dir.exists(unscaled_results_dir)) {dir.create(unscaled_results_dir)}
}

Loop through the iterations of stored results.

In [9]:
# very bad naughty code that uses lots of global variables
# this is a self-contained notebook however and not a software library.
# I don't want to rewrite the function and the function calls to unnecessarily be
# much longer when scope rules automatically do the right thing.
# Process one stored iteration: load its read log counts, fit PhenoPath once
# with scale_y = TRUE and once with scale_y = FALSE, and save both result sets.
#
# Reads these notebook globals: np, base_filename, number_batches and
# log_counts_cov_results_dir, and calls phenopath_defaults, get_results,
# get_results_filename and save_results.
#
# iteration_number: value substituted for the '#' in base_filename and used
#                   in the names of the output files.
# Returns NULL; the function is called for its side effects (two result files
# written, two timings printed).
lapply_input <- function(iteration_number){

    filename <- gsub("#", iteration_number, base_filename)
    npzfile <- np$load(filename)
    # Close the loaded archive when this call finishes (also on error) rather
    # than leaving the open handle to garbage collection. The guard skips the
    # close if np$load handed back something other than a Python object.
    if (inherits(npzfile, "python.builtin.object")) {
        on.exit(npzfile$close(), add = TRUE)
    }

    read_log_counts <- npzfile[["read_log_counts"]]

    # Label consecutive, equal-sized chunks of rows with their batch index
    # 1..number_batches; these labels serve as the true times.
    merged_droplets_per_batch <- dim(read_log_counts)[1]/number_batches
    true_times <- rep(seq_len(number_batches), each = merged_droplets_per_batch)

    # Same fit / summarise / save / time sequence for both settings; the
    # scaling name is also the name of the output sub-directory.
    for (scaling in c('scaled', 'unscaled')) {
        start_time <- proc.time()
        fit <- phenopath_defaults(read_log_counts, read_log_counts,
                                  scale_y = (scaling == 'scaled'))
        results <- get_results(fit, true_times)
        save_results(get_results_filename(log_counts_cov_results_dir, scaling, iteration_number), results)
        run_time <- proc.time() - start_time; print(run_time)
    }

    # this is supposed to be an embarrassingly parallel for loop, so memory usage should not change with number of iterations
    # but system monitor shows memory usage continually increasing. Hadley Wickham seems to have said that calling `gc`
    # manually for garbage collection should never be necessary, but honestly at this point I don't trust R so...
    gc()

    # controversial stylistically but consistent with Python style and since most programming is done
    # in Python, from a practical perspective it's better for me to use stylistic conventions that also work in Python.
    return(NULL)
}

In [10]:
# Run lapply_input for iterations 1 through 100; each call prints two timings.
lapply(1:100, lapply_input)

   user  system elapsed 
766.556  52.104 831.443 
   user  system elapsed 
767.189  53.212 834.784 
   user  system elapsed 
765.004  55.501 832.677 
   user  system elapsed 
765.245  56.233 836.683 
   user  system elapsed 
765.278  53.066 825.537 
   user  system elapsed 
738.207  50.868 789.327 
   user  system elapsed 
739.839  47.867 783.766 
   user  system elapsed 
746.158  44.140 784.931 
   user  system elapsed 
748.642  45.021 787.916 
   user  system elapsed 
740.521  46.612 781.634 
   user  system elapsed 
740.594  45.384 780.110 
   user  system elapsed 
750.104  46.091 793.346 
   user  system elapsed 
788.982  60.471 881.122 
   user  system elapsed 
849.975  85.581 988.032 
   user  system elapsed 
810.886  74.224 915.748 
   user  system elapsed 
795.639  74.295 893.047 
   user  system elapsed 
783.710  71.611 874.704 
   user  system elapsed 
763.903  59.646 830.774 
   user  system elapsed 
776.877  61.619 848.783 
   user  system elapsed 
776.737  64.649 854.978 
