In [None]:
library(fs)
library(dplyr)
library(CytoML)
library(stringr)
library(ggplot2)
library(flowCore)
library(magrittr)
library(openCyto)
library(ggridges)
library(lubridate)
library(flowWorkspace)

filter <- dplyr::filter

In [None]:
# Ten hex colours, ordered dark to light; passed to scale_fill_gradientn()
# as the fill gradient of the ridgeline plots.
color_pallette <- c(
  "#3f0061", "#003c56", "#5000a6", "#00590a", "#7e6000",
  "#ff404e", "#ff78df", "#02d4ff", "#01efae", "#fff89f"
)

# Directory holding the FCS files referenced by the FlowJo workspace.
data_dir <- "/Users/sauterj1/Documents/Woodlist/Control_Cohort/"

# FlowJo workspace (.wsp) that defines the gating hierarchy.
workspace_file <- "/Users/sauterj1/Desktop/normalization 2017 for Jake tets 2.wsp"

# Open the workspace, then import the sample group 'All Samples' together
# with its gates, resolving the FCS files under data_dir.
flowjo_workspace <- workspace_file %>% CytoML::open_flowjo_xml()

gatingset <- flowjo_to_gatingset(
  flowjo_workspace,
  path = data_dir,
  includeGates = TRUE,
  name = 'All Samples'
)

# FCS file name of every sample in the gating set: the sample name up to and
# including ".fcs" (NA when a sample name contains no ".fcs").
fcs_filenames <-
  gatingset %>%
  sampleNames() %>%
  str_extract('^.*\\.fcs')

# Every .fcs file below data_dir (searched recursively). Reusing data_dir
# keeps this lookup pointed at the same directory the gating set was built
# from. fs::path() tidies the paths returned by list.files().
all_fcs_files <-
  list.files(data_dir,
             pattern = '\\.fcs$',
             full.names = TRUE,
             recursive = TRUE) %>%
  fs::path() %>%
  as.character()

# Map each sample's file name to exactly one path on disk.
#
# The name is matched literally (fixed()) because FCS file names routinely
# contain regex metacharacters such as "." or "(". vapply() guarantees a
# named character vector; a sample with zero or several candidate files is
# reported immediately instead of silently turning the result into a list.
fcs_files <-
  vapply(fcs_filenames, function(filename) {
    if (is.na(filename)) {
      stop('A sample name in the gating set does not contain ".fcs"',
           call. = FALSE)
    }

    candidates <-
      all_fcs_files[str_detect(all_fcs_files, fixed(filename))]

    # Several substring hits (e.g. "1.fcs" inside "11.fcs"): prefer the
    # file whose base name is exactly the requested name.
    if (length(candidates) > 1) {
      exact <- candidates[basename(candidates) == filename]
      if (length(exact) == 1) {
        candidates <- exact
      }
    }

    if (length(candidates) != 1) {
      stop('Expected exactly one FCS file matching "', filename,
           '" under ', data_dir, ' but found ', length(candidates),
           call. = FALSE)
    }

    candidates
  }, character(1))

# sampleNames(gatingset) <- 
#   paste0('Sample_', seq_along(gatingset))

# For M1 here are my suggestions
# CD15, use “mature grans”
# CD13, use “mature grans”
# CD33, use “monocytes final”
# CD117, Use CD117+ myeloid blasts
# CD34, same
# CD71 erythroid
# CD38 activated lymphocytes
# HLA-DR, B cells (use only positive ones)
# CD45 Lymphocytes final
# CD19 B cells
# 
# For negative controls use negative subsets of lymphocyte final. Please let me know if this works for you

# Getting Metadata on all samples 

### Which machines were these samples measured on?

In [None]:
# One (initially NULL) slot per sample in the gating set; filled with the
# parsed FCS metadata of that sample.
meta_info_list <- vector(mode = 'list', length = length(gatingset))


# Condense the keyword list of one FCS file into the fields used downstream.
#
# parsed_meta: named list of FCS keywords (as returned by description() on a
#              flowFrame).
#
# Returns a named list with the elements cytometer, cyt_num, date,
# baseline_date, setup_date, config_create_date, beads_lot_id, config_name,
# param_names and marker_names.
#
# Scalar keywords that are absent from the file are returned as NA instead
# of NULL (or an error), so every sample yields a list of the same shape and
# the values can later be compared with `==`.
parse_meta_info <- function(parsed_meta) {

  # Keyword value as-is, or NA when the keyword is missing.
  keyword_or_na <- function(value) {
    if (is.null(value) || length(value) == 0L) NA_character_ else value
  }

  # Text before the first "T" of a timestamp keyword (its date portion);
  # NA when the keyword is missing.
  date_part <- function(value) {
    value <- keyword_or_na(value)[[1]]
    if (is.na(value)) {
      return(NA_character_)
    }
    pieces <- strsplit(as.character(value), 'T', fixed = TRUE)[[1]]
    if (length(pieces) == 0L) '' else pieces[1]
  }

  baseline_date      <- date_part(parsed_meta$`CST BASELINE DATE`)
  setup_date         <- date_part(parsed_meta$`CST SETUP DATE`)
  config_create_date <- date_part(parsed_meta$`CYTOMETER CONFIG CREATE DATE`)

  beads_lot_id <- keyword_or_na(parsed_meta$`CST BEADS LOT ID`)
  config_name  <- keyword_or_na(parsed_meta$`CYTOMETER CONFIG NAME`)

  cytometer <- keyword_or_na(parsed_meta$`$CYT`)
  cyt_num   <- keyword_or_na(parsed_meta$CYTNUM)
  date      <- keyword_or_na(parsed_meta$`$DATE`)

  # Column names of the SPILL entry (NULL when there is no such entry).
  param_names <-
    colnames(parsed_meta$SPILL)

  # marker names: values of the "$P<n>S" keywords, sorted alphabetically.
  # The pattern is anchored so that only keywords of exactly that form are
  # picked up, not any keyword that merely contains "P<n>S".
  is_marker_keyword <-
    grepl('^\\$P[0-9]+S$', names(parsed_meta))

  marker_names <-
    sort(as.character(unlist(unname(parsed_meta[is_marker_keyword]))))

  list(
    cytometer = cytometer,
    cyt_num = cyt_num,
    date = date,
    baseline_date = baseline_date,
    setup_date = setup_date,
    config_create_date = config_create_date,
    beads_lot_id = beads_lot_id,
    config_name = config_name,
    param_names = param_names,
    marker_names = marker_names
  )
}



# Fill meta_info_list: for every sample, read its FCS file and condense the
# keyword list with parse_meta_info(). fcs_files is indexed in the same
# order as the samples of the gating set.
for (i in seq_along(gatingset)) {

  fcs_file <- fcs_files[[i]]
  cat('\n\nReading: ', basename(fcs_file), ' \n')

  # Only the keywords are used, so a single line of event data is requested
  # (which.lines = 1). Warnings raised while reading are suppressed.
  parsed_meta <- suppressWarnings({
    one_event_frame <- read.FCS(fcs_file,
                                transformation = FALSE,
                                which.lines = 1)
    description(one_event_frame)
  })

  meta_info_list[[i]] <- parse_meta_info(parsed_meta)
}



# One row per sample: the parsed FCS metadata plus a readable cytometer
# name, a display name for the sample, and properly typed dates.
# Rows are in the same order as fcs_files until the final arrange().
meta_info_df <-
  do.call(rbind, meta_info_list) %>%
  as.data.frame() %>%
  # Translate cytometer serial numbers into instrument names.
  mutate(cytometer =
           case_when(
             cyt_num == "V657338000021" ~ "Canto 4",
             cyt_num == "V657338000098" ~ "Canto 5",
             cyt_num == "V657338000099" ~ "Canto 6",
             cyt_num == "R658222R1012" ~ "Fortessa 1",
             # NOTE(review): "R65822R1018" has one "2" fewer than the
             # Fortessa 1 serial ("R658222R1012") -- confirm it is not a typo.
             cyt_num == "R65822R1018"  ~ "Fortessa 2",
             cyt_num == "R66093700072" ~ "Symphony 1",
             cyt_num == "R66093700081" ~ "Symphony 2",
             cyt_num == "R66093700082" ~ "Symphony 3",
             # Unrecognised instruments keep their serial number instead of
             # becoming NA (which would yield sample names ending in "_NA").
             TRUE ~ paste0("Unknown (", cyt_num, ")")
           )) %>%
  mutate(
    # Sample label: file name up to the first '_M1', plus the cytometer.
    sample_name = basename(fcs_files) %>%
      str_split('_M1') %>%
      purrr::map_chr(~ .x[[1]]) %>%
      paste0('_', cytometer),
    fcs_file = basename(fcs_files)) %>%
  mutate(
    # $DATE is day-month-year; the CST dates are year-month-day.
    # Each column is parsed exactly once.
    date = lubridate::dmy(date),
    baseline_date = lubridate::ymd(baseline_date),
    setup_date = lubridate::ymd(setup_date)) %>%
  arrange(desc(cytometer), desc(date))


# For M1 here are my suggestions
# CD15, use “mature grans”
# CD13, use “mature grans”
# CD33, use “monocytes final”
# CD117, Use CD117+ myeloid blasts
# CD34, same
# CD71 erythroid
# CD38 activated lymphocytes
# HLA-DR, B cells (use only positive ones)
# CD45 Lymphocytes final
# CD19 B cells
# 
# Set which populations to use for which markers

# Print every population path in the gating set; the values of the lookup
# table below must be spelled exactly like these paths.
gs_get_pop_paths(gatingset)


# Marker -> gating path of the population used as the control population
# for that marker. The names are also used to find the marker's expression
# column by pattern, so 'HLA' is expected to pick up the HLA-DR column.
# Trailing spaces inside some paths are part of the population names.
# CD117 uses the same population as CD34; its path previously ended in a
# stray "/" that no other entry has.
marker_to_control_pops_dict <-
  c(
    'CD13' = "/viable/singlets/Grans/grans final, use for CD15 /mature grans, use for CD13",
    'CD15' = "/viable/singlets/Grans/grans final, use for CD15 ",
    'CD19' = "/viable/singlets/Lymphs?/lymphs/Lymphocyte final/B cells",
    'CD33' = "/viable/singlets/mono?/monocytes/monocytes final use for CD33",
    'CD34' = "/viable/singlets/SSC-H, CD34 subset/myeloid blasts/CD117+ myeloid blasts",
    'CD38' = "/viable/singlets/Lymphs?/lymphs/Lymphocyte final/Activated lymphocyes, use for CD38 I guess",
    'CD45' = "/viable/singlets/Lymphs?/lymphs/Lymphocyte final",
    'CD71' = "/viable/singlets/erythroid, use for cd71",
    'CD117' = "/viable/singlets/SSC-H, CD34 subset/myeloid blasts/CD117+ myeloid blasts",
    'HLA' = "/viable/singlets/Lymphs?/lymphs/Lymphocyte final/B cells"
    )

In [None]:
# Build one ridgeline plot per marker, showing the distribution of that
# marker's expression in its control population for every sample, and record
# the number of control-population cells per sample in meta_info_df
# (column 'ncells_<marker>').

# Cofactor of the arcsinh transform applied to the expression values.
asinh_cofactor <- 150

# Text before the first ".fcs" of each string (unchanged when there is none).
strip_fcs_suffix <- function(x) {
  vapply(str_split(x, '\\.fcs'),
         function(pieces) pieces[[1]],
         character(1))
}

plot_list <-
  vector('list', length(marker_to_control_pops_dict))

for (i in seq_along(marker_to_control_pops_dict)) {

  marker <- names(marker_to_control_pops_dict)[i]

  # A trailing "/" is not part of a population path; drop it so the lookup
  # does not depend on how the path was typed.
  population <- sub('/+$', '', marker_to_control_pops_dict[[i]])

  cat('\nMarker: ', marker, '\nPopulation: ', population, '\n\n')

  # Per-sample expression matrices of the control population, requested with
  # inverse.transform = TRUE and then arcsinh-transformed.
  control_pops <-
    gs_get_singlecell_expression_by_gate(gatingset,
                                         population,
                                         other.markers = marker,
                                         inverse.transform = TRUE,
                                         threshold = FALSE) %>%
    lapply(function(x) asinh(x / asinh_cofactor))

  # FCS file name of each returned sample and its row in meta_info_df.
  control_fcs_files <-
    str_extract(names(control_pops), '.*\\.fcs')
  meta_row_idx <-
    match(control_fcs_files, meta_info_df$fcs_file)

  # Record the number of control-population cells per sample.
  new_colname <- paste0('ncells_', marker)
  n_cells <- vapply(control_pops, nrow, integer(1), USE.NAMES = FALSE)
  has_meta <- !is.na(meta_row_idx)
  meta_info_df[[new_colname]] <- NA_integer_
  meta_info_df[meta_row_idx[has_meta], new_colname] <- n_cells[has_meta]

  # One long data frame per sample (NULL for samples without any cells).
  per_sample_frames <-
    lapply(seq_along(control_pops), function(idx) {

      exprs <- control_pops[[idx]]
      if (nrow(exprs) == 0) {
        return(NULL)
      }

      col_with_marker <-
        which(str_detect(colnames(exprs), marker))

      if (length(col_with_marker) == 0) {
        stop('No expression column matching marker "', marker,
             '" in sample ', names(control_pops)[idx], call. = FALSE)
      }
      if (length(col_with_marker) > 1) {
        # Several columns match the pattern (e.g. 'CD13' also matches
        # 'CD133'); keep the first one instead of letting the extra
        # columns shift the column names assigned below.
        warning('Marker "', marker, '" matches several columns (',
                paste(colnames(exprs)[col_with_marker], collapse = ', '),
                '); using the first', call. = FALSE)
        col_with_marker <- col_with_marker[1]
      }

      row_idx <- meta_row_idx[idx]
      if (is.na(row_idx)) {
        stop('No metadata row for sample ', names(control_pops)[idx],
             call. = FALSE)
      }

      data.frame(
        Expression = as.numeric(exprs[, col_with_marker]),
        FCS = control_fcs_files[idx],
        Sample = strip_fcs_suffix(meta_info_df$sample_name[row_idx]),
        Cytometer = meta_info_df$cytometer[row_idx],
        Year = lubridate::year(meta_info_df$date[row_idx]),
        stringsAsFactors = FALSE
      )
    })

  plot_df <- bind_rows(per_sample_frames)

  if (nrow(plot_df) == 0) {
    warning('No cells in the control population of marker "', marker,
            '"; skipping its plot', call. = FALSE)
    next
  }

  # Agree on ordering of samples for plotting purposes (geom_text)
  sample_levels <- rev(sort(unique(plot_df$Sample)))

  plot_df <-
    plot_df %>% mutate(Sample = factor(Sample, levels = sample_levels))

  # Horizontal extent of the data; the x axis is widened on both sides so
  # the cell-count labels fit to the right of the ridges.
  min_x <- min(plot_df$Expression)
  max_x <- max(plot_df$Expression)
  span <- max_x - min_x
  label_x <- max_x + (.15 * span)

  # Cell counts in the same order as the factor levels of Sample. The
  # metadata sample names get the same ".fcs" stripping as plot_df$Sample.
  nlevels <- length(sample_levels)
  n_cells_labels <-
    meta_info_df[[new_colname]][
      match(sample_levels, strip_fcs_suffix(meta_info_df$sample_name))]

  label_df <-
    data.frame(
      x = rep(label_x, nlevels),
      # Discrete levels sit at 1..nlevels; the offset lifts each label
      # above its ridge's baseline.
      y = seq_len(nlevels) + .45,
      label = n_cells_labels)

  # Ridgeline plot of expression distributions
  plot_list[[i]] <-
    plot_df %>%
    ggplot() +
    geom_density_ridges_gradient(aes(x = Expression,
                                     y = Sample,
                                     fill = stat(x))) +
    scale_fill_gradientn(name = "Expression Value",
                         colors = color_pallette) +
    scale_y_discrete(drop = FALSE) +
    xlab('') + ylab('') +
    theme(legend.position = 'None',
          axis.text.y = element_text(face = 'bold',
                                     size = 10),
          plot.margin = unit(c(0,0.1,-0.1,0.1), "cm"),
          legend.title = element_text(size = 4),
          plot.title = element_text(face = 'bold',
                                    size = 10)) +
    ggtitle(marker) +
    xlim(c(min_x - (.2 * span),
           max_x + (.3 * span))) +
    geom_text(data = label_df,
              aes(x, y, label = label),
              size = 4,
              fontface = 'plain')

}

# Markers that were skipped leave an empty slot; drop those.
plot_list <- Filter(Negate(is.null), plot_list)

# Show the metadata including the per-marker cell counts.
meta_info_df





# Write all marker plots, stacked in a single column, to one PNG
# (600 px of height per plot).
plots_to_draw <- Filter(Negate(is.null), plot_list)

if (length(plots_to_draw) == 0) {
  stop('No plots to write', call. = FALSE)
}

png('Sample_marker_distributions_inverse_asinh.png',
    width = 700, height = 600 * length(plots_to_draw))

# print() draws the figure explicitly, so the file is not left blank when
# the script runs without top-level auto-printing (e.g. via source()).
# The device is closed even if drawing fails.
invisible(
  tryCatch(
    print(patchwork::wrap_plots(plots_to_draw,
                                ncol = 1)),
    finally = dev.off()
  )
)