In [1]:
suppressMessages({
    library(edgeR)
    library(tximport)
})

In [2]:
#' List the cells of a dataset that passed QC and have kallisto output on disk.
#'
#' Reads 'Datasets/<dataset>-labels(1).tsv' (relative to the working
#' directory), filters the rows by cell type and age, and keeps only the cells
#' whose 'abundance.h5' file exists.
#'
#' @param dataset Dataset name; used to build the label-file name and, when
#'   `kallisto_dir` is NULL, the kallisto directory.
#' @param drop Cell types (values of the CellType column) to exclude.
#' @param keep If non-empty, only these cell types are retained.
#' @param age_cutoff When >= 0 and the table has an 'Age' column, keep only
#'   rows with Age strictly greater than this value. Negative disables the
#'   filter.
#' @param kallisto_dir Directory holding one sub-directory per cell. NULL
#'   (the default) falls back to the lab storage path derived from `dataset`.
#' @return The filtered label table as a data.frame with row names set to the
#'   Cell column.
get_samples_list <- function(dataset, drop=c(), keep=c(), age_cutoff=-1, kallisto_dir=NULL){
    # read in target cells that passed QC
    fname <- sprintf('Datasets/%s-labels(1).tsv', dataset)
    samples <- read.table(fname, header=TRUE, sep='\t', stringsAsFactors=FALSE)

    # restrict to cell types of interest
    samples <- samples[!(samples$CellType %in% drop), , drop=FALSE]
    if (length(keep) > 0){
        samples <- samples[samples$CellType %in% keep, , drop=FALSE]
    }
    row.names(samples) <- samples$Cell

    # restrict to age range of interest
    # `&&` because `if` needs a single TRUE/FALSE; which() drops rows whose Age
    # is NA instead of turning them into all-NA rows.
    if (age_cutoff >= 0 && 'Age' %in% colnames(samples)){
        samples <- samples[which(samples$Age > age_cutoff), , drop=FALSE]
    }

    # restrict to cells that exist
    if (is.null(kallisto_dir)){
        kallisto_dir <- sprintf('/media/foldy_lab/Storage_Analysis/kallisto/kallisto_95/%s_95', dataset)
    }
    files <- file.path(kallisto_dir, samples$Cell, 'abundance.h5')
    samples <- samples[file.exists(files), , drop=FALSE]

    return (samples)
}

#' Collapse a transcript-level tximport object to gene level.
#'
#' The transcript-to-gene table is read from 'References/tx2gene.95.tsv'
#' (relative to the working directory); its GENESYMBOL column is used as the
#' gene identifier.
#'
#' @param txi Transcript-level object as produced by tximport(txOut=TRUE).
#' @return The result of summarizeToGene() for `txi`.
sum_gene_level <- function(txi){
    mapping <- read.table('References/tx2gene.95.tsv', header=TRUE, sep='\t',
                          stringsAsFactors=FALSE)

    # Hand summarizeToGene a two-column TXNAME/GENEID table, with the gene
    # symbol playing the GENEID role.
    mapping <- setNames(mapping[, c('TXNAME', 'GENESYMBOL')],
                        c('TXNAME', 'GENEID'))

    summarizeToGene(txi, mapping)
}

#' Import kallisto abundance files and summarize them to gene level.
#'
#' @param files Paths to kallisto abundance files, one per sample.
#' @return Gene-level object returned by sum_gene_level().
get_txi_from_files <- function(files){
    # Import at transcript level first (txOut=TRUE); the gene-level collapse
    # is delegated to sum_gene_level().
    transcript_level <- tximport(files, type='kallisto', txOut=TRUE)
    sum_gene_level(transcript_level)
}

#' Import kallisto results for a set of cells stored under one directory.
#'
#' @param cells Cell names; each is expected to be a sub-directory of `dir`
#'   containing 'abundance.h5'.
#' @param dir Directory holding the per-cell sub-directories.
#' @return Gene-level object returned by get_txi_from_files().
get_txi_from_cells <- function(cells, dir){
    get_txi_from_files(file.path(dir, cells, 'abundance.h5'))
}

#' Convert a gene-level tximport object into an edgeR DGEList with offsets.
#'
#' Follows the tximport recipe for edgeR: counts are kept as-is and the
#' average transcript lengths, combined with effective library sizes, become
#' the offset matrix attached through scaleOffset().
#'
#' @param txi List with `counts` and `length` matrices of identical shape
#'   (genes in rows, samples in columns).
#' @param cells Sample names, one per column of `txi$counts`.
#' @param min_count,min_cells A gene is kept when more than `min_count` counts
#'   are observed in at least `min_cells` samples. With the default
#'   `min_cells = 0` every gene is kept, i.e. no filtering takes place.
#' @return A DGEList whose count columns and sample rows are named by `cells`.
txi_to_edgeR <- function(txi, cells, min_count=5, min_cells=0){
    counts <- txi$counts
    # Fail before the expensive normalisation rather than at the final naming
    # step.
    stopifnot(length(cells) == ncol(counts))

    # drop=FALSE keeps both objects as matrices even when a single gene or a
    # single sample remains.
    kept <- rowSums(counts > min_count) >= min_cells
    cts <- counts[kept, , drop=FALSE]
    normMat <- txi$length[kept, , drop=FALSE]

    # Obtaining per-observation scaling factors for length, adjusted to avoid
    # changing the magnitude of the counts.
    normMat <- normMat/exp(rowMeans(log(normMat)))
    normCts <- cts/normMat

    # Computing effective library sizes from scaled counts, to account for
    # composition biases between samples.
    eff.lib <- calcNormFactors(normCts) * colSums(normCts)

    # Combining effective library sizes with the length factors, and calculating
    # offsets for a log-link GLM.
    normMat <- sweep(normMat, 2, eff.lib, "*")
    normMat <- log(normMat)

    # Creating a DGEList object for use in edgeR.
    dge <- DGEList(cts)
    dge <- scaleOffset(dge, normMat)

    # add cell names
    colnames(dge$counts) <- cells
    row.names(dge$samples) <- cells

    return (dge)
}

#' Attach cell-type and optional design columns to a DGEList's sample table.
#'
#' `samples$CellType` becomes `dge$samples$group`. Each of the optional
#' columns 'Batch' and 'Comparison1'..'Comparison4' is copied when it is
#' present in `samples`, independently of the others.
#'
#' @param dge Object with a `samples` data.frame (an edgeR DGEList in this
#'   pipeline).
#' @param samples Sample table with one row per row of `dge$samples`; rows
#'   are assumed to be in the same order (this is not checked here).
#' @return `dge` with the extra columns added to `dge$samples`.
add_celltypes <- function(dge, samples){
    # A shorter table whose length divides the number of samples would be
    # recycled silently by the column assignment; reject any mismatch instead.
    stopifnot(nrow(samples) == nrow(dge$samples))

    dge$samples$group <- samples$CellType

    optional <- c('Batch', paste0('Comparison', 1:4))
    for (column in intersect(optional, colnames(samples))){
        dge$samples[[column]] <- samples[[column]]
    }

    return (dge)
}

#' Collect the sample table, cell names and abundance-file paths of a dataset.
#'
#' @param dataset Dataset name.
#' @param drop,keep,age_cutoff Forwarded unchanged to get_samples_list().
#' @return List with `samples` (filtered sample table), `cells` (its Cell
#'   column) and `files` (one 'abundance.h5' path per cell).
get_dataset_targets <- function(dataset, drop=c(), keep=c(), age_cutoff=-1){
    sample_table <- get_samples_list(dataset, drop=drop, keep=keep, age_cutoff=age_cutoff)
    cell_ids <- sample_table$Cell

    # Per-cell kallisto output lives in <root>/<cell>/abundance.h5.
    kallisto_root <- sprintf('/media/foldy_lab/Storage_Analysis/kallisto/kallisto_95/%s_95', dataset)

    list(
        samples = sample_table,
        cells = cell_ids,
        files = file.path(kallisto_root, cell_ids, 'abundance.h5')
    )
}

#' Build the edgeR object of a dataset and write it to disk.
#'
#' Pipeline: select cells -> import kallisto abundances at gene level ->
#' build a DGEList with offsets -> attach cell-type columns -> save.
#'
#' @param dataset Dataset name.
#' @param drop,keep,age_cutoff Forwarded to get_dataset_targets().
#' @param savename Stem of the output file; an empty string means "use
#'   `dataset`".
#' @return Called for its side effect: writes 'Datasets/<savename>_edgeR.RData'.
#'   The file is written with saveRDS(), so it has to be read back with
#'   readRDS() despite the '.RData' suffix.
generate_dataset_data <- function(dataset, drop=c(), keep=c(), age_cutoff=-1, savename=''){
    targets <- get_dataset_targets(dataset, drop=drop, keep=keep, age_cutoff=age_cutoff)
    out_name <- if (savename == '') dataset else savename

    gene_level <- get_txi_from_files(targets$files)
    dge <- add_celltypes(txi_to_edgeR(gene_level, targets$cells), targets$samples)

    saveRDS(dge, file=sprintf('Datasets/%s_edgeR.RData', out_name))
}

In [3]:
# Build and save the edgeR object for the 'Lab_Amanda' dataset. A negative
# age_cutoff leaves the age filter switched off.
generate_dataset_data('Lab_Amanda', age_cutoff=-1)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates
