In [None]:
library("DESeq2")
library("dplyr")
library("tibble")

In [14]:
# Root directories of the project: input databases and analysis output.
# Both keep their trailing "/" because PROJECT_DIR_o is concatenated with
# relative paths without any separator further down.
PROJECT_DIR_d <- "/home/yc2553/projects/HEA/databases/"
PROJECT_DIR_o <- "/home/yc2553/projects/HEA/output/"

# Data normalization

In [8]:
# Locate the sample sheet and the raw count matrix.
sample_file <- file.path(PROJECT_DIR_d, "PROcap/processed/norm_samples.txt")
count_file <- file.path(PROJECT_DIR_d, "PROcap/processed/raw_reads_all.txt")

# Both files are tab-separated with IDs in the first column.
coldata <- read.delim(sample_file, row.names = 1)
# check.names = FALSE keeps the sample IDs in the header exactly as written.
cts <- as.matrix(read.delim(count_file, row.names = 1, check.names = FALSE))

# The sample sheet rows and the count matrix columns must be in the same order.
stopifnot(all(rownames(coldata) == colnames(cts)))

# Intercept-only design: no experimental factor is modeled in this object.
dds <- DESeqDataSetFromMatrix(
  countData = cts,
  colData = coldata,
  design = ~1
)

In [7]:
# Variance-stabilized counts, used for most downstream analyses
# (e.g., clustering, machine learning, PCA).
# blind = TRUE: the variability is calculated across all samples.
outputfile <- file.path(PROJECT_DIR_d, "PROcap/processed/norm_reads_vst_all.txt")

td <- vst(dds, blind = TRUE)
t_counts <- assay(td)
write.table(t_counts, file = outputfile, sep = "\t")

In [None]:
# Size-factor-normalized counts, used for calculation of the specificity score.

# counts(..., normalized = TRUE) stops with "first calculate size factors ..."
# unless dds carries size factors. vst() estimates them only on its own
# internal copy, so they have to be estimated on dds explicitly here.
if (is.null(sizeFactors(dds))) {
  dds <- estimateSizeFactors(dds)
}

n_counts <- counts(dds, normalized = TRUE)
outputfile <- file.path(PROJECT_DIR_d, "PROcap/processed/norm_reads_all.txt", fsep = "/")
write.table(n_counts, file = outputfile, sep = "\t")

# DE analysis

## Lineage differentiation

In [None]:
# Get DE elements between every pair of time points across pancreatic differentiation, as input for DPGP clustering.
# See the code here: https://zenodo.org/records/5161189 (R -> timeseries.pairwise_deseq2.R).
# Six pairwise comparisons in total.

In [None]:
# Run DESeq2 on every pair of conditions and summarise the differential elements.
#
# Args:
#   sample_file: tab-separated sample sheet; the first column is used as row
#     names and a "Condition" column is required.
#   count_file:  tab-separated raw count matrix; the first column is used as row
#     names. Columns are picked by the row positions of the sample sheet, so
#     both files must list the samples in the same order.
#   outdir:      prefix prepended (without separator) to each per-comparison
#     output file name, so it is expected to end with "/".
#   sum_file:    gzipped file that receives the sorted, de-duplicated IDs found
#     in any "*sigResultsAll.txt.gz" file under outdir.
#   fdr_cutoff:  adjusted p-value threshold (also passed to results() as alpha).
#     The default is 0.01 / 6, presumably one share per pairwise comparison.
#   fc_cutoff:   log2 fold-change value separating "up" from "down".
#
# Returns:
#   A data.frame with one row per comparison and the columns compare, up, down
#   (an empty data.frame when no comparison could be run).
#
# Side effects:
#   For each comparison "<B>_over_<A>" writes <outdir><B>_over_<A>_resultsAll.txt.gz
#   (shrunken LFC for all elements) and ..._sigResultsAll.txt.gz (elements with
#   padj < fdr_cutoff), then writes sum_file through a shell pipeline.
DE_analysis_multiple <- function(sample_file, count_file, outdir, sum_file, fdr_cutoff=0.01/6, fc_cutoff=0){
  coldata <- as.matrix(read.csv(sample_file, row.names = 1, sep = "\t"))
  count_data <- as.matrix(read.csv(count_file, row.names = 1, sep = "\t", check.names = FALSE))

  # Conditions in order of first appearance, and the row positions of all
  # samples belonging to each of them. Looking the rows up by label supports
  # any number of replicates, whether or not they sit on adjacent rows.
  condition_column <- coldata[, "Condition"]
  conditions <- unique(condition_column)
  rows_by_condition <- lapply(
    conditions,
    function(cond) unname(which(condition_column == cond))
  )
  n_replicates <- lengths(rows_by_condition)

  # One summary row (up/down numbers) is collected per comparison.
  summaries <- list()

  # Run pairwise comparisons
  for (idx1 in seq_along(conditions)) {
    for (idx2 in seq_along(conditions)) {
      # Visit each unordered pair once, and skip pairs where neither condition
      # has a replicate.
      if (idx1 >= idx2 || (n_replicates[idx1] == 1 && n_replicates[idx2] == 1)) {
        next
      }

      # Subset data with desired samples (earlier condition first)
      samples_selected <- c(rows_by_condition[[idx1]], rows_by_condition[[idx2]])
      coldata_selected <- data.frame(coldata[samples_selected, , drop = FALSE])
      # data.frame() may alter column names; restore the originals.
      colnames(coldata_selected) <- colnames(coldata)
      count_data_selected <- count_data[, samples_selected]

      # The condition that appears first in the sample sheet is the baseline.
      t_baseline <- conditions[idx1]
      t_compare <- conditions[idx2]
      label <- paste0(gsub(" ", "_", t_compare), "_over_", gsub(" ", "_", t_baseline))

      # DE analysis
      dds <- DESeqDataSetFromMatrix(countData = count_data_selected,
                                    colData = coldata_selected,
                                    design = ~ Condition)
      dds <- DESeq(dds)

      # For heatmap plotting: shrunken log2 fold changes for every element
      res <- results(dds, contrast = c("Condition", t_compare, t_baseline), alpha = fdr_cutoff)
      resLFC <- lfcShrink(dds, type = "ashr", res = res)
      write.table(
        resLFC,
        file = gzfile(paste0(outdir, label, "_resultsAll.txt.gz")),
        quote = FALSE, sep = "\t")

      # Get a list of elements/genes with differential signals.
      # The significant table keeps the unshrunken estimates from `res`.
      res_noNA <- res[!is.na(res$padj), ]
      res_filt <- res_noNA[res_noNA$padj < fdr_cutoff, ]
      write.table(
        res_filt,
        file = gzfile(paste0(outdir, label, "_sigResultsAll.txt.gz")),
        quote = FALSE, sep = "\t")

      # Save to differential summary
      res_filt_up <- res_filt[res_filt$log2FoldChange > fc_cutoff, ]
      res_filt_down <- res_filt[res_filt$log2FoldChange < fc_cutoff, ]
      summaries[[length(summaries) + 1L]] <- data.frame(
        compare = label,
        up = nrow(res_filt_up),
        down = nrow(res_filt_down))
    }
  }

  if (length(summaries) == 0) {
    return(data.frame())
  }

  # Merge the IDs of all significant elements once, after every comparison has
  # been written. The pipeline takes the first column of each file, drops the
  # header lines (first field "baseMean") and de-duplicates.
  merge_sigresults <- paste0(
    "zcat ",
    outdir,
    "/*sigResultsAll.txt.gz | ",
    "awk -F '\t' '{ print $1 }' | ",
    "grep -v baseMean | ",
    "sort | ",
    "uniq | ",
    "gzip -c > ",
    sum_file
  )
  system(merge_sigresults)

  do.call(rbind, summaries)
}

In [None]:
# Pancreatic differentiation: inputs and outputs all live in one directory.
outdir <- paste0(PROJECT_DIR_o, "DE/pancreas/")
sample_file <- paste0(outdir, "samples.txt")
count_file <- paste0(outdir, "raw_reads.txt")
sum_file <- paste0(outdir, "DE_elements.txt.gz")

# Returns the per-comparison up/down summary table.
DE_analysis_multiple(
  sample_file = sample_file,
  count_file = count_file,
  outdir = outdir,
  sum_file = sum_file
)

## Pairwise comparison

In [12]:
# Two-group DESeq2 comparison written to a single tab-separated table.
#
# Args:
#   sample_file: tab-separated sample sheet; the first column is used as row
#     names and a "Condition" column is required by the design.
#   count_file:  tab-separated raw count matrix; the first column is used as row
#     names and the column names must equal the sample sheet row names, in order.
#   outputfile:  path of the table that receives the shrunken results.
#   ref:         level of Condition used as the reference.
DE_analysis_single <- function(sample_file, count_file, outputfile, ref){
  coldata <- read.delim(sample_file, row.names = 1)
  # check.names = FALSE keeps the sample IDs in the header exactly as written.
  cts <- as.matrix(read.delim(count_file, row.names = 1, check.names = FALSE))
  stopifnot(all(rownames(coldata) == colnames(cts)))

  dataset <- DESeqDataSetFromMatrix(
    countData = cts,
    colData = coldata,
    design = ~ Condition
  )
  # Make `ref` the first factor level so it serves as the reference.
  dataset$Condition <- relevel(dataset$Condition, ref = ref)
  dataset <- DESeq(dataset)

  # Default results() contrast, then log2 fold changes shrunk with ashr.
  unshrunk <- results(dataset, alpha = 0.05)
  shrunk <- lfcShrink(dataset, type = "ashr", res = unshrunk)
  write.table(as.data.frame(shrunk), file = outputfile, sep = "\t")
}

In [16]:
# T1D: compared against the "Normal" reference level.

outdir <- paste0(PROJECT_DIR_o, "DE/T1D/")
sample_file <- paste0(outdir, "samples.txt")
count_file <- paste0(outdir, "raw_reads.txt")
outputfile <- paste0(outdir, "DESeq2.txt")
DE_analysis_single(
  sample_file = sample_file,
  count_file = count_file,
  outputfile = outputfile,
  ref = "Normal"
)

In [None]:
# CRC: compared against the "Lung" reference level.

outdir <- paste0(PROJECT_DIR_o, "DE/CRC/")
sample_file <- paste0(outdir, "samples.txt")
count_file <- paste0(outdir, "raw_reads.txt")
outputfile <- paste0(outdir, "DESeq2.txt")
DE_analysis_single(
  sample_file = sample_file,
  count_file = count_file,
  outputfile = outputfile,
  ref = "Lung"
)