In [None]:
library(data.table)

In [None]:
# All paths used in this notebook ("ext_data/...", "results/...") are relative,
# so they are resolved against the directory set here.
# NOTE(review): `project_directory` is not defined in the visible part of the
# notebook - it has to exist before this cell is run.
setwd(project_directory) # set wd to the project directory containing all the subfolders

### Allen Brain Atlas (ABA) data

#### ABA layer specific genes

In [None]:
# Load the raw ABA layer gene lists; readLines() yields one string per line
# of the file.
ABA_genes <- readLines(con = "ext_data/aba_mob-genes_raw.txt")
head(ABA_genes)

In [None]:
# Reshape the raw lines into a long (layer, gene) table.
# Every raw line is comma-separated: the first field is used as the layer
# name and each remaining field becomes one gene row for that layer.
ABA_genes_dt <- rbindlist(
    lapply(ABA_genes, function(raw_line) {
        fields <- strsplit(raw_line, ",")[[1]]
        data.table(layer = fields[1], gene = fields[-1])
    })
)
head(ABA_genes_dt)
# Store the long table tab-separated, without quoting and without row names.
write.table(ABA_genes_dt, file = "ext_data/aba_mob-genes.txt", sep = "\t",
            quote = FALSE, row.names = FALSE)

#### ABA differential genes

In [None]:
# Collect the ABA differential expression tables: CSV files in ext_data/ whose
# name contains "aba_MOB".
# `pattern` is a regular expression, not a glob. The dot in front of the
# extension is escaped and the match is anchored at the end of the name, so
# only files that really end in ".csv" are returned (an unescaped "." matches
# any character and would also accept names such as "aba_MOB_csv_notes.txt").
ABA_diff_files <- list.files(path = "ext_data/", pattern = "aba_MOB.*\\.csv$",
                             full.names = TRUE)
head(ABA_diff_files)

In [None]:
# Read every ABA differential expression file and stack them into one table,
# tagging each row with the layer label taken from its file name.
if (length(ABA_diff_files) == 0L) {
    # Fail early with a clear message instead of an obscure setnames() error
    # on an empty table further down.
    stop("No ABA differential expression files found in ext_data/", call. = FALSE)
}
# Read all files first and bind once; binding inside a loop re-copies the
# accumulated table on every iteration.
ABA_diff_dt <- rbindlist(lapply(ABA_diff_files, function(diff_file) {
    tab <- fread(diff_file)
    # The layer label is the second piece of the file name when it is split on
    # "MOB" or "." (for a name shaped like aba_MOB<layer>.csv this is <layer>).
    # basename() keeps a "." or "MOB" in the directory part of the path from
    # shifting the position of that piece.
    file_parts <- strsplit(basename(diff_file), "MOB|\\.")[[1]]
    tab[, layer := file_parts[2]]
    tab
}))
setnames(ABA_diff_dt, "gene-symbol", "gene")
head(ABA_diff_dt)
# Store the combined table tab-separated, without quoting and without row names.
write.table(ABA_diff_dt, "ext_data/aba_MOB_diff.tsv", sep = "\t", quote = FALSE,
            row.names = FALSE)

### Mouse olfactory bulb (MOB) single cell RNA-seq data (Linnarsson)

Calculating the normalized expression values (relative frequencies) for each relevant cell type for the MOB data

In [19]:
# Mean expression matrix of the MOB data set: the first column (V1) holds the
# gene names, the remaining columns are melted into ClusterName further down.
li_mean <- fread(input = "ext_data/li_mean_expr.tsv")

In [20]:
# Number of rows (gene entries, duplicates included) in the mean expression matrix
nrow(li_mean)

In [21]:
# There are duplicated gene names in the mean expression matrix; the reason is
# unknown, so they are left in.
# dupl_genes holds each gene name that occurs more than once, listed once.
dupl_genes <- with(li_mean, unique(V1[duplicated(V1)]))
length(dupl_genes)

In [24]:
# Keep the columns whose name starts with "OB" plus every column whose name
# does not contain "Neuron" (the latter set includes the gene column V1), then
# reshape to long format with one row per (gene, ClusterName).
# NOTE(review): a column that starts with "OB" and does not contain "Neuron"
# is matched by both greps and would be selected twice - confirm against the
# actual column names.
li_norm <- local({
    col_names <- names(li_mean)
    keep_cols <- c(
        grep("^OB", col_names),
        grep("Neuron", col_names, invert = TRUE)
    )
    melt(li_mean[, keep_cols, with = FALSE],
         id.vars = "V1", variable.name = "ClusterName")
})

In [25]:
# Relative frequency: each value divided by the total of its cluster, i.e. the
# gene's share of the summed expression within that ClusterName.
li_norm[, norm := value / sum(value), by = .(ClusterName)]
# Natural log of the relative frequency (a norm of 0 gives -Inf).
li_norm[, log_norm := log(norm)]
# The melted id column V1 holds the gene names.
setnames(li_norm, old = "V1", new = "gene")

In [28]:
# Number of distinct cell types (ClusterName values) in the normalized MOB table
N_ct <- length(unique(li_norm[["ClusterName"]]))
N_ct

In [29]:
# Some genes have 0 expression for all of the cell types.
# Per gene, count the distinct cell types in which its normalized expression
# is exactly 0.
li_norm[, N_zero_ct := length(unique(ClusterName[norm == 0])), by = .(gene)]
# Number of genes that are zero in every cell type
li_norm[N_zero_ct == N_ct, length(unique(gene))]

In [31]:
# Number of genes that are expressed in at least one cell type, i.e. genes
# whose zero-count differs from the total number of cell types.
Ngenes <- li_norm[N_zero_ct != N_ct, length(unique(gene))]
Ngenes

In [13]:
# Data as published: normalized MOB table, tab-separated, unquoted, no row names
write.table(li_norm, file = "results/li_norm.tsv", sep = "\t",
            quote = FALSE, row.names = FALSE)

### Triple-negative breast cancer (TNBC) single-cell RNA-seq data
https://www.nature.com/articles/s41467-018-06052-0#Sec24  
GSE118389: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389

Calculating the normalized expression values (relative frequencies) for each relevant cell type for the breast cancer data

In [3]:
# Data: expression matrix with genes in the first column (auto-named V1 by
# fread) and one column per cell.
TNBC_data <- fread("ext_data/GSE118389_tnbc/GSE118389_counts_rsem.txt")
tail(TNBC_data)
dim(TNBC_data)
# Check for duplicated gene names - there are none
sum(duplicated(TNBC_data[["V1"]]))

"Detected 1534 column names but the data has 1535 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file."

V1,PT089_P1_A01,PT089_P1_A02,PT089_P1_A03,PT089_P1_A04,PT089_P1_A05,PT089_P1_A06,PT089_P1_A07,PT089_P1_A08,PT089_P1_A09,...,PT039_P10_H03_S279,PT039_P10_H04_S280,PT039_P10_H05_S281,PT039_P10_H06_S282,PT039_P10_H07_S283,PT039_P10_H08_S284,PT039_P10_H09_S285,PT039_P10_H10_S286,PT039_P10_H11_S287,PT039_P10_H12_S288
ZXDC,1.76,5.06,4.86,2.57,7.48,7.26,12.1,2.61,7.59,...,0.0,0,0.0,0,1275.78,2.03,0.0,0.0,0.0,85.87
ZYG11A,4.73,111.84,1.26,0.0,1.42,7.26,12.36,4.72,13.45,...,0.0,0,0.0,0,1.02,2.18,3.85,0.0,0.0,6.15
ZYG11B,7.86,2.14,4.43,1.77,2.43,0.0,7.56,2.24,6.63,...,3.35,0,1.91,0,4.12,1.74,1.02,1.12,3.72,23.32
ZYX,0.0,0.0,1.0,0.0,937.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,3.0,143.0,0.0,0.0,85.0
ZZEF1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0,0.0,0,0.0,0.0,6.0,0.0,0.0,27.0
ZZZ3,0.0,0.0,0.0,2.0,0.0,2006.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,23.0


In [4]:
# Annotation: V1 holds the cell identifier, V2 the cell type label.
TNBC_annot <- fread("ext_data/GSE118389_tnbc/GSE118389_cell_annot.tsv")
# Make sure both columns are plain character vectors.
TNBC_annot[, `:=`(V1 = as.character(V1), V2 = as.character(V2))]
head(TNBC_annot)
dim(TNBC_annot)

V1,V2
PT089_P1_A01,epithelial
PT089_P1_A02,epithelial
PT089_P1_A03,epithelial
PT089_P1_A04,macrophage
PT089_P1_A05,macrophage
PT089_P1_A06,epithelial


In [5]:
# Reshape the wide matrix to long format: one row per (gene, cell), with the
# cell identifier in `variable` and the expression in `value`.
TNBC_data_long <- melt(
    TNBC_data,
    id.vars = "V1",
    variable.name = "variable",
    value.name = "value"
)
head(TNBC_data_long)

"'measure.vars' [PT089_P1_A01, PT089_P1_A02, PT089_P1_A03, PT089_P1_A04, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion."

V1,variable,value
A1BG,PT089_P1_A01,0.0
A1BG-AS1,PT089_P1_A01,0.0
A1CF,PT089_P1_A01,0.0
A2M,PT089_P1_A01,0.0
A2M-AS1,PT089_P1_A01,0.0
A2ML1,PT089_P1_A01,1.08


In [6]:
# Attach the annotation to every long-format row by matching the cell
# identifier (`variable` in the data, `V1` in the annotation). merge() keeps
# only matching rows by default, so cells without an annotation are dropped.
TNBC_data_long <- merge(
    x = TNBC_data_long,
    y = TNBC_annot,
    by.x = "variable",
    by.y = "V1"
)

In [7]:
# The annotation column V2 holds the cell type; give it the same name the MOB
# table uses for its cell type column.
setnames(TNBC_data_long, old = "V2", new = "ClusterName")

In [8]:
# Mean expression per gene (V1) and cell type, together with the number of
# rows (cells) that went into each mean.
tnbc_norm <- TNBC_data_long[
    ,
    .(value = mean(value), N = .N),
    by = .(V1, ClusterName)
]
head(tnbc_norm)

V1,ClusterName,value,N
A1BG,epithelial,3.2591129,868
A1BG-AS1,epithelial,4.6131682,868
A1CF,epithelial,0.06591014,868
A2M,epithelial,633.41102535,868
A2M-AS1,epithelial,1.97836406,868
A2ML1,epithelial,20.12418203,868


In [11]:
# Relative frequency: each mean divided by the total of its cell type, i.e.
# the gene's share of the summed mean expression within that ClusterName.
tnbc_norm[, norm := value / sum(value), by = .(ClusterName)]
# Natural log of the relative frequency (a norm of 0 gives -Inf).
tnbc_norm[, log_norm := log(norm)]
# V1 holds the gene names.
setnames(tnbc_norm, old = "V1", new = "gene")
tail(tnbc_norm)

gene,ClusterName,value,N,norm,log_norm
ZXDC,Tcell,20.228302,53,3.017484e-05,-10.408502
ZYG11A,Tcell,3.791887,53,5.656409e-06,-12.082721
ZYG11B,Tcell,20.612264,53,3.07476e-05,-10.389699
ZYX,Tcell,124.45283,53,0.000185648,-8.591658
ZZEF1,Tcell,91.45283,53,0.0001364214,-8.899762
ZZZ3,Tcell,55.037736,53,8.210054e-05,-9.407566


In [12]:
# Number of distinct cell types (ClusterName values) in the TNBC table
N_ct <- length(unique(tnbc_norm[["ClusterName"]]))
N_ct

In [13]:
# Per gene, count the distinct cell types in which its normalized expression
# is exactly 0.
tnbc_norm[, N_zero_ct := length(unique(ClusterName[norm == 0])), by = .(gene)]

In [14]:
# Number of genes expressed in at least one cell type, i.e. genes that are
# zero in fewer than all N_ct cell types.
Ngenes <- tnbc_norm[N_zero_ct < N_ct, length(unique(gene))]
Ngenes

In [15]:
# Store the normalized TNBC table tab-separated, unquoted, without row names.
write.table(tnbc_norm, file = "results/tnbc_norm.tsv", sep = "\t",
            quote = FALSE, row.names = FALSE)