In [2]:
library("ComplexHeatmap")
library("ggplot2")
library("stats") # for PCA
library("xlsx")
library("DESeq2") # for rlog transformation
library("gridExtra") # several ggplots in one pdf
library("circlize") # colorRamp2
library("scales")

“package ‘ComplexHeatmap’ was built under R version 4.2.1”
Loading required package: grid

ComplexHeatmap version 2.14.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
    genomic data. Bioinformatics 2016.


The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))


“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘xlsx’ was built under R version 4.2.3”
“package ‘DESeq2’ was built under R version 4.2.3”
Loading required package: S4Vectors

“package ‘S4Vectors’ w

In [1]:
setwd("..") # change to main directory

In [36]:
outDir = "Figures-and-Tables/"
sampleAnnotFile = "annotation/annotation-63.csv"
featureCountFile = "data/expressedTiles-featureCounts-25M10percIDS-min5reads20percent.RData"
geneAnnotFile = "annotation/Homo_sapiens.GRCh37.75-chrRename-noHaplo.RData"
rpkmOutfile = paste0(outDir,"gene-expression.xls") # will be produced, all gene expr values
genetypeRemove = c("rRNA","Mt_rRNA")
nbGenesForClustering = 500
annotAttrib = c("group","gender","age") # first two shown in PCA

In [37]:
load(file = featureCountFile)
counts <- as.matrix(fcountGenes$counts)
head(counts)
dim(counts)

Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010,SXR0014,SXR0016,SXR0018,SXR0028,SXR0029,SXR0030,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
chr1_1337301_1337400,0,0,3,3,10,1,6,14,7,6,⋯,9,19,23,9,0,0,5,0,8,19
chr1_1477001_1477100,0,1,3,2,7,2,4,0,3,4,⋯,3,5,3,4,0,0,2,0,3,3
chr1_1477101_1477200,0,0,0,0,2,0,7,0,4,3,⋯,2,10,5,3,3,0,3,0,2,5
chr1_1716701_1716800,0,0,0,4,6,1,7,6,11,10,⋯,1,6,16,3,0,0,3,0,4,9
chr1_1716801_1716900,0,0,0,3,5,0,8,3,4,1,⋯,1,10,5,5,0,0,1,0,4,14
chr1_1717201_1717300,0,1,1,4,2,2,4,2,9,1,⋯,0,4,2,4,0,0,2,0,0,6


In [38]:
rn = rownames(counts)
regionsCoord <- data.frame(
    chr = gsub("(chr[^_]+).+","\\1",rn),
    start = as.numeric(gsub("chr[^_]+_([^_]+).+","\\1",rn)),
    end = as.numeric(gsub("chr[^_]+_[^_]+_([^_]+)","\\1",rn))
)
head(regionsCoord,3)

Unnamed: 0_level_0,chr,start,end
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,chr1,1337301,1337400
2,chr1,1477001,1477100
3,chr1,1477101,1477200


In [39]:
# load sample annotation:
sampleAnnot = read.csv(file = sampleAnnotFile,sep = "\t", dec=".", stringsAsFactors = F )
sampleAnnot$gender = ifelse(sampleAnnot$gender=="f","female","male")
head(sampleAnnot)

Unnamed: 0_level_0,ID,gender,age,group
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>
1,SXR0002,female,80,ccRCC
2,SXR0004,male,50,ccRCC
3,SXR0006,male,68,ccRCC
4,SXR0010,male,65,urolithiasis
5,SXR0014,male,57,ccRCC
6,SXR0016,male,59,ccRCC


In [40]:
rownames(sampleAnnot) = sampleAnnot$ID

In [41]:
load(geneAnnotFile)
geneAnnot = geneAnnot[which(geneAnnot$V3=="gene"),]
geneAnnot$EnsID = gsub(".*gene_id ([^;]+).*","\\1",geneAnnot$V9)
geneAnnot$type = gsub(".*gene_biotype ([^;]+).*","\\1",geneAnnot$V9)
geneAnnot$symbol = gsub(".*gene_name ([^;]+).*","\\1",geneAnnot$V9)
head(geneAnnot$symbol)

In [42]:
geneAnnotGR = GRanges(seqnames = geneAnnot$V1,
                  ranges = IRanges(start = geneAnnot$V4,end = geneAnnot$V5))

In [43]:
geneLength = 100
TPMs = counts
TPMs[,names(totalReads)] <- sapply(names(totalReads),function(x) TPMs[,x]/(geneLength/10^3) )
head(TPMs[,1:4],2)
TPMsums <- colSums(TPMs)
TPMs[,names(totalReads)] <- sapply(names(TPMsums),function(x) TPMs[,x]/(TPMsums[x]/10^6))
head(counts[,1:4],2)
head(TPMs[,1:4],2)

Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010
chr1_1337301_1337400,0,0,30,30
chr1_1477001_1477100,0,10,30,20


Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010
chr1_1337301_1337400,0,0,3,3
chr1_1477001_1477100,0,1,3,2


Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010
chr1_1337301_1337400,0,0.0,43.5262,16.14683
chr1_1477001_1477100,0,4.154791,43.5262,10.76455


In [44]:
# map region coordinates to genes and descr.:
regionsGR = GRanges(seqnames = regionsCoord$chr, 
                ranges = IRanges(start = regionsCoord$start, end = regionsCoord$end) )
# overlapping gene annot, neglect strand
ovlp = as.data.frame(findOverlaps(query = geneAnnotGR,subject = regionsGR))
head(ovlp,2)

regionAnnot = data.frame(name=rownames(counts),regionsCoord)
head(regionAnnot,3)

# add gene symbols and annot:
regionIdx <- unique(ovlp$subjectHits)
for(r in regionIdx) {
  regionAnnot$gene[r] = paste0(
      geneAnnot$symbol[ovlp$queryHits[ovlp$subjectHits == r]],collapse=";")
  regionAnnot$EnsgID[r] = paste0(
      geneAnnot$EnsID[ovlp$queryHits[ovlp$subjectHits == r]],collapse=";")
  regionAnnot$geneType[r] = paste0(
      geneAnnot$type[ovlp$queryHits[ovlp$subjectHits == r]],collapse=";")
}
head(regionAnnot,2)

Unnamed: 0_level_0,queryHits,subjectHits
Unnamed: 0_level_1,<int>,<int>
1,109,1
2,110,1


Unnamed: 0_level_0,name,chr,start,end
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,chr1_1337301_1337400,chr1,1337301,1337400
2,chr1_1477001_1477100,chr1,1477001,1477100
3,chr1_1477101_1477200,chr1,1477101,1477200


Unnamed: 0_level_0,name,chr,start,end,gene,EnsgID,geneType
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,chr1_1337301_1337400,chr1,1337301,1337400,RP4-758J18.2;MRPL20,ENSG00000224870;ENSG00000242485,protein_coding;protein_coding
2,chr1_1477001_1477100,chr1,1477001,1477100,SSU72,ENSG00000160075,protein_coding


In [45]:
# save total all reads/TPM/rlog TPM values:

countsDF = as.data.frame(counts)
countsDF = cbind(regionAnnot,countsDF)
dim(countsDF)
head(countsDF,3)

countsDF$sum = rowSums(countsDF[,sampleAnnot$ID],na.rm = T)
countsDF$sd = round(apply(X = countsDF[,sampleAnnot$ID],1, function(x) sd(x,na.rm = T)),1)
countsDF = countsDF[order(countsDF$sum,decreasing = T),]
total = sum(countsDF$sum)
countsDF$percentOfTotal <- round(countsDF$sum/total*100,2)
head(countsDF$percentOfTotal)
colnames(countsDF)[which(colnames(countsDF)=="percentOfTotal")] = "% of total"

Unnamed: 0_level_0,name,chr,start,end,gene,EnsgID,geneType,SXR0002,SXR0004,SXR0006,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1_1337301_1337400,chr1_1337301_1337400,chr1,1337301,1337400,RP4-758J18.2;MRPL20,ENSG00000224870;ENSG00000242485,protein_coding;protein_coding,0,0,3,⋯,9,19,23,9,0,0,5,0,8,19
chr1_1477001_1477100,chr1_1477001_1477100,chr1,1477001,1477100,SSU72,ENSG00000160075,protein_coding,0,1,3,⋯,3,5,3,4,0,0,2,0,3,3
chr1_1477101_1477200,chr1_1477101_1477200,chr1,1477101,1477200,SSU72,ENSG00000160075,protein_coding,0,0,0,⋯,2,10,5,3,3,0,3,0,2,5


In [46]:
TPMsDF = as.data.frame(TPMs)
TPMsDF = cbind(regionAnnot,TPMsDF)
head(TPMsDF,3)
TPMsDF$sum =rowSums(TPMsDF[,sampleAnnot$ID],na.rm = T)
TPMsDF$sd = round(apply(X = TPMsDF[,sampleAnnot$ID],1, function(x) sd(x,na.rm = T)),1)
TPMsDF = TPMsDF[order(TPMsDF$sum,decreasing = T),]
total = sum(TPMsDF$sum)
TPMsDF$percentOfTotal = round(TPMsDF$sum/total*100,2)
head(TPMsDF$percentOfTotal)
colnames(TPMsDF)[which(colnames(TPMsDF)=="percentOfTotal")] = "% of total"

Unnamed: 0_level_0,name,chr,start,end,gene,EnsgID,geneType,SXR0002,SXR0004,SXR0006,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_1337301_1337400,chr1_1337301_1337400,chr1,1337301,1337400,RP4-758J18.2;MRPL20,ENSG00000224870;ENSG00000242485,protein_coding;protein_coding,0,0.0,43.5262,⋯,47.30866,39.61626,69.175429,44.81045,0.0,0,14.280532,0,22.132157,14.901552
chr1_1477001_1477100,chr1_1477001_1477100,chr1,1477001,1477100,SSU72,ENSG00000160075,protein_coding,0,4.154791,43.5262,⋯,15.76955,10.42533,9.022882,19.91576,0.0,0,5.712213,0,8.299559,2.352877
chr1_1477101_1477200,chr1_1477101_1477200,chr1,1477101,1477200,SSU72,ENSG00000160075,protein_coding,0,0.0,0.0,⋯,10.51304,20.85067,15.038137,14.93682,90.30976,0,8.568319,0,5.533039,3.921461


In [47]:
totalReadsDF = data.frame(sample=names(totalReads),non_unique_counts=totalReads)

### regularized logarithmic transformation of TPMs:

In [48]:
# must be converted to integer first
TPMs[,names(totalReads)] = TPMs[,names(totalReads)] + 1 # to avoid log(0)
TPMsRlog = rlog(round(as.matrix(TPMs[,names(totalReads)]) *10))/3
all(rownames(TPMs) == rownames(counts))
rownames(TPMsRlog) = rownames(counts)
head(TPMsRlog,3)
head(TPMs,3)
head(counts,3)

rlog() may take a long time with 50 or more samples,
vst() is a much faster transformation

converting counts to integer mode

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010,SXR0014,SXR0016,SXR0018,SXR0028,SXR0029,SXR0030,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
chr1_1337301_1337400,2.123631,1.800643,2.461518,2.179159,2.377046,2.057256,2.232788,2.555575,2.312829,2.205442,⋯,2.548749,2.388191,2.464161,2.414404,2.067634,2.022713,2.343868,2.147383,2.363445,2.3981
chr1_1477001_1477100,1.983648,2.029845,2.380671,1.937125,2.167609,2.107247,2.002109,1.206736,1.939013,1.970228,⋯,2.109202,1.878512,1.747937,2.060209,1.916125,1.860831,1.972645,2.011892,1.966296,1.806975
chr1_1477101_1477200,1.970076,1.59776,1.406514,1.408062,1.810822,1.465409,2.13668,1.329892,2.012175,1.884863,⋯,1.971988,2.065737,1.884898,1.959659,3.223372,1.859096,2.067522,1.99585,1.854515,1.910907


Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010,SXR0014,SXR0016,SXR0018,SXR0028,SXR0029,SXR0030,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
chr1_1337301_1337400,1,1.0,44.5262,17.14683,29.319627,7.273447,8.978883,118.6263,65.69023,11.898569,⋯,48.30866,40.61626,70.17543,45.81045,1.0,1,15.280532,1,23.132157,15.901552
chr1_1477001_1477100,1,5.154791,44.5262,11.76455,20.823739,13.546894,6.319255,1.0,28.72438,8.265713,⋯,16.76955,11.42533,10.02288,20.91576,1.0,1,6.712213,1,9.299559,3.352877
chr1_1477101_1477200,1,1.0,1.0,1.0,6.663925,1.0,10.308696,1.0,37.96584,6.449284,⋯,11.51304,21.85067,16.03814,15.93682,91.30976,1,9.568319,1,6.533039,4.921461


Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010,SXR0014,SXR0016,SXR0018,SXR0028,SXR0029,SXR0030,⋯,SXR0105,SXR0111,SXR0112,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122
chr1_1337301_1337400,0,0,3,3,10,1,6,14,7,6,⋯,9,19,23,9,0,0,5,0,8,19
chr1_1477001_1477100,0,1,3,2,7,2,4,0,3,4,⋯,3,5,3,4,0,0,2,0,3,3
chr1_1477101_1477200,0,0,0,0,2,0,7,0,4,3,⋯,2,10,5,3,3,0,3,0,2,5


In [49]:
TPMsRlogDF = as.data.frame(TPMsRlog)
TPMsRlogDF = cbind(regionAnnot,TPMsRlogDF)
TPMsRlogDF$sum = rowSums(TPMsRlogDF[,sampleAnnot$ID],na.rm = T)
TPMsRlogDF$sd = round(apply(X = TPMsRlogDF[,sampleAnnot$ID],1,  function(x) sd(x,na.rm = T)),1)
TPMsRlogDF = TPMsRlogDF[order(TPMsRlogDF$sum,decreasing = T),]
total = sum(TPMsRlogDF$sum)
TPMsRlogDF$percentOfTotal = round(TPMsRlogDF$sum/total*100,3)
sum(TPMsRlogDF$percentOfTotal)
colnames(TPMsRlogDF)[which(colnames(TPMsRlogDF)=="percentOfTotal")] = "% of total"
sum(TPMsRlogDF$"% of total")
head(TPMsRlogDF,3)

Unnamed: 0_level_0,name,chr,start,end,gene,EnsgID,geneType,SXR0002,SXR0004,SXR0006,⋯,SXR0113,SXR0114,SXR0115,SXR0117,SXR0118,SXR0121,SXR0122,sum,sd,% of total
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr17_66016001_66016100,chr17_66016001_66016100,chr17,66016001,66016100,RP4-758J18.2;MRPL20,ENSG00000224870;ENSG00000242485,protein_coding;protein_coding,7.348876,6.853805,6.592829,⋯,6.482551,7.168025,7.088977,6.734167,7.35624,6.574805,6.825713,425.9435,0.2,0.048
chr19_45981901_45982000,chr19_45981901_45982000,chr19,45981901,45982000,ERCC1,ENSG00000012061,protein_coding,7.092479,6.584624,6.183415,⋯,6.252242,6.922968,6.802854,6.489725,7.061517,6.432612,6.479043,408.6097,0.2,0.046
chr19_45981801_45981900,chr19_45981801_45981900,chr19,45981801,45981900,ERCC1,ENSG00000012061,protein_coding,6.926371,6.47722,6.121954,⋯,6.14573,6.829128,6.717482,6.389897,6.959546,6.323565,6.377113,401.9262,0.2,0.045


### save expression data to file

In [50]:
# get sample order by group:
sampleIDsGroupOrder = order(sampleAnnot$group, sampleAnnot$ID)
sampleIDsGroupOrdered = names(totalReads)[sampleIDsGroupOrder]
totalReadsDF = totalReadsDF[sampleIDsGroupOrder,]
otherColumnNames = setdiff(colnames(countsDF),sampleIDsGroupOrdered)

In [51]:
groupAnnotationDF = as.data.frame(t(c(rep("",length(otherColumnNames)),
                                     sampleAnnot[sampleIDsGroupOrdered,"group"]) ))
colnames(groupAnnotationDF) = c(otherColumnNames,sampleIDsGroupOrdered)

In [52]:
write.xlsx2(x = cbind(group = sampleAnnot[sampleIDsGroupOrdered,"group"],totalReadsDF), 
            file = rpkmOutfile, sheetName = "total reads", row.names = F )

In [53]:
out = countsDF[,c(otherColumnNames,sampleIDsGroupOrdered)]
out = apply(out,2,as.character)
out = rbind(groupAnnotationDF,colnames(out),out) # merge column annotation
write.xlsx2(x = out, file = rpkmOutfile, sheetName = "counts", row.names = F, col.names = F, append = T )

In [54]:
out = TPMsDF[,c(otherColumnNames,sampleIDsGroupOrdered)]
out[,sampleIDsGroupOrdered] = round(out[,sampleIDsGroupOrdered],2)
out = apply(out,2,as.character)
out = rbind(groupAnnotationDF,colnames(out),out) # merge column annotation
write.xlsx2(x = out, file = rpkmOutfile, sheetName = "TPMs", row.names = F, col.names = F, append = T )

In [55]:
out = TPMsRlogDF[,c(otherColumnNames,sampleIDsGroupOrdered)]
out[,sampleIDsGroupOrdered] = round(out[,sampleIDsGroupOrdered],2)
out = apply(out,2,as.character)
out = rbind(groupAnnotationDF,colnames(out),out) # merge column annotation
write.xlsx2(x = out, file = rpkmOutfile, sheetName = "rlogTPMs", row.names = F, col.names = F, append = T )

#### remove genes of unwanted gene types from TPMsRlog:

In [56]:
geneIdsToRemove = geneAnnot$EnsID[which(geneAnnot$type %in% genetypeRemove)]
regionsRemove = rownames(TPMsRlogDF[ TPMsRlogDF$EnsgID %in% geneIdsToRemove,])
length(regionsRemove)
TPMsRlog = TPMsRlog[ setdiff(rownames(TPMsRlog),regionsRemove),]

### produce annotation colors:

In [57]:
sampleAnnotForPlot = sampleAnnot[colnames(TPMsRlog),annotAttrib] 

# define color ramps for diff variables:
myColrRamps <- matrix(data = c("#66ff99","#009999","#000099",
                               "blue4","azure","brown1",
                               "#ffd000","#669900","#006600",
                               "#ff99ff","#6666ff","#000099",
                               "#f1e0c5","#71816d","#342a21"),ncol = 3, byrow = T)
hues <- 0:8/8
usedHueStart <- 1

annotColor <- list()
i<-1
useRamp <- 2

for(i in 1:ncol(sampleAnnotForPlot)) {
  if (is.numeric(sampleAnnotForPlot[,i])) {    # color ramp  when continuous variable
    cat(colnames(sampleAnnotForPlot)[i]," is numeric\n")
    breaks <- quantile(sampleAnnotForPlot[,i],c(0,0.5,1), na.rm = T)
    names(breaks) <- NULL
    annotColor <- c(annotColor, list(colorRamp2(breaks = breaks,myColrRamps[useRamp,])))
    cat("color ramp over: ",myColrRamps[useRamp,],"to breaks",breaks,"\n")
    useRamp <- ifelse(useRamp==nrow(myColrRamps),1,useRamp+1)
  } else {     # discrete variable -> fixed colors
    cat(colnames(sampleAnnotForPlot)[i],"is discrete\n")
    usedHueStart <- (usedHueStart+1) %% length(hues) + 1
    # the more colors we need, the further away
    usedHueEnd <- (usedHueStart + floor(length(unique(sampleAnnotForPlot[,i])))) %% length(hues) + 1
    annotColor <- c(annotColor,list(
        rainbow(s=0.7,v=0.85,n = length(unique(sampleAnnotForPlot[,i])),
                start=hues[usedHueStart], end=hues[usedHueEnd])
    ))
    useRamp <- ifelse(useRamp==nrow(myColrRamps),1,useRamp+1)
    cat("used",useRamp,"\n")
    names(annotColor[[i]]) <- unique(sampleAnnotForPlot[,i])

    # sort by annot attrib name:
    annotColor[[i]] <- annotColor[[i]][order(names(annotColor[[i]]))]
  }
}

names(annotColor) <- colnames(sampleAnnotForPlot)

group is discrete
used 3 
gender is discrete
used 4 
age  is numeric
color ramp over:  #ff99ff #6666ff #000099 to breaks 40 64 80 


## PCA & Clustering

### PCA

In [58]:
# PCA rlog RPKM:
pc <- prcomp(t(TPMsRlog)) # samples must be rows of the data
pcs_plot <- as.data.frame(pc$x)
pcs_plot[,annotAttrib] <- sampleAnnotForPlot[rownames(pcs_plot),annotAttrib]
for (i in 1:length(annotAttrib)) {
  colnames(pcs_plot)[which(colnames(pcs_plot)==annotAttrib[i])] <- as.character(paste0("attrib",i))
}
pcs_plot[,"sampleID"] <- rownames(pcs_plot)

dim(pcs_plot)
head(pcs_plot)
pcaOutFile <- paste0("pca-",paste(annotAttrib[1:2],collapse="-"),".pdf")
maxNameLength <- max(nchar(pcs_plot[,"sampleID"]))

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,⋯,PC58,PC59,PC60,PC61,PC62,PC63,attrib1,attrib2,attrib3,sampleID
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<int>,<chr>
SXR0002,-8.218646,-8.7820967,7.6936122,0.74905949,-0.08936547,2.28170219,-1.08058,5.7875387,-14.24956038,3.7514979,⋯,0.06957035,0.0992461,-0.03361885,-0.0486562641,0.01092218,3.885781e-15,ccRCC,female,80,SXR0002
SXR0004,3.178502,-1.7056351,-2.5023829,-4.58429575,-5.09028778,0.08883844,1.43222,4.2538911,-1.8212295,5.2391384,⋯,-0.10494894,-0.06842265,-0.17986148,-0.4656310363,-0.08555382,1.088019e-14,ccRCC,male,50,SXR0004
SXR0006,9.75708,6.5087193,-0.2543117,-3.94206301,-7.22685494,-1.14271366,8.972354,0.3812244,8.1654291,7.4601039,⋯,0.14941272,0.23987087,0.06505093,0.1176783017,0.05639998,9.978129e-15,ccRCC,male,68,SXR0006
SXR0010,6.775026,-1.1321847,-1.6357908,-0.89998401,-2.30180896,-2.05763869,3.271629,-1.0042016,1.54203168,5.1737254,⋯,-0.32103127,-0.62898663,-0.2685256,0.1342073509,0.09335252,1.750683e-14,urolithiasis,male,65,SXR0010
SXR0014,7.643429,-0.1805346,-2.5787455,-0.01805559,-1.18573342,-0.08363302,-0.384962,-1.1281151,0.06439449,-0.2805813,⋯,0.60379201,0.56849573,-0.10511131,-0.6253513769,-0.22687287,8.146261e-15,ccRCC,male,57,SXR0014
SXR0016,2.318183,6.8291909,1.519002,0.06040541,1.28851498,2.07974863,-1.230017,2.2253503,-0.18744005,-7.8036095,⋯,-0.29979582,0.18161707,-0.07210889,-0.0002813511,0.04449116,6.467049e-15,ccRCC,male,59,SXR0016


In [59]:
# variance explained by PCs:
# https://www.geo.fu-berlin.de/en/v/soga/Geodata-analysis/Principal-Component-Analysis/Principal-component-analysis-in-R/index.html
pc$var= pc$sdev**2
pc$varPercExpl = round(pc$var/sum(pc$var)*100,1)
head(pc$varPercExpl)
varPercExplString = paste0(pc$varPercExpl,"%")
varPercExplString

In [60]:
PCnames = paste0("PC",1:10)
PCnames

### PCA plots for paper
Figure 2, SuppleFigure S3

In [61]:
plots = NULL
for(pcNb in 1:4) {
   p = ggplot(data = pcs_plot, 
              aes_string(x = PCnames[pcNb], y = PCnames[pcNb+1],
                         shape="attrib1",fill="attrib2", color="attrib2")) + 
          geom_point( size=4,alpha=0.6) + 
          labs(#title=expression("PCA of regularized "~log[2]~"(TPMs)"),
               x=paste0(PCnames[pcNb],", ",varPercExplString[pcNb], " variance"),
               y=paste0(PCnames[pcNb+1],", ",varPercExplString[pcNb+1], " variance"), 
               shape=annotAttrib[1], color=annotAttrib[2])  + guides(fill=F)  +
          geom_text(aes(label=sampleID),hjust=-0.3, vjust=0.5,size = 1.4, color="#555555") + 
          scale_x_continuous(limits = c(min(pcs_plot$PC1), max(pcs_plot$PC1)*1.5)) +
          scale_shape_manual(values=unique(as.numeric(as.factor(pcs_plot$attrib1)))+20)
    plots = append(plots,list(p))
    p
}

In [62]:
margin = theme(plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"))
png(filename = paste0(outDir,"SupplFigure-S3-pca-gender-group.png"), width=3000, height=2400, res=300)
grid.arrange(grobs = lapply(plots, "+", margin),nrow=2)
dev.off()

“[1m[22mRemoved 3 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 3 rows containing missing values (`geom_text()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_text()`).”


In [63]:
png(filename = paste0(outDir,"Figure-2-pca-gender-group.png"), width=1600, height=1200, res=300)
plots[[1]]
dev.off()

### sample distance clustering heatmap

In [64]:
print("sample distance...")
# sample distance rlog RPKM:
sampleDists <- dist( t( TPMsRlog ) )
as.matrix( sampleDists )[1:4,1:4]
sampleDistMatrix <- as.matrix( sampleDists )

maxRowWidth <- max(nchar(rownames(sampleDistMatrix)))/3

quant <- quantile(as.numeric(sampleDistMatrix),0:20/20, na.rm=T)
col = colorRamp2(quant, colorRampPalette(c("#0000b3", "white", "#cc0000"))(21))

all(rownames(sampleDistMatrix) == rownames(sampleAnnotForPlot))
maxNameLength <- max(nchar(rownames(sampleDistMatrix)))

outf <- paste(outDir,"SupplFigure-S4-sampleDist-heatmap.png",sep="")

png(width = 3000,height = 3000,file = outf, res = 250)
par(oma=c(2,5,2,2))
ha <- HeatmapAnnotation(df=sampleAnnotForPlot,col=annotColor,na_col = "grey", show_legend = T, 
                        show_annotation_name = T)

Heatmap(sampleDistMatrix, col=col, bottom_annotation = ha, 
        name="distance",#column_title = "sample distance - rlog(TPM)",
        show_column_names=T, column_names_max_height = unit(maxNameLength/4,"cm"), 
        column_names_gp = gpar(fontsize = 10),row_names_gp = gpar(fontsize = 10),
        row_names_max_width = unit(maxNameLength/4, "cm"), column_dend_height = unit(30, "mm"))
dev.off()

[1] "sample distance..."


Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010
SXR0002,0.0,40.75535,48.77871,41.01392
SXR0004,40.75535,0.0,37.88004,30.74493
SXR0006,48.77871,37.88004,0.0,36.46403
SXR0010,41.01392,30.74493,36.46403,0.0
