### Load libraries and functions

In [4]:
#Load libraries
library(Seurat)
library(stringr)
library(viridis)
library(ggplot2)
library(cowplot)
library(cluster)
library(data.table)
library(foreach)
library(doParallel)
library(proxy)
library(ComplexHeatmap)
library(circlize)
library(igraph)
library(qvalue)
library(dplyr)
library(viridis)
library(VGAM)
library(forcats)
library(grDevices)
library(graphics)
library(RColorBrewer)
library(pheatmap)
library(Cairo)
library(reshape2)
library(R.utils)
library(Rcpp)
library(parallelDist)
set.seed(seed = 42)

Attaching SeuratObject

Loading required package: viridisLite

Loading required package: iterators

Loading required package: parallel


Attaching package: ‘proxy’


The following objects are masked from ‘package:stats’:

    as.dist, dist


The following object is masked from ‘package:base’:

    as.matrix


Loading required package: grid

ComplexHeatmap version 2.11.1
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.

The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))


circlize version 0.4.14
CRA

In [2]:
#load basic functions
createEmptyDf = function( nrow, ncol, colnames = c() ){
  data.frame( matrix( vector(), nrow, ncol, dimnames = list( c(), colnames ) ) )
}

tableread_fast = function(i, header=TRUE, quote="", sep=","){
  tmp = fread(i, header=header, sep=sep, quote=quote, nThread=32)
  tmp = as.data.frame(tmp)
  return(tmp)
}

### Download dataset

In [4]:
#Please download data from following http and put them in your working directory

#SCT_raw.data.tar.gz
#https://drive.google.com/file/d/1xyffNUUdoH8qD2O9ua6yMTqVAwFJNrRm/view?usp=sharing

#scTCR_raw.data.tar.gz
#https://drive.google.com/file/d/1UOgi6re2WdIfX6-e7MnRrCJI1RjeNcC7/view?usp=sharing

#Bulk TCRseq data with AIM status
#https://drive.google.com/file/d/18egQKUu4Htoqg9TkuRXh6Kn_-z3xySP4/view?usp=sharing

###Unzip rawdata
gunzip("SCT_raw.data.tar.gz")
gunzip("scTCR_raw.data.tar.gz")
gunzip("WGCNA_AIMcombined_selected.tar.gz")
untar("SCT_raw.data.tar")
untar("scTCR_raw.data.tar")
untar("WGCNA_AIMcombined_selected.tar")

ERROR: Error in parse(text = x, srcfile = src): <text>:2:47: unexpected symbol
1: ##Download dataset
2: system("wget --load-cookies /tmp/cookies.txt "https
                                                 ^


### Create Seurat object and filter genes

In [8]:
###"ID28" in single-cell dataset corresponds to "ID027" in TCR repertoire dataset ###

##load data of ID02
#Put gene list in the target panel in the working directory ("BD_genes_human")

###Input layer
set.seed(seed = 42)

sample.name <- "ID02"
out.dir <- "scTCR_raw.data"

################################Processing layer##########################################
dir.create(out.dir)

##Load targeted gene expression data
input_name <- str_c("raw.data/", sample.name, "Target_results/matrix/", sep = "") %>% 
                str_c("matrix_inflection_", sample.name, "Target.txt", sep = "")
compressed.name <- str_c(input_name, ".gz", sep = "")
gunzip(compressed.name)
#Create Seurat object
matrix <- tableread_fast(input_name, header = TRUE, quote="", sep="\t")
row.names(matrix) <- matrix$V1
matrix <- dplyr::select(matrix, -V1)
seu1 <- CreateSeuratObject(counts=matrix, project = "seu", min.cells = 3, min.features = 10)
#Output cell barcode information for scTCR analysis
name.output <- str_c(out.dir, sample.name, sep = "/") %>% str_c("BC_target.txt", sep = "_")
BC <- row.names(seu1@meta.data)
write.table(BC, name.output, row.names = FALSE, quote = FALSE, col.names = FALSE)

##Load protein expression data
input_name <- str_c("raw.data/", sample.name, "Abseq_results/matrix/", sep = "") %>% 
                str_c("Hashtag_top1M_", sample.name, "Abseq.txt", sep = "")
compressed.name <- str_c(input_name, ".gz", sep = "")
#gunzip(compressed.name)
#Extract cells in SCT seurat object
Abseq.matrix <- tableread_fast(input_name, header = TRUE, quote="", sep="\t")
Abseq.matrix <- dplyr::filter(Abseq.matrix, CellBC %in% BC)
row.names(Abseq.matrix) <- Abseq.matrix$CellBC
Abseq.matrix <- dplyr::select(Abseq.matrix, -CellBC)
Abseq.matrix <- t(Abseq.matrix)
all.equal(rownames(seu1@meta.data), colnames(Abseq.matrix)) #Certify

##Combine ADT data into SCT seurat object
adt_assay <- CreateAssayObject(counts = Abseq.matrix)
seu1[["ADT"]] <- adt_assay
Assays(seu1)

“'scTCR_raw.data' already exists”


In [10]:
##load data of ID02
#Put gene list in the target panel in the working directory ("BD_genes_human")

###Input layer
set.seed(seed = 42)

sample.name <- "ID28"
out.dir <- "scTCR_raw.data"

################################Processing layer##########################################
dir.create(out.dir)

##Load targeted gene expression data
input_name <- str_c("raw.data/", sample.name, "Target_results/matrix/", sep = "") %>% 
                str_c("matrix_inflection_", sample.name, "Target.txt", sep = "")
compressed.name <- str_c(input_name, ".gz", sep = "")
gunzip(compressed.name)
#Create Seurat object
matrix <- tableread_fast(input_name, header = TRUE, quote="", sep="\t")
row.names(matrix) <- matrix$V1
matrix <- dplyr::select(matrix, -V1)
seu2 <- CreateSeuratObject(counts=matrix, project = "seu", min.cells = 3, min.features = 10)
#Output cell barcode information for scTCR analysis
name.output <- str_c(out.dir, sample.name, sep = "/") %>% str_c("BC_target.txt", sep = "_")
BC <- row.names(seu2@meta.data)
write.table(BC, name.output, row.names = FALSE, quote = FALSE, col.names = FALSE)

##Load protein expression data
input_name <- str_c("raw.data/", sample.name, "Abseq_results/matrix/", sep = "") %>% 
                str_c("Hashtag_top1M_", sample.name, "Abseq.txt", sep = "")
compressed.name <- str_c(input_name, ".gz", sep = "")
gunzip(compressed.name)
#Extract cells in SCT seurat object
Abseq.matrix <- tableread_fast(input_name, header = TRUE, quote="", sep="\t")
Abseq.matrix <- dplyr::filter(Abseq.matrix, CellBC %in% BC)
row.names(Abseq.matrix) <- Abseq.matrix$CellBC
Abseq.matrix <- dplyr::select(Abseq.matrix, -CellBC)
Abseq.matrix <- t(Abseq.matrix)
all.equal(rownames(seu2@meta.data), colnames(Abseq.matrix)) #Certify

##Combine ADT data into SCT seurat object
adt_assay <- CreateAssayObject(counts = Abseq.matrix)
seu2[["ADT"]] <- adt_assay
Assays(seu2)

“'scTCR_raw.data' already exists”


“Some cell names are duplicated across objects provided. Renaming to enforce unique cell names.”


In [13]:
##Merge data of ID02 and ID28

#Input layer
sample.name <- "COVID_merged"

########################## Processing layer ###############################
seu <- merge(x=seu1, y = seu2, add.cell.ids = c("ID02", "ID28"))

name.output <- str_c(sample.name, "rda", sep = ".")
save(seu, name.output)

In [15]:
###Gene filtering

##Input layer
dir.name=("Seurat_plots/")
sample.name <- "COVID_merged"
gene.list.name <- "BD_genes_human.txt"

########################## Processing layer ###############################
dir.create(dir.name)

##load gene list of BD target panel
BD_genes <- read.table(gene.list.name, header = TRUE)
BD_genes <- as.vector(BD_genes$Genesymbol)

##Chose BD target gene list for downstream analysis
res1 = seu@assays$RNA@counts
res1 =res1[res1@Dimnames[[1]] %in% BD_genes,]
ngenes <- length(res1@Dimnames[[1]])
seu@assays$RNA@counts <- res1
seu@assays$RNA@data <- res1

#Scatter plot for gene/read count
file.name=paste(dir.name, sample.name, "Reads.Genes.png", sep='')
png(file.name, width = 512, height = 400)
FeatureScatter(object = seu, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", pt.size = 0.3) +
  geom_hline(yintercept=10) +
  scale_x_log10()
dev.off()

#RidgePlot for gene/read count, 
file.name=paste(dir.name, sample.name, "nCount_RNA_log.png", sep='')
png(file.name, width = 512, height = 400)
RidgePlot(object = seu, features = "nCount_RNA", group.by="orig.ident", ncol = 1) +
  scale_x_log10()
dev.off()

file.name=paste(dir.name, sample.name, "nGenes.png", sep='')
png(file.name, width = 512, height = 400)
RidgePlot(object = seu, features = "nFeature_RNA", group.by="orig.ident", ncol = 1)
dev.off()

#save Seurat object
name.output <- str_c(sample.name, "rda", sep = ".")
save(seu, file=name.output)

“'Seurat_plots' already exists”


Scale for 'x' is already present. Adding another scale for 'x', which will
replace the existing scale.

Picking joint bandwidth of 0.0268



Picking joint bandwidth of 5.33



### 2nd analysis

In [50]:
load("COVID_merged.rda")

### Exclude doublet using CD4/CD8 protein expression by ADT

In [51]:
## Check doublet using CD4 and CD8 ADTs
#CD4 and CD8 double positive cells are regarded as doublet

###Input layer
fea1 <- "CD4"
fea2 <- "CD8"
#Threshold for positive/negative
xthres <- 3 #for fea1
ythres1 <- 2.5 #for fea2
ythres2 <- 1.5 #for fea2

dir.name="S5A"

########################## Processing layer ###############################
DefaultAssay(seu) <- "ADT"
dir.create(dir.name)

#Normalize data
seu <- NormalizeData(seu, normalization.method = "CLR", margin = 2, assay = "ADT")
# Variable feature: All ADT Features
VariableFeatures(seu) <- rownames(seu[["ADT"]])
#Scaling data 
seu = ScaleData(object = seu)

# Draw ADT scatter plots (like biaxial plots for FACS). Note that you can even 'gate' cells if
# desired by using HoverLocator and FeatureLocator
file.name = str_c(dir.name, "FACSPlot", sep='/') %>% str_c(fea1, fea2, "png", sep = ".")
png(file.name, width = 360, height = 360)
p <- FeatureScatter(seu, feature1 = fea1, feature2 = fea2)
p <- p + geom_hline(yintercept = ythres1, size=0.5) +
        geom_hline(yintercept = ythres2, size=0.5) +         
        geom_vline(xintercept = xthres, size=0.5)
plot(p)
dev.off()

“'S5A' already exists”
Normalizing across cells

Centering and scaling data matrix



### Separate CD4 and CD8 cells and dimentional reduction

In [52]:
##Exclude doublet /normalizing RNA and Abseq data / PCA and JackStraw for RNA data

##Input layer
dir.name=("Seurat_plots_2nd/")
sample.CD4 <- "COVID_merged_CD4"
sample.CD8 <- "COVID_merged_CD8"
resol <- 0.5 # resolution for clustering

########################## Processing layer ###############################
dir.create(dir.name)
##Separate CD4 and CD8 cells
DefaultAssay(seu) <- "ADT"
seu.CD8 <- subset(seu, subset = CD4 < 3 & CD8 > 1.5)
seu.CD4 <- subset(seu, subset = CD4 > 3 & CD8 < 2.5)

seus <- list(seu.CD8, seu.CD4)
sample.names <- c(sample.CD8, sample.CD4)

##Process Seurat object (CD4 and CD8) with iteration
for(i in 1:length(seus)){
    #load seurat object
    seu <- seus[[i]]
    sample.name <- sample.names[i]
    
    ##For RNA
    DefaultAssay(seu) <- "RNA"
    #Normalizing and scaling data
    seu = NormalizeData(object = seu, scale.factor=1000000)
    VariableFeatures(seu) <- rownames(seu[["RNA"]]) #Use all genes in target panel for Variable features
    ngenes <- length(x = seu@assays$RNA@var.features)
    all.genes <- rownames(seu)
    seu = ScaleData(object = seu, vars.to.regress = c("nCount_RNA"), features = all.genes) 
    #Perform PCA
    seu = RunPCA(object = seu, features = seu@assays$RNA@var.features, npcs = 100)
    #JackStraw
    seu = JackStraw(object = seu, num.replicate = 100, dims = 50)
    seu <- ScoreJackStraw(object = seu, dims = 1:50, score.thresh = 0.05)
    file.name <- str_c(dir.name, sample.name, sep='') %>% str_c("Jackstraw.png", sep='.')
    png(file.name, width = 1250, height = 500)
    JackStrawPlot(object = seu, dims = 1:50)
    dev.off()
    #Determine PCs used for clustering/tSNE analysis (dims.use)
    #Extract PCs which fulfill the pvalue threshold
    tmp = as.data.frame(seu@reductions$pca@jackstraw@overall.p.values)
    tmp1 = tmp[tmp$Score>0.05,1]
    dims= c(1:(min(tmp1)-1))
    dims

    ##For Abseq
    DefaultAssay(seu) <- 'ADT'
    #Normalizing and scaling data
    seu <- NormalizeData(seu, normalization.method = "CLR", margin = 2, assay = "ADT")
    VariableFeatures(seu) <- rownames(seu[["ADT"]])
    seu = ScaleData(object = seu) 
    #Perform PCA
    seu = RunPCA(object = seu, reduction.name = 'apca')
    
    #Clustering Weighted Nearest Neighbor
    seu <- FindMultiModalNeighbors(seu, reduction.list = list("pca", "apca"), 
                                   dims.list = list(dims, dims))
    seu <- FindClusters(object = seu, graph.name = "wsnn", algorithm = 3, resolution =resol)

    #Dimentional reduction
    seu <- RunUMAP(seu, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")

    #Visualization using UMAP
    p1 = DimPlot(object = seu, reduction = "wnn.umap", label = TRUE, label.size = 10, pt.size = 0.5) +
                theme(axis.title.x = element_text(size=10, family = "Arial"), 
                        axis.title.y = element_text(size=10, family = "Arial"), 
                        axis.text.x = element_text(size=10, colour = 1, family = "Arial"), 
                        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
                theme(panel.border = element_rect(fill = NA, size = 1)) 
    p2 = DimPlot(object = seu, reduction = "wnn.umap", label = FALSE, label.size = 10, pt.size = 0.5, group.by = "orig.ident") +
                theme(axis.title.x = element_text(size=10, family = "Arial"),
                        axis.title.y = element_text(size=10, family = "Arial"),
                        axis.text.x = element_text(size=10, colour = 1, family = "Arial"),
                        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
                theme(panel.border = element_rect(fill = NA, size = 1)) 
    legend1 <- cowplot::get_legend(p1)
    legend2 <- cowplot::get_legend(p2)
    p1 = p1 + theme(legend.position = 'none')
    p2 = p2 + theme(legend.position = 'none')
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("wnn.umap_reso", resol, ".png", sep='')
    save_plot(file = file.name, plot_grid(p1, legend1, p2, legend2, ncol=2, nrow=2), device="png", 
              units="in", dpi = 600, base_width = 10, base_height = 10, limitsize=FALSE)

    ###Extract marker genes
    ##For RNA
    DefaultAssay(seu) <- 'RNA'
    seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
    #Create heatmap with top10 marker genes
    top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
    top10 = as.data.frame(top10)
    top10  = top10 [!duplicated(top10$gene),]
    top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
    top10 = as.data.frame(top10)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.marker_res", resol, ".png", sep='')
    p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
    ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
           width = 20, height = 20, limitsize=FALSE)
    #Output marker gene table
    seu.markers$cluster = as.numeric(seu.markers$cluster)
    seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
    seu.markers = as.data.frame(seu.markers)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
    fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
    save(seu.markers, file=file.name)

    ##For ADT
    DefaultAssay(seu) <- 'ADT'
    seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
    #Create heatmap with top10 marker genes
    top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
    top10 = as.data.frame(top10)
    top10  = top10 [!duplicated(top10$gene),]
    top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
    top10 = as.data.frame(top10)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.marker_res", resol, ".png", sep='')
    p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
    ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
           width = 20, height = 20, limitsize=FALSE)
    #Output marker gene table
    seu.markers$cluster = as.numeric(seu.markers$cluster)
    seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
    seu.markers = as.data.frame(seu.markers)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
    fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
    file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
    save(seu.markers, file=file.name)

    #Output Seurat object in initial analysis
    file.name=paste(sample.name, ".wnn.res", resol, ".rda", sep='')
    save(seu, file=file.name) 
}

“'Seurat_plots_2nd' already exists”
Regressing out nCount_RNA

Centering and scaling data matrix

“The following 4 features requested have zero variance (running reduction without them): AQP9, CCL17, IL33, EPX”
PC_ 1 
Positive:  GZMH, GZMB, FCGR3A, GNLY, NKG7, LGALS1, LAIR2, ITGAM, KLRF1, IFNG 
	   B3GAT1, CCL5, ZNF683, CCL4, CX3CR1, KLRC3, CST7, TBX21, PRF1, CCL3 
	   CTSW, HLA-DPA1, CD63, ITGB2, GZMA, CD300A, HLA-DRA, APOBEC3G, CD244, LYN 
Negative:  IL7R, GZMK, TCF7, KLRB1, PIK3IP1, DUSP1, MYC, DUSP2, LTB, JUNB 
	   BTG1, CXCR4, ZBTB16, FOSB, TNFRSF25, CD27, CD28, JUN, CD69, DPP4 
	   CCR7, LEF1, RORC, CD44, HLA-A, CD48, IL18RAP, TRAT1, PASK, FTH1 
PC_ 2 
Positive:  TYMS, TOP2A, LEF1, MCM4, UBE2C, PASK, HLA-DRA, MKI67, AURKB, SELL 
	   MCM2, CXCR3, CTLA4, ICOS, LGALS9, HMMR, CD8B, PTTG2, CCR7, LGALS1 
	   YBX3, BTLA, CD27, PCNA, ZNF683, RGS1, HLA-DMA, TRIB2, ADA, CD79A 
Negative:  KLRB1, CST7, NKG7, BTG1, GZMA, GNLY, CXCR4, DUSP1, KLRG1, ARL4C 
	   PRF1, ZBTB16, KLRF1, RUNX3, CTSW, 

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12533
Number of edges: 399877

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9033
Number of communities: 19
Elapsed time: 8 seconds


6 singletons identified. 13 final clusters.

“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”
14:36:46 UMAP embedding parameters a = 0.9922 b = 1.112

14:36:46 Commencing smooth kNN distance calibration using 1 thread

14:36:47 Initializing from normalized Laplacian + noise

14:36:47 Commencing optimization for 200 epochs, with 389854 positive edges

14:36:52 Optimization finished

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in P

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculat

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 17588
Number of edges: 445472

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.8769
Number of communities: 22
Elapsed time: 12 seconds


10 singletons identified. 12 final clusters.

14:45:36 UMAP embedding parameters a = 0.9922 b = 1.112

14:45:36 Commencing smooth kNN distance calibration using 1 thread

14:45:37 Initializing from normalized Laplacian + noise

14:45:37 Commencing optimization for 200 epochs, with 539736 positive edges

14:45:44 Optimization finished

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“fo

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 4

Calculating cluster 5

Calculating cluster 6

Calculating cluster 7

Calculating cluster 8

Calculating cluster 9

Calculating cluster 10

Calculating cluster 11

Ca

In [58]:
#Check marker genes for lineage using RNA expression
sample.names <- c("COVID_merged_CD4.wnn.res0.5", "COVID_merged_CD8.wnn.res0.5")
tmp_rna = c("TRBC2", "TRDC", "CD14", "MS4A1", "KLRC1", "CHI3L1")
tmp_adt = c("CD3", "CD19", "CD16", "CD14", "CD56", "CD161")
dir.name="MarkerGene.Plot_2nd"

########################## Processing layer #############################
dir.create(dir.name)
for(i in sample.names){
    load(str_c(i, "rda", sep = "."))
    
    ##Plotting RNA expression
    DefaultAssay(seu) <- "RNA"
    #Scatter plot
    tmp_name <- paste(i, "Scatter.RNA.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- FeaturePlot(seu, features = tmp_rna, ncol = 6, order = TRUE,
                     reduction = "wnn.umap", dims=c(1,2), cols = c("grey", "red"), pt.size = 0.2)
    plot(p)
    dev.off()    
    #Violin plot
    tmp_name <- paste(i, "Violin.RNA.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- VlnPlot(seu, features = tmp_rna, ncol = 6, pt.size = 0.1)
    plot(p)
    dev.off()
    
    ##Plotting Protein expression
    DefaultAssay(seu) <- "ADT"
    #Scatter plot
    tmp_name <- paste(i, "Scatter.ADT.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- FeaturePlot(seu, features = tmp_adt, ncol = 6, order = TRUE,
                     reduction = "wnn.umap", dims=c(1,2), cols = c("grey", "red"), pt.size = 0.2)
    plot(p)
    dev.off()    
    #Violin plot
    tmp_name <- paste(i, "Violin.ADT.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- VlnPlot(seu, features = tmp_adt, ncol = 6, pt.size = 0.1)
    plot(p)
    dev.off()
}


“'MarkerGene.Plot_2nd' already exists”


### 3rd analysis for CD8+ T cells with removing contaminant non-abT cells

In [59]:
load("COVID_merged_CD8.wnn.res0.5.rda")

###Remove contaminant cluster and repeat Seurat pipelines, define cluster resolution

###Input layer 
sample.name <- "COVID_merged_CD8_3rd"
dir.name <- "Seurat_plots_3rd"

#Choose subcluster for analyse
#Exclude cluster 16 (contaminant of myeoid cells)
DefaultAssay(seu) <- "RNA"
seu <- subset(seu, idents = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))

#For defining clustering resolution
resolutions = c(0.1, 0.2, 0.3, 0.4, 0.5)
resols.name = "minute"

########################## Processing layer #############################
dir.create(dir.name)

##For Abseq, Normalize ~ PCA
DefaultAssay(seu) <- 'ADT'
#Normalizing and scaling data
seu <- NormalizeData(seu, normalization.method = "CLR", margin = 2, assay = "ADT")
#Exclude markers from Variable features that were used in CD4/CD8 separation and contami detection
ADT_list <- rownames(seu[["ADT"]])
VariableFeatures(seu) <- ADT_list[-which(ADT_list %in% c("CD4", "CD8", "CD19", "IgD", "IgM"))] 
seu = ScaleData(object = seu) 
#Perform PCA：
seu = RunPCA(object = seu, reduction.name = 'apca')

##For RNA
DefaultAssay(seu) <- "RNA"
#Normalizing and scaling data
seu = NormalizeData(object = seu, scale.factor=1000000)
VariableFeatures(seu) <- rownames(seu[["RNA"]]) #Use all genes in target panel for Variable features
ngenes <- length(x = seu@assays$RNA@var.features)
all.genes <- rownames(seu)
seu = ScaleData(object = seu, vars.to.regress = c("nCount_RNA"), features = all.genes) 
#Perform PCA
seu = RunPCA(object = seu, features = seu@assays$RNA@var.features, npcs = 100)
#JackStraw
seu = JackStraw(object = seu, num.replicate = 100, dims = 50)
seu <- ScoreJackStraw(object = seu, dims = 1:50, score.thresh = 0.05)
file.name <- str_c(dir.name, sample.name, sep='/') %>% str_c("Jackstraw.png", sep='.')
png(file.name, width = 1250, height = 500)
JackStrawPlot(object = seu, dims = 1:50)
dev.off()
#Determine PCs used for clustering/tSNE analysis (dims.use)
#Extract PCs which fulfill the pvalue threshold
tmp = as.data.frame(seu@reductions$pca@jackstraw@overall.p.values)
tmp1 = tmp[tmp$Score>0.05,1]
dims= c(1:(min(tmp1)-1))
dims

#Clustering Weighted Nearest Neighbor
seu <- FindMultiModalNeighbors(
  seu, reduction.list = list("pca", "apca"), 
  dims.list = list(dims, dims))
#Dimentional reduction
seu <- RunUMAP(seu, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")

#Calculate Shilouhette score
distance_matrix = parDist(seu@reductions$wnn.umap@cell.embeddings, method = "euclidean", threads=4)
silhouette_score = NULL
silhouette_score_mean = list()

for (i in c(1:length(resolutions))){
  #Perform clustering
  seu <- FindClusters(object = seu, graph.name = "wsnn", algorithm = 3, resolution =resolutions[i])
  silhouette = silhouette(as.numeric(seu@active.ident), dist = distance_matrix)
  silhouette = silhouette[,3]
  silhouette_score = c(silhouette_score, mean(silhouette))
  names(silhouette_score)[i]=resolutions[i]
  res <- vector()
  for (j in c(0:length(unique(seu@active.ident))-1)){
    tmp = mean(silhouette[seu@active.ident %in% j]) 
    res = c(res, tmp)
  }
  silhouette_score_mean[[i]]=res
  names(silhouette)=rownames(seu@meta.data)
  silhouette.name = sprintf("silhouette_score.res.%s", resolutions[i])
  seu =  AddMetaData(object = seu, metadata = silhouette, col.name = silhouette.name)
}

max_silhouette = names(silhouette_score[max(silhouette_score)])

x = data.frame(resolution = resolutions,
               mean_silhouette_score = silhouette_score)

p_silhouette = ggplot(x, aes(x = resolution, y = mean_silhouette_score)) +
  geom_bar(stat = "identity") +
  ggtitle("Mean silhouette scores of clustering") +
  theme_linedraw() + 
  theme(plot.title=element_text(hjust = 0.5), text=element_text(size=12)) + 
  theme(axis.text.x=element_text(size=12), axis.text.y=element_text(size=12)) 
file.name=paste(sample.name, resols.name, "_silhouette.png", sep='')
file.name=paste(dir.name, file.name, sep='/')
ggsave(file = file.name, plot = p_silhouette, device="png", units="in", dpi = 300,
       width = 4, height = 3, limitsize=FALSE)

In [69]:
#Clustering using RNA and ADTinformation 

##Input layer
dir.name=("Seurat_plots_3rd/")
sample.name <- "COVID_merged_CD8_3rd"
resol <- 0.1 # resolution for clustering

########################## Processing layer ###############################
#Clustering Weighted Nearest Neighbor
seu <- FindClusters(object = seu, graph.name = "wsnn", algorithm = 3, resolution =resol)

#Visualization using UMAP
p1 = DimPlot(object = seu, reduction = "wnn.umap", label = TRUE, label.size = 10, pt.size = 0.5) +
  theme(axis.title.x = element_text(size=10, family = "Arial"), 
        axis.title.y = element_text(size=10, family = "Arial"), 
        axis.text.x = element_text(size=10, colour = 1, family = "Arial"), 
        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
  theme(panel.border = element_rect(fill = NA, size = 1)) 
p2 = DimPlot(object = seu, reduction = "wnn.umap", label = FALSE, label.size = 10, pt.size = 0.5, group.by = "orig.ident") +
  theme(axis.title.x = element_text(size=10, family = "Arial"),
        axis.title.y = element_text(size=10, family = "Arial"),
        axis.text.x = element_text(size=10, colour = 1, family = "Arial"),
        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
  theme(panel.border = element_rect(fill = NA, size = 1)) 

legend1 <- cowplot::get_legend(p1)
legend2 <- cowplot::get_legend(p2)
p1 = p1 + theme(legend.position = 'none')
p2 = p2 + theme(legend.position = 'none')
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("wnn.umap_reso", resol, ".png", sep='')
save_plot(file = file.name, plot_grid(p1, legend1, p2, legend2, ncol=2, nrow=2), device="png", 
          units="in", dpi = 600, base_width = 10, base_height = 10, limitsize=FALSE)

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12392
Number of edges: 397052

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9593
Number of communities: 12
Elapsed time: 10 seconds


6 singletons identified. 6 final clusters.

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font databa

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”


In [70]:
##Extract marker genes for RNA and ADT

###Input layer
dir.name <- "Seurat_plots_3rd"
sample.name <- "COVID_merged_CD8_3rd"

########################## Processing layer #############################
dir.create(dir.name)

##For RNA
DefaultAssay(seu) <- 'RNA'
seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
#Create heatmap with top10 marker genes
top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
top10 = as.data.frame(top10)
top10  = top10 [!duplicated(top10$gene),]
top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
top10 = as.data.frame(top10)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.marker_res", resol, ".png", sep='')
p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
       width = 20, height = 20, limitsize=FALSE)
#Output marker gene table
seu.markers$cluster = as.numeric(seu.markers$cluster)
seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
seu.markers = as.data.frame(seu.markers)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
save(seu.markers, file=file.name)

##For ADT
DefaultAssay(seu) <- 'ADT'
seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
#Create heatmap with top10 marker genes
top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
top10 = as.data.frame(top10)
top10  = top10 [!duplicated(top10$gene),]
top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
top10 = as.data.frame(top10)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.marker_res", resol, ".png", sep='')
p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
       width = 20, height = 20, limitsize=FALSE)
#Output marker gene table
seu.markers$cluster = as.numeric(seu.markers$cluster)
seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
seu.markers = as.data.frame(seu.markers)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
save(seu.markers, file=file.name)

#Output Seurat object in initial analysis
file.name=paste(sample.name, ".wnn.res", resol, ".rda", sep='')
save(seu, file=file.name)

Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 4

Calculating cluster 5

Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 4

Calculating cluster 5



### 3rd analysis for CD4+ T cells with removing contaminant non-abT cells

In [74]:
load("COVID_merged_CD4.wnn.res0.5.rda")

###Remove contaminant cluster and repeat Seurat pipelines, define cluster resolution

###Input layer 
sample.name <- "COVID_merged_CD4_3rd"
dir.name <- "Seurat_plots_3rd"

#Choose subcluster for analyse
#Exclude cluster 16 (contaminant of myeoid cells)
DefaultAssay(seu) <- "RNA"
seu <- subset(seu, idents = c(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11))

#For defining clustering resolution
resolutions = c(0.1, 0.15, 0.2, 0.3, 0.4, 0.5)
resols.name = "minute"

########################## Processing layer #############################
dir.create(dir.name)

##For Abseq, Normalize ~ PCA
DefaultAssay(seu) <- 'ADT'
#Normalizing and scaling data
seu <- NormalizeData(seu, normalization.method = "CLR", margin = 2, assay = "ADT")
#Exclude markers from Variable features that were used in CD4/CD8 separation and contami detection
ADT_list <- rownames(seu[["ADT"]])
VariableFeatures(seu) <- ADT_list[-which(ADT_list %in% c("CD4", "CD8", "CD19", "IgD", "IgM"))] 
seu = ScaleData(object = seu) 
#Perform PCA：
seu = RunPCA(object = seu, reduction.name = 'apca')

##For RNA
DefaultAssay(seu) <- "RNA"
#Normalizing and scaling data
seu = NormalizeData(object = seu, scale.factor=1000000)
VariableFeatures(seu) <- rownames(seu[["RNA"]]) #Use all genes in target panel for Variable features
ngenes <- length(x = seu@assays$RNA@var.features)
all.genes <- rownames(seu)
seu = ScaleData(object = seu, vars.to.regress = c("nCount_RNA"), features = all.genes) 
#Perform PCA
seu = RunPCA(object = seu, features = seu@assays$RNA@var.features, npcs = 100)
#JackStraw
seu = JackStraw(object = seu, num.replicate = 100, dims = 50)
seu <- ScoreJackStraw(object = seu, dims = 1:50, score.thresh = 0.05)
file.name <- str_c(dir.name, sample.name, sep='/') %>% str_c("Jackstraw.png", sep='.')
png(file.name, width = 1250, height = 500)
JackStrawPlot(object = seu, dims = 1:50)
dev.off()
#Determine PCs used for clustering/tSNE analysis (dims.use)
#Extract PCs which fulfill the pvalue threshold
tmp = as.data.frame(seu@reductions$pca@jackstraw@overall.p.values)
tmp1 = tmp[tmp$Score>0.05,1]
dims= c(1:(min(tmp1)-1))
dims

#Clustering Weighted Nearest Neighbor
seu <- FindMultiModalNeighbors(
  seu, reduction.list = list("pca", "apca"), 
  dims.list = list(dims, dims))
#Dimentional reduction
seu <- RunUMAP(seu, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")

#Calculate Shilouhette score
distance_matrix = parDist(seu@reductions$wnn.umap@cell.embeddings, method = "euclidean", threads=4)
silhouette_score = NULL
silhouette_score_mean = list()

for (i in c(1:length(resolutions))){
  #Perform clustering
  seu <- FindClusters(object = seu, graph.name = "wsnn", algorithm = 3, resolution =resolutions[i])
  silhouette = silhouette(as.numeric(seu@active.ident), dist = distance_matrix)
  silhouette = silhouette[,3]
  silhouette_score = c(silhouette_score, mean(silhouette))
  names(silhouette_score)[i]=resolutions[i]
  res <- vector()
  for (j in c(0:length(unique(seu@active.ident))-1)){
    tmp = mean(silhouette[seu@active.ident %in% j]) 
    res = c(res, tmp)
  }
  silhouette_score_mean[[i]]=res
  names(silhouette)=rownames(seu@meta.data)
  silhouette.name = sprintf("silhouette_score.res.%s", resolutions[i])
  seu =  AddMetaData(object = seu, metadata = silhouette, col.name = silhouette.name)
}

max_silhouette = names(silhouette_score[max(silhouette_score)])

x = data.frame(resolution = resolutions,
               mean_silhouette_score = silhouette_score)

p_silhouette = ggplot(x, aes(x = resolution, y = mean_silhouette_score)) +
  geom_bar(stat = "identity") +
  ggtitle("Mean silhouette scores of clustering") +
  theme_linedraw() + 
  theme(plot.title=element_text(hjust = 0.5), text=element_text(size=12)) + 
  theme(axis.text.x=element_text(size=12), axis.text.y=element_text(size=12)) 
file.name=paste(sample.name, resols.name, "_silhouette.png", sep='')
file.name=paste(dir.name, file.name, sep='/')
ggsave(file = file.name, plot = p_silhouette, device="png", units="in", dpi = 300,
       width = 4, height = 3, limitsize=FALSE)

“'Seurat_plots_3rd' already exists”
Normalizing across cells

Centering and scaling data matrix

“You're computing too large a percentage of total singular values, use a standard svd instead.”
“did not converge--results might be invalid!; try increasing work or maxit”
“Requested number is larger than the number of available items (25). Setting to 25.”
“Requested number is larger than the number of available items (25). Setting to 25.”
“Requested number is larger than the number of available items (25). Setting to 25.”
“Requested number is larger than the number of available items (25). Setting to 25.”
“Requested number is larger than the number of available items (25). Setting to 25.”
PC_ 1 
Positive:  CCR7, CD272, CD27, CD3, CXCR5, CD62L, CD278, CD183, CD28, Tim3 
	   CD279, CD127 
Negative:  GITR, CD196, CXCR6, CD11c, CD25, CD16, CD161, HLA-DR, CD14, CD56 
	   CD137, CD134 
PC_ 2 
Positive:  CD127, CD161, CD196, CD28, CD16, CXCR5, CCR7, CXCR6, CD183, CD11c 
	   CD3, CD56 
Negative:  

Calculating cell-specific modality weights

Finding 20 nearest neighbors for each modality.

Calculating kernel bandwidths

Finding multimodal neighbors

Constructing multimodal KNN graph

Constructing multimodal SNN graph

00:29:22 UMAP embedding parameters a = 0.9922 b = 1.112

00:29:22 Commencing smooth kNN distance calibration using 1 thread

00:29:23 Initializing from normalized Laplacian + noise

00:29:23 Commencing optimization for 200 epochs, with 512498 positive edges

00:29:30 Optimization finished



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9395
Number of communities: 15
Elapsed time: 17 seconds


11 singletons identified. 4 final clusters.



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9275
Number of communities: 17
Elapsed time: 14 seconds


11 singletons identified. 6 final clusters.



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9190
Number of communities: 18
Elapsed time: 14 seconds


11 singletons identified. 7 final clusters.



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9044
Number of communities: 21
Elapsed time: 13 seconds


11 singletons identified. 10 final clusters.



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.8930
Number of communities: 22
Elapsed time: 13 seconds


11 singletons identified. 11 final clusters.



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.8823
Number of communities: 24
Elapsed time: 12 seconds


11 singletons identified. 13 final clusters.



In [75]:
#Clustering using RNA and ADTinformation 

##Input layer
dir.name=("Seurat_plots_3rd/")
sample.name <- "COVID_merged_CD4_3rd"
resol <- 0.1 # resolution for clustering

########################## Processing layer ###############################
#Clustering Weighted Nearest Neighbor
seu <- FindClusters(object = seu, graph.name = "wsnn", algorithm = 3, resolution =resol)

#Visualization using UMAP
p1 = DimPlot(object = seu, reduction = "wnn.umap", label = TRUE, label.size = 10, pt.size = 0.5) +
  theme(axis.title.x = element_text(size=10, family = "Arial"), 
        axis.title.y = element_text(size=10, family = "Arial"), 
        axis.text.x = element_text(size=10, colour = 1, family = "Arial"), 
        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
  theme(panel.border = element_rect(fill = NA, size = 1)) 
p2 = DimPlot(object = seu, reduction = "wnn.umap", label = FALSE, label.size = 10, pt.size = 0.5, group.by = "orig.ident") +
  theme(axis.title.x = element_text(size=10, family = "Arial"),
        axis.title.y = element_text(size=10, family = "Arial"),
        axis.text.x = element_text(size=10, colour = 1, family = "Arial"),
        axis.text.y = element_text(size = 10, colour = 1, family = "Arial")) +
  theme(panel.border = element_rect(fill = NA, size = 1)) 

legend1 <- cowplot::get_legend(p1)
legend2 <- cowplot::get_legend(p2)
p1 = p1 + theme(legend.position = 'none')
p2 = p2 + theme(legend.position = 'none')
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("wnn.umap_reso", resol, ".png", sep='')
save_plot(file = file.name, plot_grid(p1, legend1, p2, legend2, ncol=2, nrow=2), device="png", 
          units="in", dpi = 600, base_width = 10, base_height = 10, limitsize=FALSE)

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 16640
Number of edges: 454953

Running smart local moving algorithm...
Maximum modularity in 10 random starts: 0.9395
Number of communities: 15
Elapsed time: 17 seconds


11 singletons identified. 4 final clusters.

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font datab

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

In [76]:
##Extract marker genes for RNA and ADT

###Input layer
dir.name <- "Seurat_plots_3rd"
sample.name <- "COVID_merged_CD4_3rd"

########################## Processing layer #############################
dir.create(dir.name)

##For RNA
DefaultAssay(seu) <- 'RNA'
seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
#Create heatmap with top10 marker genes
top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
top10 = as.data.frame(top10)
top10  = top10 [!duplicated(top10$gene),]
top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
top10 = as.data.frame(top10)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.marker_res", resol, ".png", sep='')
p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
       width = 20, height = 20, limitsize=FALSE)
#Output marker gene table
seu.markers$cluster = as.numeric(seu.markers$cluster)
seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
seu.markers = as.data.frame(seu.markers)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("RNA.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
save(seu.markers, file=file.name)

##For ADT
DefaultAssay(seu) <- 'ADT'
seu.markers = FindAllMarkers(seu, verbose = TRUE, test.use="wilcox", only.pos=TRUE, min.pct=0.1, features.use = NULL, return.thresh=0.05)
#Create heatmap with top10 marker genes
top10 = seu.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
top10 = as.data.frame(top10)
top10  = top10 [!duplicated(top10$gene),]
top10 = top10 %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
top10 = as.data.frame(top10)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.marker_res", resol, ".png", sep='')
p <- DoHeatmap(seu, features = top10$gene, disp.min = -2.5, disp.max = 2.5, size = 8)
ggsave(file = file.name, plot = p, device="png", units="in", dpi = 300,
       width = 20, height = 20, limitsize=FALSE)
#Output marker gene table
seu.markers$cluster = as.numeric(seu.markers$cluster)
seu.markers = seu.markers %>% arrange(desc(avg_log2FC))  %>% arrange(cluster)
seu.markers = as.data.frame(seu.markers)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.txt", sep='')
fwrite(seu.markers, file.name, row.names=F, col.names=T, sep="\t", quote=F)
file.name <- str_c(dir.name, sample.name, sep = "/") %>% str_c("ADT.ALLmarkers_minpct0.1_Adj_p0.05.rda", sep='')
save(seu.markers, file=file.name)

#Output Seurat object in initial analysis
file.name=paste(sample.name, ".wnn.res", resol, ".rda", sep='')
save(seu, file=file.name)

“'Seurat_plots_3rd' already exists”
Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3



In [14]:
### Check marker genes for lineage (CD4 and CD8 dataset)

In [77]:
#Check marker genes for lineage using RNA expression
sample.names <- c("COVID_merged_CD4_3rd.wnn.res0.1", "COVID_merged_CD8_3rd.wnn.res0.1")
tmp_rna = c("TRBC2", "TRDC", "CD14", "MS4A1", "KLRC1", "CHI3L1")
tmp_adt = c("CD3", "CD19", "CD16", "CD14", "CD56", "CD161")
dir.name="MarkerGene.Plot_3rd"

########################## Processing layer #############################
dir.create(dir.name)
for(i in sample.names){
    load(str_c(i, "rda", sep = "."))
    
    ##Plotting RNA expression
    DefaultAssay(seu) <- "RNA"
    #Scatter plot
    tmp_name <- paste(i, "Scatter.RNA.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- FeaturePlot(seu, features = tmp_rna, ncol = 6, order = TRUE,
                     reduction = "wnn.umap", dims=c(1,2), cols = c("grey", "red"), pt.size = 0.2)
    plot(p)
    dev.off()    
    #Violin plot
    tmp_name <- paste(i, "Violin.RNA.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- VlnPlot(seu, features = tmp_rna, ncol = 6, pt.size = 0.1)
    plot(p)
    dev.off()
    
    ##Plotting Protein expression
    DefaultAssay(seu) <- "ADT"
    #Scatter plot
    tmp_name <- paste(i, "Scatter.ADT.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- FeaturePlot(seu, features = tmp_adt, ncol = 6, order = TRUE,
                     reduction = "wnn.umap", dims=c(1,2), cols = c("grey", "red"), pt.size = 0.2)
    plot(p)
    dev.off()    
    #Violin plot
    tmp_name <- paste(i, "Violin.ADT.png", sep = ".")
    file.name = paste(dir.name, tmp_name, sep='/')
    png(file.name, width = 1536, height = 240)
    p <- VlnPlot(seu, features = tmp_adt, ncol = 6, pt.size = 0.1)
    plot(p)
    dev.off()
}

In [14]:
### Integrationa of scTCRseq results into SCT Seurat object

In [78]:
###Define function
#Convert MiXCR output into VDJtools format
Convert <- function(file.name, name.TRA){
    name.input <- str_c(name.TRA, file.name, sep = "/")
    data <- tableread_fast(name.input, header = TRUE, sep = '\t')

    #Extract TCR information from mixcr output
    count.total<-data$cloneCount / data$cloneFraction
    freq<-data$cloneFraction
    cdr3nt<-data$nSeqImputedCDR3
    cdr3aa<-data$aaSeqImputedCDR3
    v<-str_sub(data$bestVHit, end=-4)
    d<-str_sub(data$bestDHit, end=-4)
    j<-str_sub(data$bestJHit, end=-4)
    data3 <- rbind(count.total,freq,cdr3nt,cdr3aa,v,d,j)
    data3 <- as.data.frame(t(data3))
    names(data3) <- c("count.total","freq","cdr3nt","cdr3aa","v","d","j")

    #Extract largest clone in cell barcode and append cell barcode information
    name_out <- str_split(file.name, "_")
    CB <- name_out[[1]][[4]]
    CB_out <- str_split(CB, "\\.")
    CB <- CB_out[[1]][[1]]
    d_out <- data3[1,]
    d_out$CB <- CB

    return(d_out)
}

In [79]:
###Define function
#Create histogram was showing the total number of TCR reads detected in each cell (top left) 
#and the percentage of TCR reads that were the most common among them (bottom left).
#Scatter plot representing these two metrices of cells were also generated
Histogram <- function(d, file.name, sample.name, dir.output){

  d_count_total <- as.numeric(d$count.total)
  
  #Count histogram
  ppi <- 300
  image.file <- str_c(dir.output, file.name, sep = "/") %>% str_c(sample.name, "histogram.count.tiff", sep = '.') 
  tiff(image.file, width=1.2*ppi, height=0.8*ppi, res=ppi)
  p <- ggplot(NULL, aes(x=d_count_total)) +
    geom_histogram(binwidth=0.1) +
    theme_bw(base_size = 6) +
    labs(x = "Read counts of TCR") +
    theme(
      axis.title.y=element_blank(),
      axis.text.x = element_text(family="Arial"),
      axis.text.y = element_text(family="Arial"),
      axis.title=element_text(size=4)) +
    scale_x_continuous(trans=scales::log2_trans(),
                   breaks=scales::trans_breaks("log2",function(x) 2^x),
                   labels=scales::trans_format("log2",scales::math_format(2^.x)))
  print(p)
  dev.off()

  #Frequency histogram  
  d_freq <- as.numeric(d$freq)
  med_f <- 0.3
  ppi <- 300
  image.file <- str_c(dir.output, file.name, sep = "/") %>% str_c(sample.name, "histogram.freq.tiff", sep = '.')
  tiff(image.file, width=1.2*ppi, height=0.8*ppi, res=ppi)
  p <- ggplot(NULL, aes(x=d_freq)) +
    geom_histogram(binwidth=0.05) +
    theme_bw(base_size = 6) +
    labs(x = "Proportion of the largest TCR read") +
    theme(
      axis.title.y=element_blank(),
      axis.text.x = element_text(family="Arial"),
      axis.text.y = element_text(family="Arial"),
      axis.title=element_text(size=4)) 
  print(p)
  dev.off()

  df <- data.frame(d_count_total, d_freq)
  
  #Scatter plot    
  total <- nrow(subset(df, df$d_freq >= 0))
  lb <- paste(round(100*nrow( subset(df, df$d_freq < 0.6 & df$d_count_total < 2^5 )) / total, digits = 1), "%", sep="" ) 
  rb <- paste(round(100*nrow( subset(df, df$d_freq >= 0.6 & df$d_count_total < 2^5 )) / total, digits = 1), "%", sep="" ) 
  lt <- paste(round(100*nrow( subset(df, df$d_freq < 0.6 & df$d_count_total >= 2^5 )) / total, digits = 1), "%", sep="" ) 
  rt <- paste(round(100*nrow( subset(df, df$d_freq >= 0.6 & df$d_count_total >= 2^5 )) / total, digits = 1), "%", sep="" ) 
  
  ppi <- 300
  image.file <- str_c(dir.output, file.name, sep = "/") %>% str_c(sample.name, "Scatter.tiff", sep = '.')                     
  tiff(image.file, width=1.2*ppi, height=1.2*ppi, res=ppi)
  p <- ggplot(d, aes(x=d_freq, y=d_count_total)) +  
    stat_bin2d(bins=60) +
    scale_fill_gradient(low="lightblue", high="red") +
    theme_bw(base_size = 6) +
    geom_vline(aes(xintercept = 0.6), size=0.25, colour="black") +
    geom_hline(aes(yintercept = 2^5), size=0.25, colour="black") +
    labs(x = "Proportion of the largest TCR read", y = "Read counts of TCR") +
    theme(
      axis.text.x = element_text(family="Arial"),
      axis.text.y = element_text(family="Arial"),
      axis.title=element_text(size=4)) +
    guides(fill=FALSE) +
    scale_y_continuous(trans=scales::log2_trans(),
                   breaks=scales::trans_breaks("log2",function(x) 2^x),
                   labels=scales::trans_format("log2",scales::math_format(2^.x)))
    p <- p + #xlim(0,1) +
    annotate("text", x=-Inf, y=0, hjust=-0.1, vjust=-0.4, label=lb,
             family="Arial",colour="black",size=1.5) +
    annotate("text", x=Inf, y=0, hjust=1.1, vjust=-0.4, label=rb,
             family="Arial",colour="black",size=1.5) +
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.3, label=lt,
             family="Arial",colour="black",size=1.5) +
    annotate("text", x=Inf, y=Inf, hjust=1.1, vjust=1.3, label=rt,
             family="Arial",colour="black",size=1.5) 
  print(p)
  dev.off()  
}

In [95]:
#Extract TCR sequence for each cel barcode
#iteration process for dataset (CD4/CD8 Seurat object) and clonotype (Ant/Bnt)
set.seed(seed = 42)
seurat.names <- c("COVID_merged_CD4_3rd.wnn.res0.1", "COVID_merged_CD8_3rd.wnn.res0.1")
sample.list <- c("ID02", "ID28")
dir.input.all <- "scTCR_rawdata"
cores <- 12
dir.output <- "scTCR_processing"
#Threshold for valid cell barcode with single TCR sequence
count_th <- 5 #read count threshold for all TCR sequences per cell barcode
freq_th <- 0.6 #proportion threshold for largest TCR
clonotype <- "nt" #Definition of clones. nt: nucleotide sequence / aa: amino acid sequnece 
hashtag <- FALSE #Whether hashtags are used or not.
batch <- TRUE #Whether multiple chips are used or not.

################################ Processing layer #################################################
dir.create(dir.output)


combined.tables <- data.frame()
for(sample.name in sample.list){
    #extract directory name
    dir.input <- str_c(dir.input.all, sample.name, sep = "/")
    
    #Unzip files
    files <- list.files(dir.input, ".tar.bz2")
    for(file in files){
        name.input <- str_c(dir.input, file, sep = "/")
        bunzip2(name.input, remove=FALSE)
        name.input <- str_remove(name.input, pattern = ".bz2")
        untar(name.input)
        file.remove(name.input)
    }
    
    #Convert MiXCR output into VDJtools format
    name.TRA <- str_c(sample.name, "TCR_TRAC1_mixcr", sep = "")
    files  <- list.files(name.TRA, pattern=".txt")
    cl <- makeCluster(cores)
    registerDoParallel(cl)
    TRA.table <- invisible(foreach(file.name = files,
            .combine = rbind, .packages=c("ggplot2", "extrafont", "stringr", "dplyr", "data.table")) %dopar% {Convert(file.name, name.TRA)})
    stopCluster(cl)
    unlink(name.TRA, recursive=TRUE)

    name.TRB <- str_c(sample.name, "TCR_TRBC1_mixcr", sep = "")
    files  <- list.files(name.TRB, pattern=".txt")
    cl <- makeCluster(cores)
    registerDoParallel(cl)
    TRB.table <- invisible(foreach(file.name = files,
            .combine = rbind, .packages=c("ggplot2", "extrafont", "stringr", "dplyr", "data.table")) %dopar% {Convert(file.name, name.TRB)})
    stopCluster(cl)
    unlink(name.TRB, recursive=TRUE)

    #Output histogram and scatter plot for summarizing scTCR status
    Histogram(TRA.table, "TRA", sample.name, dir.output)
    Histogram(TRB.table, "TRB", sample.name, dir.output)

    #thresholding
    TRA.table$count.total <- as.numeric(TRA.table$count.total)
    TRA.table$freq <- as.numeric(TRA.table$freq)
    TRB.table$count.total <- as.numeric(TRB.table$count.total)
    TRB.table$freq <- as.numeric(TRB.table$freq)
    TCRa_th <- subset(TRA.table, count.total >= 2^(count_th) & freq >= freq_th)
    TCRb_th <- subset(TRB.table, count.total >= 2^(count_th) & freq >= freq_th)

    #Paring TCRa and TCRb by cell barcode
    names(TCRa_th) <- c("count.total.A","freq.A","cdr3nt.A","cdr3aa.A","v.A","d.A","j.A", "CB")
    names(TCRb_th) <- c("count.total.B","freq.B","cdr3nt.B","cdr3aa.B","v.B","d.B","j.B", "CB")
    combined <- merge(TCRa_th, TCRb_th, all=T, by ="CB")
    combined$CB <- str_c(sample.name, combined$CB, sep = "_")
    #Exclude cells in which neither TCRa nor TCRb sequence were detected
    combined <- subset(combined, combined$cdr3nt.A != "UD" | combined$cdr3nt.B != "UD")

    #Generate clone id for further analysis
    #Definition of clones can be changed between "ABnt", "ABaa", "Bnt", and "Baa"
    if(clonotype=="nt"){
        combined$clone.id.TCRa <- str_c(combined$cdr3nt.A, combined$v.A, combined$j.A, sep="_")
        combined$clone.id.TCRb <- str_c(combined$cdr3nt.B, combined$v.B, combined$j.B, sep="_")
    }
    if(clonotype=="aa"){
        combined$clone.id.TCRa <- str_c(combined$cdr3aa.A, combined$v.A, combined$j.A, sep="_")
        combined$clone.id.TCRb <- str_c(combined$cdr3aa.B, combined$v.B, combined$j.B, sep="_")
    }
    combined$clone.id.TCRab <- str_c(combined$clone.id.TCRa, combined$clone.id.TCRb, sep = "_")
    name.output <- str_c(dir.output, sample.name, sep = "/") %>% str_c(clonotype, "table", "count_th", count_th, "freq_th", freq_th, clonotype, "csv", sep = ".")
    write.csv(combined, name.output, row.names = FALSE)
    
    combined.tables <- rbind(combined.tables, combined)
}

##Integrate to the Seurat object; iteration for dataset (CD4/CD8 Seurat object))
for(seurat.name in seurat.names){
    #load Seurat object
    name.input <- str_c(seurat.name, "rda", sep = ".")
    load(name.input)
    
    #Extract meta.data and cell BC information
    meta.data <- seu@meta.data
    meta.data$names <- row.names(meta.data)
    if(hashtag == TRUE){
        if(batch == FALSE){
            CB.info <- str_split(meta.data$names, pattern = "_", simplify = TRUE)
            meta.data$CB <- CB.info[,ncol(CB.info)]
        } else {
            CB.info <- str_split(meta.data$names, pattern = "_", simplify = TRUE)
            meta.data$CB <- str_c(CB.info[,1], CB.info[,3], sep = "_")
        }
    } else {
            meta.data$CB <- meta.data$names
    }

    #Merge to Seurat object of SCT data
    meta.data <- merge(combined.tables, meta.data, all.y = T, by ="CB")
    clone.ids <- c("clone.id.TCRa", "clone.id.TCRb", "clone.id.TCRab")
    for(clone.id in clone.ids){
        clone.id.list <- dplyr::select(meta.data, clone.id)
        row.names(clone.id.list)=as.character(meta.data$names)
        seu <- AddMetaData(object = seu, metadata = clone.id.list, col.name = clone.id)
    }
    
    #Output Seurat object
    name.output <- str_c(seurat.name, "scTCRmerged", clonotype, "rda", sep = ".")
    save(seu, file=name.output)
}


“'scTCR_processing' already exists”
“Removed 2538 rows containing non-finite values (stat_bin).”
“Removed 2538 rows containing non-finite values (stat_bin).”
“`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.”
“Transformation introduced infinite values in continuous y-axis”
“Transformation introduced infinite values in continuous y-axis”
“Removed 2538 rows containing non-finite values (stat_bin2d).”
“Removed 2646 rows containing non-finite values (stat_bin).”
“Removed 2646 rows containing non-finite values (stat_bin).”
“`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.”
“Transformation introduced infinite values in continuous y-axis”
“Transformation introduced infinite values in continuous y-axis”
“Removed 2646 rows containing non-finite values (stat_bin2d).”
“Removed 1893 rows containing non-finite values (stat_bin).”
“Removed 1893 rows containing non-finite values (stat_bin).”
“`guides(<scale> = FALSE)` is de

In [None]:
### Integrating WGCNA and AIM status of clones into scTCRseq results

In [100]:
###Define function

##Make clone table by assembling clones from scTCR results 
CloneSummary <- function(seu, dir.name, sample.name, clonotype, cell){
    #seu: Seurat object for analyze
    #dir.name: directory name for output
    dir.create(dir.name)

    #load metadata
    meta.data <- as.data.frame(seu@meta.data)
    meta.data$CellBC <- row.names(meta.data)
    #extract cells only in the assigned sample
    meta.data <- dplyr::filter(meta.data, str_detect(CellBC, sample.name))
    
    #Define clone by TCRa / TCRb/ TCRa&b
    if(clonotype == "TCRa"){
       meta.data$clone.id <- meta.data$clone.id.TCRa 
    }
    if(clonotype == "TCRb"){
       meta.data$clone.id <- meta.data$clone.id.TCRb 
    }
    if(clonotype == "TCRab"){
       meta.data$clone.id <- meta.data$clone.id.TCRab 
    } 

    #Summarize cluster distribution of each clone
    tmp_out = table(meta.data$seurat_clusters, meta.data$clone.id)
    tmp_out2 <- as.data.frame(tmp_out, row.names = NULL,
                  responseName = "Freq", stringsAsFactors = TRUE,
                  sep = "", base = list(LETTERS))
    names(tmp_out2) <- c("Clust", "names", "Freq")
    tmp_out2 <- dcast(tmp_out2, names ~ Clust)

    #Summarize the count and frequency of clones 
    tmp = meta.data %>% group_by(`clone.id`) %>%
      dplyr::summarise(count = n()) %>%
      dplyr::arrange(desc(count))
    tmp = as.data.frame(tmp)
    tmp <- tmp[!is.na(tmp$clone.id), ] #NAである細胞を除く
    tmp[,3]=tmp[,2]/sum(tmp[,2])
    colnames(tmp)=c("ntSeq_TRA_TRB_freq", "CloneCount", "CloneFreq")
    tmp <- tmp[order(tmp$ntSeq_TRA_TRB_freq),]
    temp_count <- as.vector(tmp$CloneCount)
    temp_freq <- as.vector(tmp$CloneFreq)
    #Combine to the cluster distribution
    tmp_out3 <- cbind(temp_count, temp_freq, tmp_out2)

    #Assign ranks to clones
    tmp_out3 <- tmp_out3[order(tmp_out3$temp_freq, decreasing=T),]
    rank <-  1:nrow(tmp_out3)
    tmp_out3 <- cbind(rank, tmp_out3)
    tmp_out3 <- data.frame(tmp_out3)
    tmp_out3$rank <- paste("Top",tmp_out3$rank, sep="")
    
    #Output
    name.out <- str_c(dir.name, sample.name, sep = "/") %>% str_c(cell, clonotype, "clone_within_cluster.txt", sep = ".")
    write.table(tmp_out3, name.out, row.names=F, col.names=T, sep="\t", quote=F) 
}

In [106]:
##Make clone table by assembling clones from scTCR results 

dir.name <- "scTCR_analysis"
sample.list <- c("ID02", "ID28")
seurat.names <- c("COVID_merged_CD4_3rd.wnn.res0.1.scTCRmerged.nt", "COVID_merged_CD8_3rd.wnn.res0.1.scTCRmerged.nt")

##################### Processing layer #############################################################
dir.create(dir.name)
for(seurat.name in seurat.names){
    #load Seurat object
    name.input <- str_c(seurat.name, "rda", sep = ".")
    load(name.input)
    cell <- str_split(seurat.name, pattern = "_", simplify = TRUE)[3]
    
    for(sample.name in sample.list){
    #Make clone table by assembling clones from scTCR results 
    CloneSummary(seu, dir.name, sample.name, "TCRb", cell)
    } 
}



“'scTCR_analysis' already exists”
“'scTCR_analysis' already exists”
Using Freq as value column: use value.var to override.

“'scTCR_analysis' already exists”
Using Freq as value column: use value.var to override.

“'scTCR_analysis' already exists”
Using Freq as value column: use value.var to override.

“'scTCR_analysis' already exists”
Using Freq as value column: use value.var to override.



In [107]:
###Merge SCT metadata with WGCNA pattern / AIM positivity

#Input layer
dir.input.scTCR <- "scTCR_analysis"
dir.input.wgcna <- "WGCNA_AIMcombined"
dir.output <- "scTCR_analysis"
#Information to merge into scTCR clone table
extract <- c("P1", "P2", "P3", "P4", "class.x", "class.y", "cell")

Tcells <- c("CD4", "CD8")
sample.sct.names <- c("ID02", "ID28")
sample.wgcna.names <- c("002", "027")

##################### Processing layer #############################################################
for(Tcell in Tcells){
    for(i in 1:length(sample.wgcna.names)){
    #load data
    sample.sct <- str_c(dir.input.scTCR, sample.sct.names[i], sep = "/") %>% str_c(Tcell, "TCRb.clone_within_cluster.txt", sep = ".")
    sct <- read.table(sample.sct, header = TRUE)
    sample.wgcna <- str_c(dir.input.wgcna, "COVIDAIM", sep = "/") %>%
        str_c(Tcell, sample.wgcna.names[i], "TCR.th.4.wgcna.aim.csv", sep = "_") 
    wgcna <- read.csv(sample.wgcna, header = TRUE)
  
  #Merge wgcna and sct dataset
  d_output <- merge(sct, wgcna, by.x = "names", by.y = "query", all.x = T)
  
  #Output
  name.out <- str_c(dir.output, sample.wgcna.names[i], sep = "/") %>% str_c(Tcell, "clone_within_cluster.WGCNA.AIM.csv", sep = ".")
  write.csv(d_output, name.out, row.names = FALSE)
    }
}
