/
clustering_dimensionalityreduction_pseudotime_git.R
311 lines (253 loc) · 17.4 KB
/
clustering_dimensionalityreduction_pseudotime_git.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
###############################
## INSTALL REQUIRED PACKAGES ##
###############################
install.packages("reshape2")
install.packages("ggplot2")
install.packages("uwot")
install.packages("ggrepel")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("scales")
install.packages("reshape2")
install.packages("RColorBrewer")
install.packages("devtools")
install.packages("BiocManager")
BiocManager::install("FlowSOM")
BiocManager::install("slingshot")
BiocManager::install("flowCore")
BiocManager::install("SingleCellExperiment")
library(devtools)
devtools::install_github("JinmiaoChenLab/cytofkit2")
devtools::install_github('flying-sheep/knn.covertree')
devtools::install_github('theislab/destiny')
###############################
### LOAD REQUIRED PACKAGES ####
###############################
library(flowCore)
library(FlowSOM)
library(SingleCellExperiment)
library(dplyr)
library(ggplot2)
library(scales)
library(reshape2)
library(RColorBrewer)
library(destiny)
library(uwot)
library(slingshot)
library(cytofkit2)
library(ggrepel)
############################
######## LOAD DATA #########
############################
### Load the (transformed, normalized, unclustered) FCS files from the 'CSV_to_transformed_normalized_FCS' script
### Or load fcs files which where clustered in Cytosplore (in this case each fcs file is 1 cluster)
## Provide the directory of the fcs files
dirFCS = "path_to_your_folder_with_fcs_files"
## Optional: when loading clustered fcs files from cytosplore, provide the directory of the text file 'CSPLR_ST.txt'. Cytosplore exports this file upon running the HSNE. This file contains the decoding of the sample numbers.
pathST <- "I:/Laboratoria/immlab/Janine/flowpaper/JI/rebuttal/newfigures/CD4_level2/clusters/CSPLR_ST.txt"
## Defining a function to read multiple fcs files from a directory 'dir' into a single data.frame:
# NB: The column in the output named 'fileName' tracks the original file where each cell came from.
# Optionally perform remapping of column 'CSPLR_ST' holding cytosplore sample numbers to actual names:
read.flowdat <- function(dir,path_CSPLR_ST=""){
# Read:
filepaths <- list.files(path=dir,pattern = ".fcs", full.names=TRUE)
flowset <- read.flowSet(files=filepaths, transformation=FALSE, truncate_max_range = FALSE)
# Transform to data frame:
x <- as.data.frame(exprs(as(flowset,'flowFrame')),stringsAsFactors=FALSE)
# Map column 'Original' to filename (in this case holding clusters of HSNE):
filenames <- gsub("[.fcs]","",list.files(path=dir,pattern = ".fcs", full.names=FALSE))
names(filenames) <- sort(unique(x$Original))
x$fileName <- filenames[as.character(x$Original)]
# Remove column 'Original':
x <- x[,-which(colnames(x)=="Original")]
# Optionally remap Cytosplore sample tags to original filename:
if(file.exists(path_CSPLR_ST)){
# Read:
sampID <- gsub(".fcs","",basename(sapply(strsplit(readLines(path_CSPLR_ST),": "),function(x) x[1])))
names(sampID) <- sapply(strsplit(readLines(path_CSPLR_ST),": "),function(x) x[2])
x$sampleID <- sampID[as.character(x$CSPLR_ST)]
}
return(x)
}
## Read fcs files
# In our example we will read the data which were clustered in Cytosplore (each fcs file is 1 cluster)
df <- read.flowdat(dir=dirFCS,path_CSPLR_ST = pathST)
# Optional: Set columname 'fileName' to clusters_HSNE:
colnames(df)[which(colnames(df)=="fileName")] <- "clusters_HSNE"
## In our example we will start with the 275856 CD4 T cells
############################
######## CLUSTERING ########
############################
### We will discuss 3 clustering methods:
### A) HSNE-based Gaussian Mean Shift clustering
### B) flowSOM
### C) Phenograph
## Option A: HSNE-based Gaussian Mean Shift clustering
## With the software cytosplore HSNE-based GMS clustering can be performed. The clustering results with the expression values can be exported as fcs and loaded as described above
## Option B: generate clusters by FlowSOM
#check colnames, to determine which columns you need for the cluster calculation
colnames(df)
#run flowsom
flowsom <- FlowSOM(input = dirFCS,
transform = FALSE,
scale = FALSE,
colsToUse = c(7:9, 11, 13:16,18,19), #provide the columns for the clustering
nClus = 14, #we choose 14, since we also generated 14 clusters by HSNE
seed = 100)
# Get metaclustering per cell
clusters_flowsom <- as.factor(flowsom$map$mapping[,1])
levels(clusters_flowsom) <- flowsom$metaclustering
#add flowsom clusters to dataframe
df <- cbind(df, clusters_flowsom)
## Option C: generate clusters by Phenograph (based on Louvain clustering)
# select the columns for the clustering calculation (usually the numbers are the same as used for the flowsom calculation)
#the higher the K nearest neighbours, the lower the number of clusters
phenograph <- Rphenograph(df[,c(7:9,11, 13:16,18,19)], k=50)
clusters_phenograph <- as.factor(phenograph$membership)
#add phenograph clusters to expression data frame
df <- cbind(df, clusters_phenograph)
###############################
######## VISUALIZATION ########
###############################
### We will discuss 3 visualization methods:
### A) HSNE
### B) diffusion map
### C) UMAP
## option A: HSNE
# HSNE can only be visualized in Cytosplore. Import the FCS files from the 'CSV_to_transformed_normalized_FCS' script to cytosplore to perform HSNE analysis
## Option B: diffusion map (for our example CD4 T cell dataset this will take approximately 2hours)
# reduce the K, if computational load is too high [it takes approximately 2 hours for the example dataset of 275856 cells]
dm <- DiffusionMap(df, vars = c("CD95", "CD8", "CD27", "CCR7", "CD45RA", "CD49b", "CD69", "CD103", "CD3", "CD4"), k=1000, suppress_dpt = TRUE, verbose=TRUE)
# add the diffusion components to the expression data frame (either all by dm@eigenvectors, or a selection by dm$DC1, dm$DC2, etc.)
df <- cbind(df, DC1=dm$DC1, DC2=dm$DC2, DC3=dm$DC3)
# Vizualize results:
# With the viz.dm function the two components of the diffusion map can be visualized with the expression levels of a particular parameter applied as color scale
# dat: the data frame
# param.name: the name of the parameter which you would like to plot as color scale
# if no limits are provided: for better visualization, the most extreme low and high percentile of the color values are normalized.
# if limits are provided: the color scale will be adjusted according to the provided limits (recommended for parameters which are expressed on all cells). To apply the same colorscale as in Cytosplore, provide the limits as set in Cytosplore.
viz.dm <- function(dat,dr, param.name,limits=NULL){
ColVal <- dat[,param.name]
if(is.null(limits)){
Lim <- quantile(ColVal,probs=seq(0,1,0.01))[c(2,100)]
p <- ggplot(dat, aes(x = DC1, y =DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=Lim, oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
} else {
p <- ggplot(dat, aes(x = DC1, y = DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=c(limits[1],limits[2]), oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
}
p
}
viz.dm(dat=df,param.name='CD27', limits=c(-0.02,3.17))
viz.dm(dat=df, param.name="CD45RA")
#visualize and label clusters on diffusion map
#first we need to determine the positions of the cluster labels, based on the DC coordinates
label_HSNE_dm <- df%>%group_by(clusters_HSNE)%>%select(DC1, DC2)%>%summarize_all(mean)
label_flowsom_dm <- df%>%group_by(clusters_flowsom)%>%select(DC1, DC2)%>%summarize_all(mean)
label_pheno_dm <- df%>%group_by(clusters_phenograph)%>%select(DC1, DC2)%>%summarize_all(mean)
ggplot(df, aes(x=DC1, y=DC2, color=clusters_HSNE))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_dm)+guides(colour=FALSE)
ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_dm)+guides(colour=FALSE)
ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_dm)+guides(colour=FALSE)
## Option C: UMAP
# select the columns for the UMAP calculation
# check different n_neighbours (controls how UMAP balances local versus global structure in the data) for your UMAP plot
# check min_dist (controls how tightly UMAP is allowed to pack points together, low values=clumpier embeddings) for your UMAP plot
umap <- umap(df[,c(7:9, 11, 13:16, 18, 19)], n_neighbors = 30, min_dist=0.001, verbose=TRUE)
umap<- as.data.frame(umap)
colnames(umap) <- c('umap_1', 'umap_2')
df <- cbind(df,umap)
# Vizualize results:
# With the viz.umap function the two components of the umap can be visualized with the expression levels of a particular parameter applied as color scale
# dat: the data frame
# param.name: the name of the parameter which you would like to plot as color scale
# if no limits are provided: for better visualization, the most extreme low and high percentile of the color values are normalized.
# if limits are provided: the color scale will be adjusted according to the provided limits (recommended for parameters which are expressed on all cells). To apply the same colorscale as in Cytosplore, provide the limits as set in Cytosplore.
viz.umap <- function(dat,param.name,limits=NULL){
ColVal <- dat[,param.name]
if(is.null(limits)){
Lim <- quantile(ColVal,probs=seq(0,1,0.01))[c(2,100)]
p <- ggplot(dat, aes(x = umap_1, y =umap_2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=Lim, oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
} else {
p <- ggplot(dat, aes(x = umap_1, y = umap_2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=c(limits[1],limits[2]), oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
}
p
}
viz.umap(dat=df,param.name='CD27', limits=c(-0.02,3.17))
viz.umap(dat=df, param.name="CD45RA")
#visualize and label clusters on umap
label_HSNE_umap <- df%>%group_by(clusters_HSNE)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_flowsom_umap <- df%>%group_by(clusters_flowsom)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_pheno_umap <- df%>%group_by(clusters_phenograph)%>%select(umap_1, umap_2)%>%summarize_all(mean)
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_HSNE)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_umap)+guides(colour=FALSE)
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_umap)+guides(colour=FALSE)
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_umap)+guides(colour=FALSE)
#############################
######## FREQUENCIES ########
#############################
# As an example, we show the cluster frequencies of the HSNE clusters. To check the other cluster frequencies, adjust df$clusters_HSNE and change sampleID to fileName.
# check frequency of each cluster per sample
counts <- as.data.frame.matrix(table(df$sampleID,df$clusters_HSNE))
#calculate percentages of a certain sample present in cluster
counts_percofsample = counts/rowSums(counts)*100
counts_percofsample$total <- rowSums(counts)
counts_percofsample$sampleID <- row.names(counts_percofsample)
counts_percofsample <- melt(counts_percofsample, id.vars=c('sampleID', 'total'), variable.name='cluster', value.name='frequency')
ggplot(counts_percofsample, aes(x=reorder(cluster, frequency, FUN=median), y=frequency))+ geom_boxplot(position='dodge2', outlier.shape=NA)+geom_jitter(aes(colour=sampleID), size=2)+xlab('cluster')
ggplot(counts_percofsample, aes(x=cluster, y=frequency, fill=cluster))+geom_bar(stat='identity')+facet_wrap(~sampleID)+theme(axis.text.x=element_text(angle=30, vjust=.9, hjust=1))
############################
#### COMPUTE SLINGSHOT #####
############################
## To calculate pseudotime we will use Slingshot
## Slingshot requires clusters as input for the lineage identification. In this example, we use the HSNE based clusters.
## Too many clusters will lead to artificial lineages, therefore we merge the CD49b- and CD49b+ subset of each naive and memory population, and relabel them
df$merged_HSNE <-gsub("CD4-1$", "Naive", df$clusters_HSNE)
df$merged_HSNE <-gsub("CD4-2$", "Naive", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-3$", "CD27+ CM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-4$", "CD27+ CM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-5$", "CD27+ EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-6$", "CD27+ EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-7$", "CD27- EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-8$", "CD27- EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-9$", "EMRA", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-10$", "EMRA", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-11$", "CD8dim EMRA", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-12$", "CD8dim EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-13$", "CD69+CD103- EM", df$merged_HSNE)
df$merged_HSNE <-gsub("CD4-14$", "CD103+ EM", df$merged_HSNE)
#to visualize diffusionmap with merged clusters
label_merged_HSNE_dm <- df%>%group_by(merged_HSNE)%>%select(DC1, DC2)%>%summarize_all(mean)
ggplot(df, aes(x=DC1, y=DC2, color=merged_HSNE))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=merged_HSNE), data=label_merged_HSNE_dm)+guides(colour=FALSE)
##Create a Slingshot object and add cell expression and cluster information to this object
#create slingshot object
slingshot_object <- SingleCellExperiment(assays = List(norm = as.matrix(t(df))))
#add expression data to slingshot and select markers to use for the pseudotime calculation
reducedDims(slingshot_object) <- SimpleList(expressiondata = as.matrix(df%>%select(c("CD95", "CD8", "CD27", "CCR7", "CD45RA", "CD49b", "CD69", "CD103", "CD3", "CD4"))))
#Add clusters to slingshot object. Either the HSNE-based clusters, the Phenograph clusters, or flowsomclusters.
#In this example we demonstrate the use of the merged HSNE-based clusters.
colData(slingshot_object)$clusters <- df$merged_HSNE
#calculate lineages (optional: appoint starting cluster)
#In this example we appoint the naive T cells, as starting cluster.
lin <- getLineages(reducedDims(slingshot_object)$expressiondata,colData(slingshot_object)$clusters, start.clus='Naive')
#construct smooth curves and calculate pseudotime [This will take approximately 72 hours for this example dataset]
#to reduce computational time, curves can be approximated by a fixed number of points, for instance 100.
curve <- getCurves(lin, approx_points=FALSE)
#add pseudotime values to the data frame
df <- cbind(df, as.data.frame(slingPseudotime(curve), row.names=FALSE))
#generate table with diffusion map coordinates, pseudotimevalues and cluster assignment (either the HSNE-based clusters or other clusters)
#In this example we use the HSNE-based clusters
#If you would like to use other clusters, adjust 'merged_HSNE' to clusters_flowsom', for instance
pseudotimevalues <- df%>%select(c(DC1, DC2,clusterID=merged_HSNE, curve1, curve2, curve3))
#reshape table
pseudotimevalues <- melt(pseudotimevalues, id.vars=c('clusterID', 'DC1', 'DC2'), variable.name='lineage', value.name = 'pseudotime')
#rename curve to lineage
pseudotimevalues$lineage <- gsub('curve','lineage', pseudotimevalues$lineage)
#exclude cells with NA pseudotimevalues (those cells are not present in all lineages and have NA values for the lineages in which they are absent)
pseudotimevaluesexclNA <- pseudotimevalues%>%filter(pseudotime!='NA')
#generate colorpalette
colors <- colorRampPalette(rev(brewer.pal(11, 'Spectral'))[-6])(100)
#ggplot of each lineage colored by either pseudotime
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=pseudotime),size=0.1, alpha=0.3) +facet_wrap(~lineage)+scale_color_gradientn(colours=colors)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())
#ggplot of each lineage colored by clusters (in our example, clusterID)
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=clusterID),size =0.1)+facet_wrap(~lineage)+theme_bw()
#jitterplot of pseudotimevalues per lineage
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=pseudotime, y=lineage)) + geom_jitter(aes(color=clusterID), size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())
### save final df file ###
write.csv(df, 'df.csv')