# scATAC data export

- Export the cell metadata (read from `metadata.tsv`, written to `cells.tsv`) after replacing the old cluster IDs with the new cluster labels and adding pseudotime values.
- Export `peaks.bed` after adding a peak-set label to each peak.

In [43]:
library(ggplot2)

## Metadata

In [25]:
# Load the cell metadata table from the snapATAC session directory.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned elsewhere in a session.
metadata = read.table("../20200206_pmat_snapATAC/sessions/20210717_n62599/metadata.tsv",
                      header=TRUE,
                      stringsAsFactors=FALSE)

# Keep the original cluster IDs under `old_cluster` and drop `cluster`, so a
# relabelled `cluster` column can be added later without ambiguity.
metadata$old_cluster = metadata$cluster
metadata$cluster = NULL
head(metadata)

Unnamed: 0_level_0,barcode,sample,umap1,umap2,old_cluster
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>
1,AAACACGCTAGTACGG,D0,-1.8037916,-9.507773,1
2,AAACACGCTCTTCAAG,D0,0.7074551,-9.406457,1
3,AAACACGCTGATCCAT,D0,-2.9559933,-7.641148,1
4,AAACCCTGACCGCTAG,D0,-3.6119107,-8.443144,1
5,AAACCCTGATTGCAGT,D0,0.5002119,-8.759449,3
6,AAACCGTACACACCTA,D0,-1.7333966,-9.259262,1


In [26]:
# Row and column counts of the metadata table.
c(nrow(metadata), ncol(metadata))

In [27]:
# Load the cluster configuration table (old cluster ID, colour, description,
# new cluster ID). comment.char is disabled because the `colour` column holds
# hex codes starting with '#', which read.table would otherwise treat as the
# start of a comment.
cluster_config = read.table("../../figures_factory/configs/cluster.tsv",
                            comment.char='', sep='\t', header=TRUE, stringsAsFactors=FALSE)
cluster_config

cluster,colour,description,new_cluster
<int>,<chr>,<chr>,<int>
1,#B03743,Fibroblast,1
2,#E85F6D,Fibroblast-like,3
3,#7C2730,Fibroblast-like,2
4,#F0A118,Intermediate,12
5,#78A824,Pre-iPSC,13
6,#A6D854,Pre-iPSC,14
7,#F01D35,Fibroblast-like,5
8,#13C4A3,iPSC,15
9,#406614,Partially-reprogrammed,10
10,#D4B81C,Intermediate,9


In [28]:
# Translate old cluster IDs into new cluster IDs.
# Look the IDs up by value in `cluster_config$cluster` instead of using them
# as row positions: positional indexing is only correct while row i of the
# config happens to describe cluster i.
config_row = match(metadata$old_cluster, cluster_config$cluster)
# Fail loudly if any cell carries a cluster ID that the config does not list.
stopifnot(!anyNA(config_row))
metadata$cluster = cluster_config$new_cluster[config_row]
metadata$old_cluster = NULL
head(metadata)

Unnamed: 0_level_0,barcode,sample,umap1,umap2,cluster
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>
1,AAACACGCTAGTACGG,D0,-1.8037916,-9.507773,1
2,AAACACGCTCTTCAAG,D0,0.7074551,-9.406457,1
3,AAACACGCTGATCCAT,D0,-2.9559933,-7.641148,1
4,AAACCCTGACCGCTAG,D0,-3.6119107,-8.443144,1
5,AAACCCTGATTGCAGT,D0,0.5002119,-8.759449,2
6,AAACCGTACACACCTA,D0,-1.7333966,-9.259262,1


In [35]:
# Load the per-cell pseudotime table. The first column of the file is used as
# row names (row.names=1); cells are identified by `sample_barcode`.
pseudotime = read.table("../20200217_trajectory/sessions/20211003_n62599/pseudotime.tsv",
                        sep='\t', header=TRUE, row.names=1,
                        stringsAsFactors=FALSE)
head(pseudotime)

Unnamed: 0_level_0,sample_barcode,dpt_pseudotime_fibr,dpt_pseudotime_high_OSK
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
0,D0_AAACACGCTAGTACGG,0.0,0.8747373
1,D0_AAACACGCTCTTCAAG,0.0388495,0.8782327
2,D0_AAACACGCTGATCCAT,0.10171443,0.8466607
3,D0_AAACCCTGACCGCTAG,0.09229346,0.8495939
4,D0_AAACCCTGATTGCAGT,0.05027944,0.8626735
5,D0_AAACCGTACACACCTA,0.03454594,0.8612965


In [39]:
# Do the pseudotime rows line up one-to-one, in order, with the metadata rows?
# The pseudotime key has the form "<sample>_<barcode>".
all(paste(metadata$sample, metadata$barcode, sep='_') == pseudotime$sample_barcode)

In [40]:
# Pseudotime values are copied by row position, so require that both tables
# list the same cells in the same order before attaching them. Without this
# guard a reordered or truncated table would silently mislabel cells.
stopifnot(
    nrow(pseudotime) == nrow(metadata),
    all(pseudotime$sample_barcode == paste(metadata$sample, metadata$barcode, sep='_'))
)
metadata$dpt_pseudotime_fibr_root = pseudotime$dpt_pseudotime_fibr
metadata$dpt_pseudotime_xOSK_root = pseudotime$dpt_pseudotime_high_OSK

In [104]:
# Preview the first rows of the final metadata table before export.
head(metadata, n = 6L)

Unnamed: 0_level_0,barcode,sample,umap1,umap2,cluster,dpt_pseudotime_fibr_root,dpt_pseudotime_xOSK_root
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
1,AAACACGCTAGTACGG,D0,-1.8037916,-9.507773,1,0.0,0.8747373
2,AAACACGCTCTTCAAG,D0,0.7074551,-9.406457,1,0.0388495,0.8782327
3,AAACACGCTGATCCAT,D0,-2.9559933,-7.641148,1,0.10171443,0.8466607
4,AAACCCTGACCGCTAG,D0,-3.6119107,-8.443144,1,0.09229346,0.8495939
5,AAACCCTGATTGCAGT,D0,0.5002119,-8.759449,2,0.05027944,0.8626735
6,AAACCGTACACACCTA,D0,-1.7333966,-9.259262,1,0.03454594,0.8612965


In [55]:
# Export the relabelled metadata with pseudotime columns as a tab-separated
# file with a header row, no row names and no quoting.
write.table(metadata,
            file="/oak/stanford/groups/akundaje/surag/projects/reprog-package/analysis/scATAC/cells.tsv",
            sep='\t', row.names=FALSE, quote=FALSE)

## Peaks

In [75]:
# Load the full peak list. The file is read without a header, so the three
# columns are named explicitly.
peaks = read.table("../20200206_pmat_snapATAC/sessions/20210717_n62599/peaks.bed",
                   stringsAsFactors=FALSE)
colnames(peaks) = c("chr", "start", "end")
head(peaks)

Unnamed: 0_level_0,chr,start,end
Unnamed: 0_level_1,<chr>,<int>,<int>
1,chr17,32960817,32961317
2,chr17,75764252,75764752
3,chr17,1473114,1473614
4,chr17,4206897,4207397
5,chr17,423409,423909
6,chr17,1964634,1965134


In [60]:
# Load the 20 peak-set BED files (idx1.bed ... idx20.bed) into a list, so that
# peak_sets[[i]] holds the peaks of set i with columns chr/start/end.
# lapply builds the list in one go instead of growing it inside a loop.
peak_set_dir = "../20200307_fine_clustering/beds/20210719_gridmap_peakwidthnorm_logplusznorm_4way_assited_n20"
peak_sets = lapply(seq_len(20), function(i) {
    s = read.table(file.path(peak_set_dir, sprintf("idx%s.bed", i)),
                   stringsAsFactors=FALSE)
    colnames(s) = c("chr", "start", "end")
    s
})

In [71]:
# Display name for each peak set, indexed by peak-set number (1..20):
#   1-4   OC1..OC4
#   5     K1
#   6-8   COC/E1..COC/E3
#   9-13  COC/L1..COC/L5
#   14-15 CO/E1..CO/E2
#   16-17 CO/L1..CO/L2
#   18-20 S1..S3
peak_idx_to_name = c(paste0("OC", 1:4),
                     "K1",
                     paste0("COC/E", 1:3),
                     paste0("COC/L", 1:5),
                     paste0("CO/E", 1:2),
                     paste0("CO/L", 1:2),
                     paste0("S", 1:3))
peak_idx_to_name

In [87]:
# Annotate every peak with the name of the peak set it belongs to.
# Peaks that appear in no set keep NA in the `set` column.
peaks_w_set = peaks
peaks_w_set$set = NA_character_

# Every loaded peak set needs a name; tie the loop bound to the data instead
# of a hard-coded count.
stopifnot(length(peak_sets) == length(peak_idx_to_name))

# Key identifying a peak by its coordinates. It does not depend on the set,
# so build it once rather than on every iteration.
peak_keys = paste(peaks$chr, peaks$start, peaks$end)

for (i in seq_along(peak_sets)) {
    print(i)
    set_keys = paste(peak_sets[[i]]$chr, peak_sets[[i]]$start, peak_sets[[i]]$end)
    belongs_to = peak_keys %in% set_keys
    # The number of matched peaks must equal the size of the set, otherwise
    # some set entries are missing from (or duplicated in) the peak list.
    stopifnot(sum(belongs_to)==nrow(peak_sets[[i]]))

    # If a peak occurs in several sets, the later set overwrites the earlier.
    peaks_w_set[belongs_to, "set"] = peak_idx_to_name[i]
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20


In [97]:
# Number of peaks assigned to each peak set (table() leaves out NA entries by
# default, so unassigned peaks are not counted here).
table(peaks_w_set[["set"]])


 CO/E1  CO/E2  CO/L1  CO/L2 COC/E1 COC/E2 COC/E3 COC/L1 COC/L2 COC/L3 COC/L4 
 15179  23696  32573  37539  30214  23500  27541   6907  18844  17471  13119 
COC/L5     K1    OC1    OC2    OC3    OC4     S1     S2     S3 
 21090  28388  39006  51449  21137  37309  19750  32863  28260 

In [89]:
head(peaks_w_set)

Unnamed: 0_level_0,chr,start,end,set
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
1,chr17,32960817,32961317,OC4
2,chr17,75764252,75764752,CO/E1
3,chr17,1473114,1473614,S3
4,chr17,4206897,4207397,OC4
5,chr17,423409,423909,OC4
6,chr17,1964634,1965134,K1


In [99]:
# Count the peaks whose `set` value matches none of the known peak-set names,
# i.e. peaks not assigned to any set.
sum(is.na(match(peaks_w_set$set, peak_idx_to_name)))

In [101]:
# Spot-check: show 10 randomly chosen rows of the annotated peak table.
peaks_w_set[sample.int(nrow(peaks_w_set), size = 10), ]

Unnamed: 0_level_0,chr,start,end,set
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
100578,chr2,121648647,121649147,S2
516749,chrX,139903121,139903621,COC/L4
369365,chr19,48111209,48111709,CO/L1
102261,chr2,234191738,234192238,OC2
158,chr17,13695960,13696460,COC/E1
196741,chr7,90764762,90765262,CO/L2
383167,chr6,19289512,19290012,COC/L2
485170,chr22,50423557,50424057,K1
379676,chr6,79668379,79668879,COC/E2
403678,chr6,10954461,10954961,S2


In [103]:
# Export the annotated peaks as a headerless, tab-separated BED-style file
# with columns chr, start, end, set. No row names, no quoting.
write.table(peaks_w_set,
            file="/oak/stanford/groups/akundaje/surag/projects/reprog-package/analysis/scATAC/peaks.bed",
            sep='\t', row.names=FALSE, quote=FALSE, col.names=FALSE)