-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathselectpin.R
executable file
·138 lines (109 loc) · 5.58 KB
/
selectpin.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
containment.indicator <- function(vstart, vend, wstart, wend){
lw <- length(wstart)
lv <- length(vstart)
z <- cbind(c(vend,wend),c(1:lv,rep(0,lw)),c(rep(0,lv),1:lw))
z<-z[order(z[,1]),]
endbeforeend<-cummax(z[,2])[order(z[,3])][sort(z[,3])!=0]
z<-cbind(c(wstart,vstart),c(rep((lv+1),lw),1:lv),c(1:lw,rep(0,lv)))
z<-z[order(z[,1]),]
startafterstart<-rev(cummin(rev(z[,2])))[order(z[,3])][sort(z[,3])!=0]
return(cbind(startafterstart,endbeforeend))
}
#'Generate the smear tables.
#'
#'Given the breakpoint tables of all single cells generated by \code{preprocess_segfile},
#'extend the breakpoint bins into intervals containing several bins to
#'allow for the uncertainty inherent in segmentation.
#'@param breakpoint_table The breakpoint_table generated by \code{preprocess_segfile}.
#'@param smear An integer. The number of spanning bins on one side. Default value: 2.
#'@param keepboundaries Logical. If TRUE (Default), the bins on chromosome boundaries are kept.
#'@param mask_XY Logical. If TRUE (Default), sex chromosomes (X, Y) are masked.
#'@return smear_table. The cell names (profid), the chromosome (chrom) and bin location (bpstart, bpend)
#'for the extended intervals with length of 2*smear and the change direction (bpsign).
#'@export
findsmears <- function(breakpoint_table, smear = 2, keepboundaries = TRUE, mask_XY = TRUE){
smearchar <- as.character(smear)
## censor the breakpoint table
censored<-(breakpoint_table[,"chromstart"]>=dropareas[breakpoint_table[,"chrom"],"from"])&
(breakpoint_table[,"chromend"]<=dropareas[breakpoint_table[,"chrom"],"to"])
censoredtoo<-(which(censored)+1)[(which(censored)+1)<=nrow(breakpoint_table)]
censored[censoredtoo]<-((breakpoint_table[censoredtoo,"chrom"]==breakpoint_table[censoredtoo-1,"chrom"])&
(breakpoint_table[censoredtoo,"profid"]==breakpoint_table[censoredtoo-1,"profid"]))|censored[censoredtoo]
## the smear table based on the breakpoint table
smear_table <- cbind(breakpoint_table[, c("profid", "chrom")],
breakpoint_table[,"segstarts"] - smear, breakpoint_table[,"segstarts"] + smear,
sign(breakpoint_table[,"cvals"] - c(0, breakpoint_table[-nrow(breakpoint_table), "cvals"])))
dimnames(smear_table)[[2]]<-c("profid", "chrom", "bpstart", "bpend", "bpsign")
## start and end seg(bin) location of each chromosome
chrstart <- breakpoint_table[match(unique(breakpoint_table[,"chrom"]), breakpoint_table[,"chrom"]), "segstarts"]
chrend <- c((chrstart - 1)[-1], breakpoint_table[nrow(breakpoint_table), "segends"])
## boundaries
if(!keepboundaries){
smear_table <- smear_table[((smear_table[,"bpstart"] + smear) > chrstart[smear_table[,"chrom"]])&!censored,]
smear_table[smear_table[,"bpstart"] < chrstart[smear_table[,"chrom"]], "bpstart"] <-
chrstart[smear_table[smear_table[,"bpstart"] < chrstart[smear_table[,"chrom"]], "chrom"]]
smear_table[smear_table[,"bpend"] > chrend[smear_table[,"chrom"]], "bpend"] <-
chrend[smear_table[smear_table[,"bpend"]>chrend[smear_table[,"chrom"]],"chrom"]]
}
if(keepboundaries){
smear_table[(smear_table[,"bpstart"]+smear)==chrstart[smear_table[,"chrom"]],"bpsign"]<-
2*smear_table[(smear_table[,"bpstart"]+smear)==chrstart[smear_table[,"chrom"]],"bpsign"]
smear_table<-smear_table[!censored,]
}
## X,Y chromosome
if(mask_XY){
chromrange = 1:22
}else{
chromrange = 1:24
}
smear_table <- smear_table[smear_table[, "chrom"]%in%chromrange,]
return(smear_table)
}
#'Select features and generate the incidence table.
#'
#'Select features (called as pins), generate the binary matrix with rows as pins and columns as cells.
#'@param breakpoint_table The breakpoint_table generated by \code{preprocess_segfile}.
#'@param smear_table The smear_table generated by \code{findsmears}.
#'@return A list of three objects. pinmat is the incidence table. pins is the bin location and sign for
#'the selected features (pins).
#'@export
findpins <- function(breakpoint_table, smear_table){
allsigns <- sort(unique(smear_table[, "bpsign"]))
for(vsign in allsigns){
a <- smear_table[smear_table[,"bpsign"] == vsign,]
a <- a[order(a[,"bpend"]),]
apins <- NULL
while(nrow(a) > 0){
apins <- c(apins, a[1, "bpend"])
a <- a[!(a[,"bpstart"] <= apins[length(apins)] & a[,"bpend"] >= apins[length(apins)]),,drop = F]
}
a <- smear_table[smear_table[,"bpsign"] == vsign,]
ci <- containment.indicator(apins, apins, a[order(a[,"bpend"]), "bpstart"], a[order(a[,"bpend"]), "bpend"])
a <- cbind(a[order(a[,"bpend"]),], ci)
dimnames(a)[[2]][(ncol(a)-1):ncol(a)]<-c("startpin","endpin")
apinmat<-matrix(ncol=length(unique(breakpoint_table[,"profid"])),nrow=length(apins)+2,data=0,
dimnames=list(NULL,unique(breakpoint_table[,"profid"])))
for(id in unique(a[,"profid"])){
apinmat[a[a[,"profid"]==id,"startpin"]+1,id]<-1
apinmat[a[a[,"profid"]==id,"endpin"]+2,id]<-apinmat[a[a[,"profid"]==id,"endpin"]+2,id]-1
}
apinmat<-apply(apinmat,2,cumsum)
apinmat<-apinmat[-c(1,nrow(apinmat)),,drop=F]
apins<-cbind(apins,rep(vsign,length(apins)))
dimnames(apins)[[2]]<-c("bin","sign")
if(vsign == min(allsigns)){
pinmat <- apinmat
pins <- apins
}
else{
pinmat <-rbind(pinmat, apinmat)
pins <- rbind(pins, apins)
}
}
pins <- pins[(rowSums(pinmat) < ncol(pinmat)),,drop = F]
pinmat <- pinmat[(rowSums(pinmat) < ncol(pinmat)),,drop = F]
cell_names <- colnames(pinmat)
return(list(pins = pins, pinmat = pinmat, cell_names = cell_names))
}
## tshort --> breakpoint_table
## dtshort --> smear_table