-
Notifications
You must be signed in to change notification settings - Fork 0
/
LOPART-figure-label-errors-data.R
113 lines (89 loc) · 3.8 KB
/
LOPART-figure-label-errors-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# need to run Download-All-H3K-Data.R first
# Load in all data sets
library(data.table)
xmaxJumpRule <- function (seg.dt, count.dt){
  # Prune a segmentation to alternating peak/background regions using the
  # max-jump rule: group consecutive changepoints whose mean changes have the
  # same sign, and within each group keep only the changepoint with the
  # largest absolute jump.
  #
  # seg.dt: data.table of segments with columns `mean` and `end` (row index
  #   into count.dt). Modified by reference: helper columns diff/sign/
  #   new.group/group.i are added.
  # count.dt: data.table of count rows with chromStart/chromEnd columns,
  #   indexed by seg.dt$end.
  # Returns a data.table with columns mean, chromStart, chromEnd, status
  #   ("peak"/"background"); empty (zero rows) when there are no changepoints.
  seg.dt[, diff := c(diff(mean), NA)]              # jump to the next segment (NA for last)
  seg.dt[, sign := sign(diff)]
  seg.dt[, new.group := c(TRUE, diff(sign) != 0)]  # TRUE at the start of each same-direction run
  seg.dt[, group.i := cumsum(new.group)]
  # one row per run: the changepoint with the largest absolute jump
  group.dt <- seg.dt[, .SD[which.max(abs(diff)), .(end, diff, sign, mean)], by=group.i]
  chromEnd.at.change <- count.dt$chromEnd[group.dt$end]
  if (nrow(group.dt) > 0){
    group.dt[, .(
      # NOTE(review): trailing mean is hard-coded to 1 — confirm intended
      mean = c(mean, 1),
      chromStart = c(count.dt$chromStart[1], chromEnd.at.change),
      chromEnd = c(chromEnd.at.change, count.dt[.N, chromEnd]),
      # alternate labels starting from the direction of the first jump;
      # spell out length.out (original used partial match "l=")
      status = rep(if (sign[1] == 1) c("background", "peak") else c("peak", "background"),
                   length.out = .N + 1)
    )]
  } else {
    # no changepoints: return an empty result with the same column schema
    data.table(mean=numeric(), chromStart=numeric(),
               chromEnd=numeric(), status=character())
  }
}
# Load the pre-downloaded H3K data sets (see header: run the download
# script first).
# NOTE(review): file name case "H3k" differs from the "H3K" used elsewhere
# in this script — confirm it matches the file on disk (case-sensitive
# file systems will fail here otherwise).
source("Load-All-H3k-Data.R")
H3K_data <- loadH3KData()
# Candidate penalties on a log10 grid from 1e-5 to 1e6 (half-decade steps).
penalties <- 10^seq(-5, 6, by=0.5)
# Number of cross-validation folds per sample.
n.fold <- 2
model <- "LOPART"
sets <- list("train","test")
# Directory where per-sample error CSVs are cached.
cache.prefix <- "LOPART-figure-label-errors-data"
# For every dataset whose labels contain no "peaks" annotation, run LOPART
# over a grid of penalties with 2-fold cross-validation and record label
# errors (fp/fn) per sample/penalty/fold/set, caching one CSV per sample.
for(dataset in seq_along(H3K_data$count)){  # seq_along is safe when the list is empty
  print(dataset)
  one_count <- H3K_data$count[[dataset]]
  one_label <- H3K_data$labels[[dataset]]
  sample_split_count <- split(one_count, one_count$sample.id)
  sample_split_label <- split(one_label, one_label$sample.id)
  if (!('peaks' %in% one_label$annotation)){
    for (sample.id in names(sample_split_count)){
      sample.err.list <- list()
      cache.save <- paste0(dataset, "-", sample.id, ".csv")
      cache.file <- file.path(cache.prefix, cache.save)
      one_sample_count <- sample_split_count[[sample.id]]
      if(!file.exists(cache.file)){  # skip samples already computed and cached
        print(sample.id)
        one_sample_label <- sample_split_label[[sample.id]]
        # for each penalty
        for(pen in penalties){
          # for each fold
          for(fold in 1:n.fold){
            setDT(one_sample_label)
            # assign each label to train/test using its pre-computed
            # random.fold column: the current fold is held out as test
            one_sample_label[, set := ifelse(fold == random.fold, "test", "train")]
            one_sample_label[, changes := ifelse(annotation == "noPeaks", 0, 1)]
            flopart_data <- FLOPART::FLOPART_data(one_sample_count, one_sample_label)
            lopart_count <- flopart_data$coverage_dt
            lopart_labels <- flopart_data$label_dt[order(chromStart)]
            one_sample_label <- one_sample_label[order(chromStart)]
            # convert FLOPART's 0-based row indices to 1-based for LOPART
            one_sample_label[, start := lopart_labels$firstRow + 1]
            one_sample_label[, end := lopart_labels$lastRow + 1]
            fit <- LOPART::POISSON_LOPART(lopart_count$count,
              lopart_count$weight, one_sample_label[set == "train"], pen)
            # fix: the max-jump helper is defined as xmaxJumpRule above
            # (original called undefined maxJumpRule)
            seg.dt <- xmaxJumpRule(data.table(fit$segments), lopart_count)
            pkg.segs <- seg.dt[, .(chromStart, chromEnd, mean, status)]
            pkg.peaks <- pkg.segs[status=="peak"]
            for (set.i in sets){
              err.dt <- PeakError::PeakErrorChrom(pkg.peaks, one_sample_label[set == set.i])
              sample.err.list[[paste(dataset, sample.id, pen, fold, model, set.i)]] <- data.table(
                dataset,
                sample.id,
                pen,
                fold,
                model,
                set.i,
                possible.fp = sum(err.dt$possible.fp),
                fp = sum(err.dt$fp),
                # NOTE(review): possible.fn is taken from possible.tp
                # (possible true positives = labels where a fn can occur);
                # confirm against PeakError's column semantics
                possible.fn = sum(err.dt$possible.tp),
                fn = sum(err.dt$fn),
                labels = nrow(one_sample_label[set == set.i])
              )
            }
          }
        }
        # rbindlist handles the empty-list case (yields a 0-row table,
        # whereas do.call(rbind, list()) yields NULL and fwrite would error)
        sample.err.dt <- data.table::rbindlist(sample.err.list)
        dir.create(dirname(cache.file), showWarnings = FALSE, recursive = TRUE)
        data.table::fwrite(sample.err.dt, cache.file)
        gc()
      }
    }
  }
}