In [9]:
# https://cran.r-project.org/web/packages/penaltyLearning/index.html

In [10]:
# libraries
library(penaltyLearning)
library(data.table)

In [11]:
# Label for this run; it becomes the first component of every prediction
# file name. Expected values: "previous" or "proposed".
category <- "previous"

In [12]:
# Feature engineering transforms, each mapping a numeric vector to a
# numeric vector of the same length.

# f0: identity, returns the input unchanged.
f0 <- function(x) {
  x
}

# f1: natural logarithm.
f1 <- function(x) {
  log(x)
}

# f2: natural logarithm applied twice.
f2 <- function(x) {
  log(log(x))
}

In [13]:
# dataset <- 'ATAC_JV_adipose'
# inputset <- c("max_diff", "sum_diff", "range_value")
# fe <- list(f0, f2, f1)

In [14]:
# dataset <- 'detailed'
# inputset <- c("range_value", "variance", "autocorr", "sum_diff", "iqr", "unique_count", "mean_diff", "max_diff", "percentile_75")
# fe <- list(f1, f1, f0, f2, f1, f1, f1, f1, f0)

In [15]:
# Active configuration: dataset directory name, the feature columns to use,
# and one transform per feature (fe[[i]] is applied to inputset[i]).
dataset <- "H3K27ac-H3K4me3_TDHAM_BP"
inputset <- c(
  "mean",
  "variance",
  "range_value",
  "unique_count"
)
fe <- list(
  f1,
  f1,
  f1,
  f1
)

In [16]:
# dataset <- 'systematic'
# inputset <- c("range_value", "variance", "autocorr", "sum_diff", "iqr", "unique_count", "mean_diff", "max_diff")
# fe <- list(f1, f1, f0, f2, f1, f1, f1, f1)

In [17]:
# Cross-validated interval regression.
#
# Reads features.csv, target.csv and folds.csv for `dataset`, drops the
# sequences whose target interval is (-Inf, Inf), applies the per-feature
# transforms in `fe` to the columns named in `inputset`, then for every fold
# trains IntervalRegressionUnregularized on the remaining folds and writes the
# predictions for the held-out fold to
#   predictions/<category>.<dataset>.<fold>.<number of features>.csv

# read data
data.dir   <- file.path("../../training_data", dataset)
feature.dt <- fread(file.path(data.dir, "features.csv"))
target.dt  <- fread(file.path(data.dir, "target.csv"))
folds.dt   <- fread(file.path(data.dir, "folds.csv"))

# exactly one transform per selected feature; without this check a length
# mismatch would be silently recycled when the transforms are applied
stopifnot(length(inputset) == length(fe))

# fold IDs present in the folds file (collected before filtering, like the
# original fold count); iterating over the real IDs avoids assuming that the
# folds are labelled 1..n
fold.ids <- sort(unique(folds.dt[["fold"]]))

# filter censored intervals
target.dt  <- target.dt[!(min.log.lambda == -Inf & max.log.lambda == Inf)]
feature.dt <- feature.dt[sequenceID %in% target.dt$sequenceID]
folds.dt   <- folds.dt[sequenceID %in% target.dt$sequenceID]

# the tables are combined column-wise below, so they must describe the same
# sequences in the same order; fail loudly instead of training on
# mismatched feature/target rows
aligned <- nrow(feature.dt) == nrow(target.dt) &&
    nrow(folds.dt) == nrow(target.dt) &&
    all(feature.dt$sequenceID == target.dt$sequenceID) &&
    all(folds.dt$sequenceID == target.dt$sequenceID)
if (!isTRUE(aligned)) {
    stop("features.csv, target.csv and folds.csv must list the same sequenceIDs in the same order",
         call. = FALSE)
}

# combine
feature.dt.all <- cbind(feature.dt[, c("sequenceID", ..inputset)], folds.dt[, "fold"])
target.dt.all  <- cbind(target.dt, folds.dt[, "fold"])

# apply feature engineering: fe[[i]] transforms column inputset[i]
feature.dt.all[, (inputset) := Map(function(col, func) func(feature.dt.all[[col]]), inputset, fe)]

# make sure the output directory exists before any model is trained
dir.create("predictions", showWarnings = FALSE, recursive = TRUE)

for (test.fold in fold.ids) {
    # row masks for this split (features and targets share the same fold column)
    is.train <- feature.dt.all$fold != test.fold
    is.test  <- feature.dt.all$fold == test.fold

    # get train and test data
    feature.mat.train <- as.matrix(feature.dt.all[is.train, ..inputset])
    feature.mat.test  <- as.matrix(feature.dt.all[is.test, ..inputset])
    target.mat.train  <- as.matrix(target.dt.all[is.train, c("min.log.lambda", "max.log.lambda")])

    # train model
    fit <- IntervalRegressionUnregularized(
        feature.mat = feature.mat.train,
        target.mat = target.mat.train)

    # get prediction from test set
    target.mat.pred <- fit$predict(feature.mat.test)
    colnames(target.mat.pred) <- "llda"

    # save prediction to csv
    prediction <- data.frame(as.matrix(feature.dt.all[is.test, "sequenceID"]), target.mat.pred)
    out.name <- paste(category, dataset, test.fold, length(inputset), "csv", sep = ".")
    write.csv(prediction, file = file.path("predictions", out.name), row.names = FALSE)
}