In [13]:
# https://cran.r-project.org/web/packages/penaltyLearning/index.html

In [14]:
# libraries
library(penaltyLearning)
library(data.table)

In [15]:
# Experiment tag, either "previous" or "proposed"; it becomes the first
# component of every prediction file name written by the training loop.
category <- "proposed"

In [16]:
# Number of feature columns used as model inputs: the training loop takes
# columns 2 through n_features + 1 of each features_sorted.csv.
n_features <- 10

In [17]:
# Every immediate subdirectory of the training-data folder is one dataset;
# keep bare folder names only (no parent path, no nested directories).
datasets <- list.dirs("../../training_data", recursive = FALSE, full.names = FALSE)

In [18]:
# Training: for every dataset, fit an unregularized interval regression model
# on each train/test split defined by folds.csv and write the test-fold
# predictions to predictions/<category>.<dataset>.<fold>.<n_features>.csv.
data.dir   <- "../../training_data"
output.dir <- "predictions"

# write.csv() does not create missing directories, so make sure it exists.
dir.create(output.dir, showWarnings = FALSE, recursive = TRUE)

# Feature columns that get log(x) and those that get log(log(x)).
log.cols    <- c("max_diff", "std_deviation", "mean", "variance", "max_value",
                 "range_value", "iqr", "kurtosis", "unique_count")
loglog.cols <- c("sum_diff", "count")

for (dataset in datasets) {
    # read data
    feature.dt <- fread(file.path(data.dir, dataset, "features_sorted.csv"))
    target.dt  <- fread(file.path(data.dir, dataset, "target.csv"))
    folds.dt   <- fread(file.path(data.dir, dataset, "folds.csv"))

    # Apply transformations (in place, by reference)
    feature.dt[, (loglog.cols) := lapply(.SD, function(x) log(log(x))), .SDcols = loglog.cols]
    feature.dt[, (log.cols)    := lapply(.SD, log), .SDcols = log.cols]

    # log() of a negative value gives NaN (with a "NaNs produced" warning);
    # those entries are replaced by 0.
    # NOTE(review): log(0) gives -Inf, which is.nan() does not match, so such
    # values stay in the features -- confirm the model handles them as intended.
    feature.dt[, names(feature.dt) := lapply(.SD, function(x) { x[is.nan(x)] <- 0; x })]

    # Fold labels actually present in folds.csv (taken before filtering).
    # Iterating over the labels avoids assuming they are exactly 1..n.
    fold.ids <- sort(unique(folds.dt$fold))

    # filter censored intervals: drop targets whose interval is (-Inf, Inf)
    target.dt  <- target.dt[!(min.log.lambda == -Inf & max.log.lambda == Inf)]
    feature.dt <- feature.dt[sequenceID %in% target.dt$sequenceID]

    # Inputs are the first n_features columns after column 1
    # (presumably sequenceID is column 1 -- verify against the CSV layout).
    if (ncol(feature.dt) < n_features + 1) {
        stop("dataset '", dataset, "' has ", ncol(feature.dt) - 1,
             " feature columns but n_features = ", n_features, call. = FALSE)
    }
    inputset <- colnames(feature.dt)[2:(n_features + 1)]

    # Align targets and folds to the feature rows by sequenceID rather than by
    # row position, so differently ordered CSV files cannot mispair rows.
    target.row <- match(feature.dt$sequenceID, target.dt$sequenceID)
    fold.row   <- match(feature.dt$sequenceID, folds.dt$sequenceID)
    if (anyNA(fold.row)) {
        stop("folds.csv of dataset '", dataset,
             "' lacks some sequenceIDs found in features_sorted.csv", call. = FALSE)
    }
    fold.vec <- folds.dt$fold[fold.row]

    # Convert once; the per-fold splits below are plain matrix row subsets.
    feature.mat <- as.matrix(feature.dt[, ..inputset])
    target.mat  <- as.matrix(target.dt[target.row, c("min.log.lambda", "max.log.lambda")])

    for (test.fold in fold.ids) {
        # get train and test data
        is.test <- fold.vec == test.fold
        feature.mat.train <- feature.mat[!is.test, , drop = FALSE]
        feature.mat.test  <- feature.mat[is.test, , drop = FALSE]
        target.mat.train  <- target.mat[!is.test, , drop = FALSE]

        # train model
        fit <- IntervalRegressionUnregularized(
            feature.mat = feature.mat.train,
            target.mat = target.mat.train)

        # get prediction from test set
        target.mat.pred <- fit$predict(feature.mat.test)
        colnames(target.mat.pred) <- "llda"

        # save prediction to csv (columns: sequenceID, llda)
        prediction <- data.frame(sequenceID = feature.dt$sequenceID[is.test], target.mat.pred)
        out.name <- paste(category, dataset, test.fold, length(inputset), "csv", sep = ".")
        write.csv(prediction, file = file.path(output.dir, out.name), row.names = FALSE)
    }
}

"NaNs produced"
"NaNs produced"
"NaNs produced"
"NaNs produced"
"NaNs produced"
"NaNs produced"
"NaNs produced"
