In [1]:
library(mmit)
library(future.apply)

Loading required package: future



In [10]:
# Train and evaluate max-margin interval trees (mmit) for every combination of
# dataset, held-out test fold and input feature set.
#
# For each combination the script:
#   1. splits the data into train / test rows using the `fold` column,
#   2. runs mmit.cv() on the train rows to pick max_depth and min_sample,
#   3. refits a tree on all train rows with the selected parameters,
#   4. writes the test-row predictions (sequenceID, llda) to a CSV file.
#
# Reads:  training_data/<dataset>/{inputs,outputs,folds}.csv
# Writes: one CSV per (dataset, fold, feature-set size) under predictions/

# Hyper-parameter grid searched by mmit.cv(); it does not depend on the
# dataset, fold or feature set, so it is built once.
param_grid <- list(
    margin = 1,
    loss = "square",
    max_depth = c(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, Inf),
    min_sample = c(5, 10, 15, 20, 25, 30)
)

# Candidate feature sets, from smallest to largest.
inputsets <- list(
    c("length"),
    c("length", "variance"),
    c("length", "variance", "range_value", "sum_diff")
)

# Create the output directory up front so that write.csv() cannot fail on a
# missing directory after an expensive cross-validation run.
dir.create("predictions", showWarnings = FALSE, recursive = TRUE)

for (dataset in c("detailed", "systematic", "epigenomic")) {
    # The CSV files only depend on the dataset, so read them once per dataset
    # instead of once per (fold, feature set) combination.
    dataset.dir <- file.path("training_data", dataset)
    inputs.all.df  <- read.csv(file.path(dataset.dir, "inputs.csv"))
    outputs.all.df <- read.csv(file.path(dataset.dir, "outputs.csv"))
    folds.df <- read.csv(file.path(dataset.dir, "folds.csv"))

    # The fold assignment is attached column-wise, which is only meaningful if
    # the three files are row-aligned; fail early when the row counts differ.
    stopifnot(
        nrow(inputs.all.df) == nrow(folds.df),
        nrow(outputs.all.df) == nrow(folds.df)
    )
    inputs.all.df <- cbind(inputs.all.df, folds.df)
    outputs.all.df <- cbind(outputs.all.df, folds.df)

    for (test.fold in c(1, 2, 3, 4, 5, 6)) {
        # Row masks for the held-out fold; everything else is training data.
        is.test.input <- inputs.all.df$fold == test.fold
        is.train.output <- outputs.all.df$fold != test.fold

        # Interval targets for the train rows (independent of the feature set).
        train.outputs.df <- outputs.all.df[is.train.output, ][, c("min.log.lambda", "max.log.lambda")]
        train.outputs.df <- as.matrix(train.outputs.df)

        # Identifiers of the test rows, written next to the predictions.
        sequenceID <- inputs.all.df[is.test.input, 'sequenceID', drop = FALSE]

        for (inputset in inputsets) {
            # get train set and test set; a constant column is appended to the
            # selected features (drop = FALSE keeps a data frame even when a
            # single feature is selected)
            train.inputs.df <- cbind(inputs.all.df[!is.test.input, inputset, drop = FALSE], constant = 1)
            test.inputs.df  <- cbind(inputs.all.df[is.test.input, inputset, drop = FALSE], constant = 1)

            # cv to pick the best params in train set; the seed is reset before
            # every search so each run is reproducible on its own
            set.seed(4)
            result <- mmit.cv(
                feature.mat = train.inputs.df,
                target.mat = train.outputs.df,
                param_grid = param_grid,
                n_folds = 2,
                scorer = mse,
                pruning = TRUE,
                future.seed = TRUE
            )

            # train the model with all train set by best params
            tree <- mmit(feature.mat = train.inputs.df,
                        target.mat = train.outputs.df,
                        max_depth = result$best_params$max_depth,
                        margin = 1,
                        loss = "square",
                        min_sample = result$best_params$min_sample)

            # produce prediction for test set
            pred <- predict.mmit(tree, test.inputs.df)

            # save to csv
            prediction <- cbind(sequenceID, pred)
            names(prediction) <- c("sequenceID", "llda")
            # NOTE(review): with sep = "." the resulting path is
            # "predictions/.<dataset>.<fold>.<n_features>.csv", i.e. a
            # dot-prefixed (hidden on Unix) file. Left unchanged because
            # downstream readers may depend on this exact name - confirm.
            write.csv(prediction, file = paste("predictions/", dataset, test.fold, length(inputset), "csv", sep = "."), row.names = FALSE)
        }
    }
}