In [1]:
# https://github.com/aldro61/mmit

In [2]:
library(mmit)
library(data.table)
library(future.apply)

Loading required package: future



In [3]:
# Experiment configuration ----

# Label used as the leading component of each prediction file name.
category <- "previous"

# Dataset names: the immediate sub-directories of the training-data folder
# (names only, no leading path).
data_list <- list.dirs(
    path       = "../../training_data",
    full.names = FALSE,
    recursive  = FALSE
)

# Candidate values for each hyper-parameter of the grid search.
margin_list     <- c(1, 2)
loss_list       <- c("hinge", "square")
max_depth_list  <- c(5, 10)
min_sample_list <- c(2, 10, 20, 100)

In [4]:
# Nested cross-validation driver ----
#
# For every dataset, every held-out fold and every feature subset:
#   1. tune the tree hyper-parameters with mmit.cv() on the training folds,
#   2. refit a tree on all training rows with the selected hyper-parameters,
#   3. predict the held-out fold and write the predictions to a CSV file.
#
# Reads the globals `data_list`, `category`, `margin_list`, `loss_list`,
# `max_depth_list` and `min_sample_list`.

data_root  <- "../../training_data"
output_dir <- "predictions"

# Feature subsets to evaluate. The number of features in a subset is the last
# component of the output file name.
inputset_list <- list(
    c("count"),
    c("count", "variance"),
    c("count", "variance", "range_value", "sum_diff")
)

# The search grid never changes, so it is built once instead of per iteration.
param_grid <- list(
    margin     = margin_list,
    loss       = loss_list,
    max_depth  = max_depth_list,
    min_sample = min_sample_list
)

# write.csv() does not create missing directories; make sure the target exists
# up front so a run cannot fail only after the first (expensive) model fit.
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

for (dataset in data_list) {
    dataset_dir <- file.path(data_root, dataset)

    # The three files depend only on the dataset, so they are read once here
    # rather than once per (fold, feature subset) combination.
    folds.df <- fread(file.path(dataset_dir, "folds.csv"))

    # cbind() pairs rows by position.
    # NOTE(review): assumes features.csv, target.csv and folds.csv list the
    # sequences in the same order - TODO confirm.
    inputs.all.df  <- cbind(read.csv(file.path(dataset_dir, "features.csv")), folds.df)
    outputs.all.df <- cbind(read.csv(file.path(dataset_dir, "target.csv")), folds.df)

    # Iterate over the fold ids actually present instead of assuming they are
    # exactly 1..n; this is also safe when the folds file has no rows.
    fold_ids <- sort(unique(folds.df[["fold"]]))

    for (test.fold in fold_ids) {
        train_rows <- inputs.all.df$fold != test.fold
        test_rows  <- inputs.all.df$fold == test.fold

        # Interval targets of the training rows as a two-column matrix
        # (lower limit, upper limit); independent of the feature subset.
        train.outputs.df <- as.matrix(
            outputs.all.df[outputs.all.df$fold != test.fold,
                           c("min.log.lambda", "max.log.lambda")]
        )

        # Identifiers of the held-out rows, written next to the predictions.
        sequenceID <- inputs.all.df[test_rows, "sequenceID", drop = FALSE]

        for (inputset in inputset_list) {
            # Selected features plus a constant column, for train and test.
            train.inputs.df <- data.frame(
                cbind(inputs.all.df[train_rows, inputset, drop = FALSE], constant = 1)
            )
            test.inputs.df <- data.frame(
                cbind(inputs.all.df[test_rows, inputset, drop = FALSE], constant = 1)
            )

            # Inner 2-fold CV on the training set to pick the hyper-parameters.
            # The seed is reset before every search, as in the original run.
            set.seed(4)
            result <- mmit.cv(
                feature.mat = train.inputs.df,
                target.mat  = train.outputs.df,
                param_grid  = param_grid,
                n_folds     = 2,
                scorer      = mse,
                pruning     = TRUE,
                future.seed = TRUE
            )

            # Refit on the whole training set with the selected parameters.
            best <- result$best_params
            tree <- mmit(
                feature.mat = train.inputs.df,
                target.mat  = train.outputs.df,
                max_depth   = best$max_depth,
                margin      = best$margin,
                loss        = best$loss,
                min_sample  = best$min_sample
            )

            # Predict the held-out fold.
            pred <- predict.mmit(tree, test.inputs.df)

            # Save as <category>.<dataset>.<fold>.<n features>.csv
            prediction <- cbind(sequenceID, pred)
            names(prediction) <- c("sequenceID", "llda")
            out_name <- paste(category, dataset, test.fold, length(inputset), "csv", sep = ".")
            write.csv(prediction, file = file.path(output_dir, out_name), row.names = FALSE)
        }
    }
}

"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of v