In [1]:
# Source repository of the mmit package used below: https://github.com/aldro61/mmit

In [2]:
library(mmit)
library(data.table)
library(future.apply)

Loading required package: future



In [3]:
# Each immediate sub-directory of ../../data is treated as one dataset;
# only the bare directory names are kept (no leading path, no recursion).
datasets <- list.dirs("../../data", recursive = FALSE, full.names = FALSE)

In [4]:
# Candidate hyperparameter values; these vectors are later combined into the
# parameter grid handed to mmit.cv().
margin_list <- 0
loss_list <- "square"
max_depth_list <- c(
    1, 2, 3, 5, 7, 10,
    20, 50, 100, 200, 500, 1000
)
min_sample_list <- c(
    2, 5, 10, 30,
    50, 100, 300, 500
)

In [5]:
# Nested cross-validation over (at most) the first four datasets.
# For every dataset and every outer test fold listed in folds.csv:
#   1. split features/targets into train and test rows by fold id,
#   2. pick hyperparameters with mmit.cv() on the training rows only,
#   3. refit mmit() on all training rows with the selected hyperparameters,
#   4. write the test-fold predictions to predictions/<dataset>.<fold>.csv.

# The grid does not depend on the dataset or fold, so it is built once.
param_grid <- list(
    margin     = margin_list,
    loss       = loss_list,
    max_depth  = max_depth_list,
    min_sample = min_sample_list
)

# write.csv() does not create missing directories.
dir.create("predictions", showWarnings = FALSE, recursive = TRUE)

# head() instead of datasets[1:4]: no NA entries when fewer than 4 datasets exist.
for (dataset in head(datasets, 4)) {
    dataset_dir <- file.path("../../data", dataset)

    folds.df       <- fread(file.path(dataset_dir, "folds.csv"))
    inputs.all.df  <- read.csv(file.path(dataset_dir, "features.csv"))
    outputs.all.df <- read.csv(file.path(dataset_dir, "targets.csv"))

    # Keep the fold assignment as a plain vector instead of cbind()-ing it onto
    # the feature/target frames. The previous version re-attached folds.df on
    # every test-fold iteration, which piled up duplicate "fold" columns and
    # would have leaked any additional folds.csv column into the features.
    fold_ids <- folds.df[["fold"]]
    stopifnot(
        !is.null(fold_ids),
        !anyNA(fold_ids),
        length(fold_ids) == nrow(inputs.all.df),
        length(fold_ids) == nrow(outputs.all.df)
    )

    # Iterate over the fold labels actually present rather than assuming
    # they are exactly 1..n_folds.
    for (test.fold in sort(unique(fold_ids))) {
        is_test <- fold_ids == test.fold

        # get train set and test set; a constant column of ones is appended to
        # the features (kept from the original pipeline -- NOTE(review):
        # presumably needed by mmit, confirm)
        train.inputs.df <- cbind(inputs.all.df[!is_test, , drop = FALSE], constant = 1)
        test.inputs.df  <- cbind(inputs.all.df[is_test, , drop = FALSE], constant = 1)

        train.inputs.df <- data.frame(train.inputs.df)
        test.inputs.df  <- data.frame(test.inputs.df)

        train.outputs.df <- as.matrix(outputs.all.df[!is_test, , drop = FALSE])

        # cv to pick the best params in train set; the seed is reset before
        # every search so each dataset/fold run is reproducible on its own
        set.seed(4)
        result <- mmit.cv(
            feature.mat = train.inputs.df,
            target.mat = train.outputs.df,
            param_grid = param_grid,
            n_folds = 2,
            scorer = mse,
            pruning = TRUE,
            future.seed = TRUE
        )

        # train the model with all train set by best params
        tree <- mmit(feature.mat = train.inputs.df,
                    target.mat   = train.outputs.df,
                    max_depth    = result$best_params$max_depth,
                    margin       = result$best_params$margin,
                    loss         = result$best_params$loss,
                    min_sample   = result$best_params$min_sample)

        # produce prediction for test set
        target.mat.pred <- predict.mmit(tree, test.inputs.df)

        # Convert to data frame and set column name
        prediction <- data.frame(pred = target.mat.pred)

        # Save prediction to CSV
        output_file <- file.path("predictions", paste0(dataset, ".", test.fold, ".csv"))
        write.csv(prediction, file = output_file, row.names = FALSE)
    }
}

"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of vector length (arg 2)"
"number of columns of result is not a multiple of v