# Setup

In [1]:
# Get VM CPU and R version
shell_call <- function(command, ...) {
  result <- system(command, intern = TRUE, ...)
  cat(paste0(result, collapse = "\n"))
}
shell_call("grep -m1 'model name' /proc/cpuinfo | awk -F': ' '{printf \" CPU Model: %s \\n \",  $2}'")
shell_call("grep 'cpu cores' /proc/cpuinfo  | awk -F': ' '{a[cores]+=$2}END{printf \"CPU Cores: %s \\n \", a[cores] }'")
shell_call("grep MemTotal /proc/meminfo | awk '{printf \"RAM: %.1fGB \\n \", $2 / 1024 / 1024}'")
shell_call("R --version | head -n 1")

 CPU Model: Intel(R) Xeon(R) CPU @ 2.20GHz 
 CPU Cores: 72 
 RAM: 83.5GB 
 R version 4.4.1 (2024-06-14) -- "Race for Your Life"

In [2]:
# Get GPU Info
shell_call("nvidia-smi")

Tue Aug 27 03:11:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              44W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Install Torch with valid CUDA version

options(timeout = 600) # increasing timeout is recommended since we will be downloading a 2GB file.
# For Windows and Linux: "cpu", "cu117", "cu118" are the only currently supported
# For MacOS the supported are: "cpu-intel" or "cpu-m1"
kind <- "cu118"
version <- available.packages()["torch","Version"]
options(repos = c(
  torch = sprintf("https://torch-cdn.mlverse.org/packages/%s/%s/", kind, version),
  CRAN = "https://cloud.r-project.org" # or any other from which you want to install the other R dependencies.
))

install.packages("torch")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘coro’, ‘safetensors’




In [4]:
# Test Torch installation

library(torch)
torch_rand(4)

torch_tensor
 0.2153
 0.1182
 0.1141
 0.2897
[ CPUFloatType{4} ]

In [5]:
# Install BKTR

install.packages('BKTR')

### From Github (Latest Version)
# install.packages("devtools") # if not installed
# devtools::install_github("julien-hec/BKTR", ref = "main")

# For section 4 side by side plots
# install.packages('ggpubr')

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘collections’, ‘png’, ‘plyr’, ‘jpeg’, ‘bitops’, ‘R6P’, ‘ggmap’




In [6]:
# The following block is because Jupyter for R does not print until the end of
# the code block execution. So, with this command we add real time printing.
# See: https://stackoverflow.com/questions/37689694/real-time-printing-to-console-with-r-in-jupyter
trace(what = "print", where = getNamespace("base"), exit = flush.console, print = FALSE)

Tracing function "print" in package "namespace:base"



# Load Libraries

In [7]:
# Code to run BKTR examples #
# Lanthier, Lei, Sun and Labbe 2023 #

library('BKTR')
library(data.table)
library(ggplot2)
# library('ggpubr')

# Run BKTR

## Results by lenghtscale and dataset size

In [29]:

##########################
### 5.3: Interpolation ###
##########################

# Part for Table 6, influence of lengthscale values and dataset size
nb_aside_locs <- 4
nb_aside_times <- 6

# Set seed and calculation params
TSR$set_params(seed = 2, fp_type = 'float32', fp_device = 'cuda')
res_colnames <- c(
    'Dataset_Type', 'Lengthscale', 'Iter', 'B_MAE',
    'B_RMSE', 'Y_MAE', 'Y_RMSE', 'Time'
)
nb_res_cols <- length(res_colnames)
res_vals <- c()

# Run simulation for different lengthscale and dataset size
for (ds_type in c('Smaller', 'Larger')) {
  for (len_scale in c(3, 6)) {
    for (i in 1:10) {
      # Adding some progression debug prints
      debug_msg <- sprintf(
        'DS Type: %s, Lengthscale: %d, Iter: %02d',
        ds_type, len_scale, i
      )
      print(debug_msg)

      matern_lengthscale <- KernelParameter$new(value = len_scale)
      se_lengthscale <- KernelParameter$new(value = len_scale)
      spatial_kernel <- KernelMatern$new(
        lengthscale = matern_lengthscale, smoothness_factor = 5)
      temporal_kernel <- KernelSE$new(lengthscale = se_lengthscale)

      is_small_ds <- ds_type == 'Smaller'
      nb_locs <- ifelse(is_small_ds, 20, 100)
      nb_times <- ifelse(is_small_ds, 30, 150)
      spa_cov_means <- if (is_small_ds) c(0, 2) else c(0, 2, 4)
      tem_cov_means <- if (is_small_ds) c(1) else c(1, 3)

      simu_data <- simulate_spatiotemporal_data(
        nb_locations = nb_locs,
        nb_time_points = nb_times,
        nb_spatial_dimensions = 2,
        spatial_scale = 10,
        time_scale = 10,
        spatial_covariates_means = spa_cov_means,
        temporal_covariates_means = tem_cov_means,
        spatial_kernel = spatial_kernel,
        temporal_kernel = temporal_kernel,
        noise_variance_scale = 1
      )

      # Set some values aside for M_new locs and N_new times
      obs_nb_locs <- nb_locs - nb_aside_locs
      obs_nb_times <- nb_times - nb_aside_times

      data_df <- simu_data$data_df
      spatial_pos_df <- simu_data$spatial_positions_df
      temporal_pos_df <- simu_data$temporal_positions_df

      all_locs <- spatial_pos_df$location
      all_times <- temporal_pos_df$time

      locs_indx_sample <- TSR$rand_choice(
        TSR$tensor(1:length(all_locs)), obs_nb_locs)
      obs_locs <- all_locs[as.numeric(locs_indx_sample$cpu())]
      new_locs <- setdiff(all_locs, obs_locs)

      times_indx_sample <- TSR$rand_choice(
        TSR$tensor(1:length(all_times)), obs_nb_times)
      obs_times <- all_times[as.numeric(times_indx_sample$cpu())]
      new_times <- setdiff(all_times, obs_times)

      obs_data_df <- data_df[
        data_df[, .I[location %in% obs_locs & time %in% obs_times]], ]
      obs_spatial_pos_df <- spatial_pos_df[
        spatial_pos_df[, .I[location %in% obs_locs]], ]
      obs_temporal_pos_df <- temporal_pos_df[
        temporal_pos_df[, .I[time %in% obs_times]], ]

      new_data_df <- data_df[
        data_df[, .I[location %in% new_locs | time %in% new_times]], ]
      new_spatial_positions_df <- spatial_pos_df[
        spatial_pos_df[, .I[location %in% new_locs]], ]
      new_temporal_positions_df <- temporal_pos_df[
        temporal_pos_df[, .I[time %in% new_times]], ]

      # Run mcmc sampling
      bktr_regressor <- BKTRRegressor$new(
        data_df = obs_data_df,
        rank_decomp = 10,
        burn_in_iter = 500,
        sampling_iter = 500,
        spatial_kernel = KernelMatern$new(smoothness_factor = 5),
        spatial_positions_df = obs_spatial_pos_df,
        temporal_kernel = KernelSE$new(),
        temporal_positions_df = obs_temporal_pos_df,
        has_geo_coords = FALSE
      )
      # Hide output of sampling because its volume creates notebook errors
      .unused_out <- capture.output(bktr_regressor$mcmc_sampling())

      # Run interpolation
      preds <- bktr_regressor$predict(
        new_data_df,
        new_spatial_positions_df,
        new_temporal_positions_df
      )

      # Align both datasets
      sim_data_df <- simu_data$data_df
      pred_y_df <- preds$new_y_df
      beta_data_df <- simu_data$beta_df
      beta_pred_df <- preds$new_beta_df
      setkey(beta_pred_df, location, time)
      sim_y_df <- sim_data_df[
        sim_data_df[, .I[location %in% new_locs | time %in% new_times]],
        c('location', 'time', 'y')
      ]
      setorderv(pred_y_df, c('location', 'time'))
      setorderv(sim_y_df, c('location', 'time'))

      # Calc Errors
      preds_y_err <- (
        sim_data_df[
          sim_data_df[, .I[location %in% new_locs | time %in% new_times]],
          'y']
        - pred_y_df[
            pred_y_df[, .I[location %in% new_locs | time %in% new_times]],
            'y_est']
      )
      preds_y_err <- unlist(preds_y_err)
      preds_beta_err <- (
        beta_data_df[
          beta_data_df[, .I[location %in% new_locs | time %in% new_times]],
          -c('location', 'time')]
        - beta_pred_df[
            beta_pred_df[, .I[location %in% new_locs | time %in% new_times]],
            -c('location', 'time')]
      )
      preds_beta_err <- unlist(preds_beta_err)

      y_rmse <- sqrt(mean(preds_y_err^2))
      y_mae <- mean(abs(preds_y_err))
      beta_rmse <- sqrt(mean(preds_beta_err^2))
      beta_mae <- mean(abs(preds_beta_err))

      # Formatting Values
      res_vals <- c(
        res_vals,
        ds_type,
        len_scale,
        sprintf('%04d', i),
        sprintf('%.4f', beta_mae),
        sprintf('%.4f', beta_rmse),
        sprintf('%.4f', y_mae),
        sprintf('%.4f', y_rmse),
        sprintf('%.3f', as.numeric(
          bktr_regressor$result_logger$total_elapsed_time,units="secs"
        ))
      )
    }
  }
}
df <- as.data.table(matrix(res_vals, ncol = nb_res_cols, byrow = TRUE))
colnames(df) <- res_colnames


[1] "DS Type: Smaller, Lengthscale: 3, Iter: 01"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 02"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 03"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 04"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 05"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 06"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 07"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 08"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 09"
[1] "DS Type: Smaller, Lengthscale: 3, Iter: 10"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 01"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 02"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 03"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 04"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 05"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 06"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 07"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 08"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 09"
[1] "DS Type: Smaller, Lengthscale: 6, Iter: 10"
[1] "DS Type: Larger

In [30]:
# Show raw data frame
print(df)

    Dataset_Type Lengthscale   Iter  B_MAE B_RMSE  Y_MAE  Y_RMSE    Time
          <char>      <char> <char> <char> <char> <char>  <char>  <char>
 1:      Smaller           3   0001 0.6558 0.8393 1.1326  1.4576 127.420
 2:      Smaller           3   0002 0.9822 1.3164 2.3322  3.0759 132.336
 3:      Smaller           3   0003 1.1527 1.5496 2.1480  3.0118 135.496
 4:      Smaller           3   0004 1.2915 1.6556 2.0453  2.7797 139.795
 5:      Smaller           3   0005 0.7351 1.1669 2.7534  4.8024 143.883
 6:      Smaller           3   0006 0.9118 1.3825 1.7165  2.1899 128.481
 7:      Smaller           3   0007 0.6255 0.8287 1.1592  1.4429 126.513
 8:      Smaller           3   0008 0.6209 0.9058 1.3572  1.9350 133.246
 9:      Smaller           3   0009 0.5565 0.7159 1.2996  1.6955 132.776
10:      Smaller           3   0010 1.0308 1.6259 1.6270  2.1134 133.928
11:      Smaller           6   0001 0.4101 0.5640 1.0424  1.2888 127.134
12:      Smaller           6   0002 0.8078 1.3730 2

In [31]:

# Aggregate results (Table 6)
mean_fmt <- function(x) sprintf('%.4f', mean(x))
sd_fmt <- function(x) sprintf('%.4f', sd(x))
df <- df[, lapply(.SD, as.numeric), by=list(Dataset_Type, Lengthscale)]
df <- df[, .(
  B_MAE_avg = mean_fmt(B_MAE),
  B_MAE_sd = sd_fmt(B_MAE),
  B_RMSE_avg = mean_fmt(B_RMSE),
  B_RMSE_sd = sd_fmt(B_RMSE),
  Y_MAE_avg = mean_fmt(Y_MAE),
  Y_MAE_sd = sd_fmt(Y_MAE),
  Y_RMSE_avg = mean_fmt(Y_RMSE),
  Y_RMSE_sd = sd_fmt(Y_RMSE),
  Time_avg = mean_fmt(Time),
  Time_sd = sd_fmt(Time)
), by=list(Dataset_Type, Lengthscale)]
setkey(df, Dataset_Type, Lengthscale)
print(df)


Key: <Dataset_Type, Lengthscale>
   Dataset_Type Lengthscale B_MAE_avg B_MAE_sd B_RMSE_avg B_RMSE_sd Y_MAE_avg
         <char>      <char>    <char>   <char>     <char>    <char>    <char>
1:       Larger           3    1.1419   0.6817     1.9719    1.6451    2.4393
2:       Larger           6    0.2288   0.0462     0.3582    0.0792    0.9248
3:      Smaller           3    0.8563   0.2539     1.1987    0.3571    1.7571
4:      Smaller           6    0.5308   0.1364     0.7528    0.2561    1.2309
   Y_MAE_sd Y_RMSE_avg Y_RMSE_sd Time_avg Time_sd
     <char>     <char>    <char>   <char>  <char>
1:   1.5801     3.9950    3.2439 213.7664  4.5943
2:   0.0776     1.1678    0.1020 193.6204  5.9958
3:   0.5472     2.4504    1.0200 133.3874  5.4201
4:   0.4952     1.6830    0.9607 123.8388  5.3432


In [32]:
# Use 2 decimal places for the results
fmt_2dec <- function(x) {
  sprintf('%.2f', as.numeric(x))
}
res_df <- df[, lapply(.SD, fmt_2dec), by = list(Dataset_Type, Lengthscale)]
res_df$DSType_ID <- ifelse(res_df$Dataset_Type=='Smaller', 1, 2)

# Format in B_mae(avg±sd)/B_rmse(avg±sd) Y_mae(avg±sd)/Y_rmse(avg±sd)
res_df <- res_df[, .(
  B_res = paste(B_MAE_avg, '±', B_MAE_sd, '/', B_RMSE_avg, '±', B_RMSE_sd),
  Y_res = paste(Y_MAE_avg, '±', Y_MAE_sd, '/', Y_RMSE_avg, '±', Y_RMSE_sd),
  Time_res = paste(Time_avg, '±', Time_sd)
), by = list(DSType_ID, Dataset_Type, Lengthscale)]
setkey(res_df, DSType_ID, Lengthscale)
res_df

DSType_ID,Dataset_Type,Lengthscale,B_res,Y_res,Time_res
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Smaller,3,0.86 ± 0.25 / 1.20 ± 0.36,1.76 ± 0.55 / 2.45 ± 1.02,133.39 ± 5.42
1,Smaller,6,0.53 ± 0.14 / 0.75 ± 0.26,1.23 ± 0.50 / 1.68 ± 0.96,123.84 ± 5.34
2,Larger,3,1.14 ± 0.68 / 1.97 ± 1.65,2.44 ± 1.58 / 4.00 ± 3.24,213.77 ± 4.59
2,Larger,6,0.23 ± 0.05 / 0.36 ± 0.08,0.92 ± 0.08 / 1.17 ± 0.10,193.62 ± 6.00


## Results by interpolated segments

In [35]:

# Part for Table 7, Interpolation results by interpolated segments
# Functions for formatting purposes
get_df_errors <- function(
  l_df, l_col, r_df, r_col, loc_subset, time_subset, use_and = TRUE
) {
  if (use_and) {
    l_indx <- which(
      l_df[, location] %in% loc_subset & l_df[, time] %in% time_subset)
    r_indx <- which(
      r_df[, location] %in% loc_subset & r_df[, time] %in% time_subset)
  } else {
    l_indx <- which(
      l_df[, location] %in% loc_subset | l_df[, time] %in% time_subset)
    r_indx <- which(
      r_df[, location] %in% loc_subset | r_df[, time] %in% time_subset)
  }
  err <- unlist(l_df[l_indx, ..l_col] - r_df[r_indx, ..r_col])
  return(c(mean(abs(err)), sqrt(mean(err ** 2))))
}

get_all_errors <- function(
    segment_name,
    i,
    sim_data_df,
    pred_y_df,
    beta_data_df,
    beta_pred_df,
    loc_subset,
    time_subset,
    use_and = TRUE
) {
  y_err <- get_df_errors(sim_data_df, 'y', pred_y_df, 'y_est',
                         loc_subset, time_subset, use_and = use_and)
  beta_colnames <- setdiff(colnames(beta_pred_df), c('location', 'time'))
  beta_err <- get_df_errors(beta_data_df, beta_colnames, beta_pred_df,
      beta_colnames, loc_subset, time_subset, use_and=use_and)

  return(c(
    segment_name,
    sprintf('%04d', i),
    sprintf('%.4f', beta_err[1]),
    sprintf('%.4f', beta_err[2]),
    sprintf('%.4f', y_err[1]),
    sprintf('%.4f', y_err[2])
  ))
}

# Simulation Params
nb_aside_locs <- 10
nb_aside_times <- 20
nb_locs <- 100
nb_times <- 150
len_scale <- 6
spa_cov_means <- c(0, 2, 4)
tem_cov_means <- c(1, 3)

# Set seed and calculation params
TSR$set_params(seed = 1, fp_type = 'float32', fp_device = 'cuda')
res_colnames <- c(
  'Interpol_Segment', 'Iter', 'B_MAE',
  'B_RMSE', 'Y_MAE', 'Y_RMSE'
)
nb_res_cols <- length(res_colnames)
res_vals <- c()

# Run Large simulation for multiple iterations
for (i in 1:10) {
  print(sprintf('Simulation: %02d', i))
  matern_lengthscale <- KernelParameter$new(value = len_scale)
  se_lengthscale <- KernelParameter$new(value = len_scale)
  spatial_kernel <- KernelMatern$new(lengthscale = matern_lengthscale)
  temporal_kernel <- KernelSE$new(lengthscale = se_lengthscale)

  simu_data <- simulate_spatiotemporal_data(
    nb_locations = nb_locs,
    nb_time_points = nb_times,
    nb_spatial_dimensions = 2,
    spatial_scale = 10,
    time_scale = 10,
    spatial_covariates_means = spa_cov_means,
    temporal_covariates_means = tem_cov_means,
    spatial_kernel = spatial_kernel,
    temporal_kernel = temporal_kernel,
    noise_variance_scale = 1
  )

  # Set some values aside for M_new locs and N_new times
  obs_nb_locs <- nb_locs - nb_aside_locs
  obs_nb_times <- nb_times - nb_aside_times

  data_df <- simu_data$data_df
  spatial_pos_df <- simu_data$spatial_positions_df
  temporal_pos_df <- simu_data$temporal_positions_df

  all_locs <- spatial_pos_df$location
  all_times <- temporal_pos_df$time
  obs_locs <- sample(all_locs, obs_nb_locs)
  new_locs <- setdiff(all_locs, obs_locs)
  obs_times <- sample(all_times, obs_nb_times)
  new_times <- setdiff(all_times, obs_times)

  obs_data_df <- data_df[
    data_df[, .I[location %in% obs_locs & time %in% obs_times]], ]
  obs_spatial_pos_df <- spatial_pos_df[
    spatial_pos_df[, .I[location %in% obs_locs]], ]
  obs_temporal_pos_df <- temporal_pos_df[
    temporal_pos_df[, .I[time %in% obs_times]], ]

  new_data_df <- data_df[
    data_df[, .I[location %in% new_locs | time %in% new_times]], ]
  new_spatial_positions_df <- spatial_pos_df[
    spatial_pos_df[, .I[location %in% new_locs]], ]
  new_temporal_positions_df <- temporal_pos_df[
    temporal_pos_df[, .I[time %in% new_times]], ]

  # Run mcmc sampling
  bktr_regressor <- BKTRRegressor$new(
    data_df = obs_data_df,
    spatial_kernel = KernelMatern$new(),
    spatial_positions_df = obs_spatial_pos_df,
    temporal_kernel = KernelSE$new(),
    temporal_positions_df = obs_temporal_pos_df,
    burn_in_iter = 1000,
    sampling_iter = 500,
    has_geo_coords = FALSE
  )
  # Hide output of sampling because its volume creates notebook errors
  .unused_out <- capture.output(bktr_regressor$mcmc_sampling())

  # Run interpolation
  preds <- bktr_regressor$predict(
    new_data_df,
    new_spatial_positions_df,
    new_temporal_positions_df
  )

  # Align both datasets
  sim_data_df <- simu_data$data_df
  pred_y_df <- preds$new_y_df
  beta_data_df <- simu_data$beta_df
  beta_pred_df <- preds$new_beta_df
  setkey(beta_pred_df, location, time)
  sim_y_df <- sim_data_df[
    sim_data_df[, .I[location %in% new_locs | time %in% new_times]],
    c('location', 'time', 'y')
  ]
  setorderv(pred_y_df, c('location', 'time'))
  setorderv(sim_y_df, c('location', 'time'))

  # Formatting Values
  res_vals <- c(
    res_vals,
    get_all_errors('1_new_spa',  i, sim_data_df, pred_y_df, beta_data_df,
                   beta_pred_df, new_locs, obs_times),
    get_all_errors('2_new_temp', i, sim_data_df, pred_y_df, beta_data_df,
                   beta_pred_df, obs_locs, new_times),
    get_all_errors('3_new_both', i, sim_data_df, pred_y_df, beta_data_df,
                   beta_pred_df, new_locs, new_times),
    get_all_errors('new_total', i, sim_data_df, pred_y_df, beta_data_df,
                   beta_pred_df, new_locs, new_times, use_and = FALSE)
  )
}
df <- as.data.table(matrix(res_vals, ncol = nb_res_cols, byrow = TRUE))
colnames(df) <- res_colnames


[1] "Simulation: 01"
[1] "Simulation: 02"
[1] "Simulation: 03"
[1] "Simulation: 04"
[1] "Simulation: 05"
[1] "Simulation: 06"
[1] "Simulation: 07"
[1] "Simulation: 08"
[1] "Simulation: 09"
[1] "Simulation: 10"


In [36]:
# Show raw data frame
print(df)

    Interpol_Segment   Iter  B_MAE B_RMSE  Y_MAE Y_RMSE
              <char> <char> <char> <char> <char> <char>
 1:        1_new_spa   0001 0.2307 0.3761 0.8881 1.1049
 2:       2_new_temp   0001 0.2397 0.4100 0.8009 1.0124
 3:       3_new_both   0001 0.2304 0.3803 0.8707 1.1037
 4:        new_total   0001 0.2356 0.3952 0.8395 1.0554
 5:        1_new_spa   0002 0.1943 0.2778 0.8389 1.0530
 6:       2_new_temp   0002 0.2009 0.2962 0.8278 1.0230
 7:       3_new_both   0002 0.1998 0.2822 0.8293 1.0820
 8:        new_total   0002 0.1982 0.2882 0.8323 1.0385
 9:        1_new_spa   0003 0.1804 0.2740 0.8556 1.0710
10:       2_new_temp   0003 0.1731 0.2828 0.8123 1.0235
11:       3_new_both   0003 0.1697 0.2591 0.8770 1.0660
12:        new_total   0003 0.1758 0.2780 0.8333 1.0450
13:        1_new_spa   0004 0.2618 0.3727 1.0707 1.3258
14:       2_new_temp   0004 0.2105 0.3277 0.8230 1.0323
15:       3_new_both   0004 0.2569 0.3679 1.0375 1.2641
16:        new_total   0004 0.2335 0.3486 0.9336

In [37]:
# Aggregate results (Table 7)
mean_fmt <- function(x) sprintf('%.4f', mean(x))
sd_fmt <- function(x) sprintf('%.4f', sd(x))
df <- df[, lapply(.SD, as.numeric), by=list(Interpol_Segment)]
df <- df[, .(
    B_MAE_avg = mean_fmt(B_MAE),
    B_MAE_sd = sd_fmt(B_MAE),
    B_RMSE_avg = mean_fmt(B_RMSE),
    B_RMSE_sd = sd_fmt(B_RMSE),
    Y_MAE_avg = mean_fmt(Y_MAE),
    Y_MAE_sd = sd_fmt(Y_MAE),
    Y_RMSE_avg = mean_fmt(Y_RMSE),
    Y_RMSE_sd = sd_fmt(Y_RMSE)
), by=list(Interpol_Segment)]
setkey(df, Interpol_Segment)
print(df)

Key: <Interpol_Segment>
   Interpol_Segment B_MAE_avg B_MAE_sd B_RMSE_avg B_RMSE_sd Y_MAE_avg Y_MAE_sd
             <char>    <char>   <char>     <char>    <char>    <char>   <char>
1:        1_new_spa    0.2318   0.0569     0.3533    0.0865    1.1073   0.3337
2:       2_new_temp    0.2078   0.0346     0.3394    0.0618    0.8120   0.0111
3:       3_new_both    0.2302   0.0550     0.3529    0.0851    1.0681   0.2693
4:        new_total    0.2186   0.0417     0.3471    0.0670    0.9439   0.1466
   Y_RMSE_avg Y_RMSE_sd
       <char>    <char>
1:     1.4977    0.6633
2:     1.0205    0.0092
3:     1.4234    0.4929
4:     1.2728    0.3707


In [38]:
# Use 2 decimal places for the results
res_df <- df[, lapply(.SD, fmt_2dec), by = list(Interpol_Segment)]

# Format in B_mae(avg±sd)/B_rmse(avg±sd) Y_mae(avg±sd)/Y_rmse(avg±sd)
res_df <- res_df[, .(
  B_res = paste(B_MAE_avg, '±', B_MAE_sd, '/', B_RMSE_avg, '±', B_RMSE_sd),
  Y_res = paste(Y_MAE_avg, '±', Y_MAE_sd, '/', Y_RMSE_avg, '±', Y_RMSE_sd)
), by = list(Interpol_Segment)]
res_df

Interpol_Segment,B_res,Y_res
<chr>,<chr>,<chr>
1_new_spa,0.23 ± 0.06 / 0.35 ± 0.09,1.11 ± 0.33 / 1.50 ± 0.66
2_new_temp,0.21 ± 0.03 / 0.34 ± 0.06,0.81 ± 0.01 / 1.02 ± 0.01
3_new_both,0.23 ± 0.06 / 0.35 ± 0.09,1.07 ± 0.27 / 1.42 ± 0.49
new_total,0.22 ± 0.04 / 0.35 ± 0.07,0.94 ± 0.15 / 1.27 ± 0.37
