In [None]:
# Copyright 2021, Battelle Energy Alliance, LLC

library(prospectr)
library(reticulate)
library(dotenv)
library(jsonlite)

In [None]:
# Move up one level when the notebook is started from inside a directory
# named "split", so that relative paths such as ".env" resolve against the
# parent directory.
pwd <- getwd()
if (basename(pwd) == "split") {
    # dirname() returns the parent directory on every platform, so no manual
    # splitting and re-joining of path components is needed.
    setwd(dirname(pwd))
}

In [None]:
# Load the variables declared in the local .env file, then parse the ML
# Adapter JSON object referenced by ML_ADAPTER_OBJECT_LOCATION.
load_dot_env(file = ".env")
file_path <- Sys.getenv("ML_ADAPTER_OBJECT_LOCATION")
data <- fromJSON(txt = file_path)

In [None]:
# Read the dataset from the .csv file named by the DATASET entry of the
# ML Adapter object. check.names = FALSE leaves the column headers exactly
# as they appear in the file.
split_file <- data$DATASET
dataset <- read.csv(split_file, check.names = FALSE)
# print(dim(dataset))

In [None]:
# Keep only the numeric columns of the dataset.
# vapply() always yields a logical vector (also for zero columns), so the
# column mask has a stable type regardless of the input.
is_numeric_col <- vapply(dataset, is.numeric, logical(1))
dataset_numeric <- dataset[is_numeric_col]
# print(dim(dataset_numeric))

In [None]:
# Parse the SPLIT environment variable (a JSON string) and read the
# Kennard-Stone settings from it:
#   N - compared against the row count to decide whether to sample
#   k - passed to kenStone() as the number of rows to select
split_methods <- fromJSON(txt = Sys.getenv("SPLIT"))
ks_settings <- split_methods$kennard_stone
N <- ks_settings$N
k <- ks_settings$k

In [None]:
# Take a reproducible random sample of N rows when the dataset has more than
# N rows; otherwise use every row.
#   X      - numeric columns only (input to the Kennard-Stone selection)
#   X_full - the same rows with every column
# One index vector is drawn and used for both, so their row names stay
# aligned.
if (nrow(dataset_numeric) > N) {
    set.seed(10000)
    sample_rows <- sample(nrow(dataset_numeric), N)
    # drop = FALSE keeps a data frame even when only one column is present.
    X <- dataset_numeric[sample_rows, , drop = FALSE]
    X_full <- dataset[sample_rows, , drop = FALSE]
} else {
    X <- dataset_numeric
    # X_full has to exist on this path too: it is indexed when the training
    # rows are looked up.
    X_full <- dataset
}
# print(dim(X))

In [None]:
# When X has fewer than N rows, shrink k so that the configured k / N ratio
# is kept, rounding up to a whole number of rows.
available_rows <- nrow(X)
if (available_rows < N) {
    fraction <- k / N
    k <- ceiling(available_rows * fraction)
    # print(fraction)
    # print(k)
}

In [None]:
# Select k rows of X with the Kennard-Stone algorithm (Euclidean distance),
# recording a timestamp before and after the call.
start_time <- Sys.time()
selection <- kenStone(X, k = k, metric = "euclid")
end_time <- Sys.time()
# print(end_time - start_time)

In [None]:
# Build the output paths <working directory>/data/{training,testing}_set.csv.
# file.path() joins the components with a separator that is valid on every
# platform.
pwd <- getwd()
training_path <- file.path(pwd, "data", "training_set.csv")
testing_path <- file.path(pwd, "data", "testing_set.csv")

# X is a row subset of the numeric columns of `dataset`, and row names are
# kept by that subsetting, so the row names of the selected rows give their
# positions in `dataset`.
dataset_indices <- as.numeric(rownames(dataset))
train_indices <- as.numeric(rownames(X)[selection$model])
# Every row that was not selected for training goes to the testing set.
test_indices <- setdiff(dataset_indices, train_indices)

# Write training and testing sets to csv files. drop = FALSE keeps a data
# frame (and therefore the column header) even for a single-column dataset.
write.csv(
    x = dataset[train_indices, , drop = FALSE],
    file = training_path,
    row.names = FALSE
)
write.csv(
    x = dataset[test_indices, , drop = FALSE],
    file = testing_path,
    row.names = FALSE
)