# Presence only data modelling
Build presence only models using presence and background data for training; and absence and pseudo-absence (until we get enough true absences) for testing. 

## Downloads and imports

In [16]:
install.packages(c("dismo","maptools","glmnet","maxnet","raster","sp","pryr","tune","tidyverse","tictoc","workflows","ROCR","MLmetrics"))

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘warp’, ‘DiceDesign’, ‘glue’, ‘tidyselect’, ‘pillar’, ‘lhs’, ‘globals’, ‘tidyr’, ‘ipred’, ‘furrr’, ‘slider’, ‘ellipsis’, ‘pROC’, ‘cli’, ‘dials’, ‘dplyr’, ‘generics’, ‘GPfit’, ‘hardhat’, ‘lifecycle’, ‘parsnip’, ‘recipes’, ‘rlang’, ‘rsample’, ‘tibble’, ‘vctrs’, ‘workflows’, ‘yardstick’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [1]:
library(maxnet)
library(glmnet)
library(dismo)
library(tidyverse)   # packages for modeling and statistical analysis
library(tune)         # For hyperparemeter tuning
library(tictoc)       # for timimg
library(workflows)    # streamline process
library(parsnip)
library(ROCR)
library(MLmetrics)

Loading required package: Matrix
Loaded glmnet 4.1-2
Loading required package: raster
Loading required package: sp
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──
[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.4     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mtidyr[39m::[32mexpand()[39m  masks [34mMatrix[39m::expand()
[31m✖[39m [34mtidyr[39m::[32mextract()[39m masks [34mraster[39m::extract()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
[31m✖[39m [34mtidyr[39m::[32mpack()[39m    masks [34mMatrix[39m::pack

## Helper functions

In [13]:
replace_na_with_mean <- function(dataframe) {
    for(i in 1:ncol(dataframe)) {                                   # Replace NA in all columns
        dataframe[ , i][is.na(dataframe[ , i])] <- mean(dataframe[ , i], na.rm = TRUE)
    }
    return(dataframe)
}

In [14]:
train_test_evaluate <- function(training_data,test_data,classes,regmult,plot_diagram) {
    # Generate and normalize train data
    train_x <- training_data[, -which(names(training_data) == "presence")]


    ## Remove nans
    train_x = replace_na_with_mean(train_x) 
    
    ## Train model
    maxent_model_for_run <- maxnet(training_data$presence, train_x, maxnet.formula(training_data$presence, train_x, classes=classes),regmult=regmult)
    
    # Generate and normalize test data
    ## Ignore real absences for now - we eval on pseudo absences - ignore presence=2
    test_data = test_data[test_data$presence %in% c("0", "1"), ]

    test_x <- test_data[, -which(names(test_data) == "presence")]

    # Remove nans
    test_x = replace_na_with_mean(test_x) 

    # Test model
    test_pred = predict(maxent_model_for_run, test_x, clamp=T, type="cloglog")
    auc <- AUC(y_pred=test_pred, y_true=test_data$presence)
    accuracy <- Accuracy(y_pred=ifelse(test_pred >= .5, 1, 0), y_true=test_data$presence)
    f1 <- F1_Score(y_pred=ifelse(test_pred >= .5, 1, 0), y_true=test_data$presence)
    result_test <- data.frame(accuracy,f1,auc)

    return(result_test)
}

train_test_evaluate_existing_model <- function(maxent_model,test_data,plot_diagram) {
    tic()
    # Generate and normalize test data
    ## Ignore real absences for now - we eval on pseudo absences - ignore presence=2
    test_data = test_data[test_data$presence %in% c("0", "1"), ]

    test_x <- test_data[, -which(names(test_data) == "presence")]

    # Remove nans
    test_x = replace_na_with_mean(test_x) 

    # Test model
    test_pred = predict(maxent_model, test_x, clamp=T, type="cloglog")
    auc <- AUC(y_pred=test_pred, y_true=test_data$presence)
    accuracy <- Accuracy(y_pred=ifelse(test_pred >= .5, 1, 0), y_true=test_data$presence)
    f1 <- F1_Score(y_pred=ifelse(test_pred >= .5, 1, 0), y_true=test_data$presence)
    result_test <- data.frame(accuracy,f1,auc)
    toc()
    return(result_test)
}

In [15]:
# randomly sample from dataframe
sample_from_dataset <- function(dataset,sample_percentage) {
    num_training_samples = nrow(dataset)
    smp_size <- floor(sample_percentage * num_training_samples)
    train_ind <- sample(seq_len(num_training_samples), size = smp_size)
    return(dataset[train_ind, ])
}

## Build models

### Loading data 

In [6]:
# Load preprocessed and scaled data
train <- read.csv("../data/processed_and_scaled/scaled_train.csv", header=TRUE)
validation <- read.csv("../data/processed_and_scaled/scaled_val.csv", header=TRUE)

In [7]:
sprintf("Validation is %s of train set. Presence points is %s, psuedo-absense/background is %s ", nrow(validation)/(nrow(train)+nrow(validation)),nrow(validation[validation$presence=="1",])/nrow(validation), nrow(validation[validation$presence=="0",])/nrow(validation))

In [9]:
# Get presence only
nrow(train)
nrow(validation)

###  Hyperparam Search
Use train and validation to find best hyparams

In [8]:
colnames(train)

#### Use the train and validation set to find good hyperparams (or a subset of train and validation if this takes too long)

In [24]:
# We need to remove some columns since Maxent doesn't seem to converge with high dimentions + lots of samples
cols_to_remove = c("Tveg","Wind","Rainf","X")
train_v2 = train %>% select(-contains(cols_to_remove))
val_v2 = validation %>% select(-contains(cols_to_remove))

# Only use some of the training data - this takes too long while all the training data
train_sample = sample_from_dataset(train_v2,0.25)
validation_sample = val_v2

print(nrow(train_sample))
print(ncol(train_sample))

# Create search grid
# Classess impact function expressivity - https://github.com/mrmaxent/maxnet/blob/d4ec82566992d49fa4371bf1f4e818a1031bfe58/R/maxnet.formula.R#L3 & 
#   - https://cran.r-project.org/web/packages/maxnet/maxnet.pdf
# Regmult is the regularization factor - default is 1
# search_grid <- expand.grid(classes=c("l","lq","lqh"),regmult=seq(0.10, 1.10, by=0.10))
search_grid <- expand.grid(classes=c("l","lq","lqh","h","t","p"),regmult=1.0)
nrow(search_grid)

best_auc = 0
# Loop through search grid and find best model and params
for(row in 1:nrow(search_grid)){
  tic()
    
  # Get vars
  regmult <- search_grid$regmult[row]
  classes <- search_grid$classes[row]
    
  cat(sprintf("Running config %s out of %s - reg mult:%s classes:%s \n", row,nrow(search_grid), regmult,classes))
  result = train_test_evaluate(train_sample,validation_sample,classes,regmult,plot_diagram=FALSE)
  print(result)
  if(best_auc < result$auc){
   best_result <- result
   best_class <- classes
   best_regmult <- regmult
   best_auc <-  result$auc
   print("New best model")
   print(best_result)
  }
  
  flush.console()
  toc()
}

[1] 8639
[1] 133


Running config 1 out of 6 - reg mult:1 classes:l 
   accuracy        f1       auc
1 0.7667824 0.7469547 0.8353039
[1] "New best model"
   accuracy        f1       auc
1 0.7667824 0.7469547 0.8353039
81.443 sec elapsed
Running config 2 out of 6 - reg mult:1 classes:lq 
   accuracy        f1      auc
1 0.8097222 0.8075843 0.876299
[1] "New best model"
   accuracy        f1      auc
1 0.8097222 0.8075843 0.876299
72.635 sec elapsed
Running config 3 out of 6 - reg mult:1 classes:lqh 
   accuracy        f1       auc
1 0.8395833 0.8428215 0.9141268
[1] "New best model"
   accuracy        f1       auc
1 0.8395833 0.8428215 0.9141268
132.859 sec elapsed
Running config 4 out of 6 - reg mult:1 classes:h 
   accuracy        f1      auc
1 0.8390046 0.8419857 0.913971
131.174 sec elapsed
Running config 5 out of 6 - reg mult:1 classes:t 
  accuracy        f1       auc
1 0.849537 0.8517674 0.9202665
[1] "New best model"
  accuracy        f1       auc
1 0.849537 0.8517674 0.9202665
96.63 sec elapsed
R

#### Use best params found on train & val sets, and use those on the test set

In [32]:
print(best_class)
print(best_regmult)

[1] t
Levels: l lq lqh h t p
[1] 1


In [34]:
# Train model with best parameters
tic()
cols_to_remove = c("Tveg","Wind","Rainf","X")
training_data = rbind(validation, train)

training_data_subset_cols <- training_data %>% select(-contains(cols_to_remove))

all_results <- data.frame(index=integer(),accuracy=double(),f1=double(),
                 auc=double()) 

train_x <- training_data_subset_cols[, -which(names(training_data_subset_cols) == "presence")]

## Remove nans
train_x = replace_na_with_mean(train_x) 

## Train model
maxent_model <- maxnet(training_data_subset_cols$presence, train_x, maxnet.formula(training_data_subset_cols$presence, train_x, classes=best_class),regmult=best_regmult)
toc()

for (i in seq(7, 700, by=7)){
    tic()
    test_filename <- sprintf("../data/test_data/test_scaled_seed_%s.csv", i)
    
    test_data <- read.csv(test_filename, header=TRUE)
    
    test_data <- test_data %>% select(-contains(cols_to_remove))
    
    test_result <- train_test_evaluate_existing_model(maxent_model,test_data,plot_diagram=FALSE)
    test_result[, "seed"] <- i
    all_results = rbind(all_results, test_result)
    
    flush.console()
    toc()
}

print(all_results)
results_filename <- sprintf("../results/presence-only/maxent_%s_class_%s_reg.csv", best_class,best_regmult)
write.csv(all_results,results_filename,row.names=FALSE)

1929.156 sec elapsed
0.346 sec elapsed
0.957 sec elapsed
0.349 sec elapsed
0.974 sec elapsed
0.382 sec elapsed
1.012 sec elapsed
0.358 sec elapsed
1.005 sec elapsed
0.354 sec elapsed
0.997 sec elapsed
0.349 sec elapsed
1.558 sec elapsed
0.34 sec elapsed
1.009 sec elapsed
0.352 sec elapsed
1.006 sec elapsed
0.786 sec elapsed
1.441 sec elapsed
0.344 sec elapsed
1.024 sec elapsed
0.784 sec elapsed
1.474 sec elapsed
1.754 sec elapsed
2.434 sec elapsed
0.344 sec elapsed
1.071 sec elapsed
0.345 sec elapsed
1.59 sec elapsed
0.351 sec elapsed
2.49 sec elapsed
0.351 sec elapsed
0.958 sec elapsed
0.352 sec elapsed
0.964 sec elapsed
0.353 sec elapsed
0.968 sec elapsed
0.342 sec elapsed
0.967 sec elapsed
0.955 sec elapsed
1.611 sec elapsed
0.358 sec elapsed
1.033 sec elapsed
0.352 sec elapsed
1.004 sec elapsed
0.337 sec elapsed
0.974 sec elapsed
0.332 sec elapsed
1.443 sec elapsed
0.755 sec elapsed
1.4 sec elapsed
1.783 sec elapsed
2.453 sec elapsed
0.357 sec elapsed
1.122 sec elapsed
0.34 sec ela

#### This configuration worked best on test data - even though not nec best on train and validation

In [22]:
best_class <- "l"
best_regmult <- 1 

tic()
cols_to_remove = c("Tveg","Wind","Rainf","X")
training_data = rbind(validation, train)

training_data_subset_cols <- training_data %>% select(-contains(cols_to_remove))

all_results <- data.frame(index=integer(),accuracy=double(),f1=double(),
                 auc=double()) 

train_x <- training_data_subset_cols[, -which(names(training_data_subset_cols) == "presence")]

## Remove nans
train_x = replace_na_with_mean(train_x) 

## Train model
maxent_model <- maxnet(training_data_subset_cols$presence, train_x, maxnet.formula(training_data_subset_cols$presence, train_x, classes=best_class),regmult=best_regmult)
toc()

for (i in seq(7, 700, by=7)){
    tic()
    test_filename <- sprintf("../data/test_data/test_scaled_seed_%s.csv", i)
    
    test_data <- read.csv(test_filename, header=TRUE)
    
    test_data <- test_data %>% select(-contains(cols_to_remove))
    
    test_result <- train_test_evaluate_existing_model(maxent_model,test_data,plot_diagram=FALSE)
    test_result[, "seed"] <- i
    all_results = rbind(all_results, test_result)
    
    flush.console()
    toc()
}

print(all_results)
results_filename <- sprintf("../results/presence-only/maxent_%s_class_%s_reg.csv", best_class,best_regmult)
write.csv(all_results,results_filename,row.names=FALSE)

1748.757 sec elapsed
0.079 sec elapsed
0.664 sec elapsed
0.138 sec elapsed
0.768 sec elapsed
0.077 sec elapsed
0.979 sec elapsed
0.078 sec elapsed
0.721 sec elapsed
0.086 sec elapsed
0.726 sec elapsed
0.518 sec elapsed
1.164 sec elapsed
0.077 sec elapsed
0.742 sec elapsed
0.077 sec elapsed
0.727 sec elapsed
0.525 sec elapsed
1.182 sec elapsed
0.077 sec elapsed
0.751 sec elapsed
0.077 sec elapsed
0.74 sec elapsed
0.078 sec elapsed
1.182 sec elapsed
0.081 sec elapsed
2.152 sec elapsed
0.081 sec elapsed
0.705 sec elapsed
0.077 sec elapsed
0.701 sec elapsed
0.078 sec elapsed
0.705 sec elapsed
0.528 sec elapsed
1.161 sec elapsed
0.076 sec elapsed
0.741 sec elapsed
0.077 sec elapsed
0.717 sec elapsed
0.077 sec elapsed
0.723 sec elapsed
0.549 sec elapsed
1.201 sec elapsed
0.076 sec elapsed
0.746 sec elapsed
0.077 sec elapsed
0.727 sec elapsed
0.076 sec elapsed
1.173 sec elapsed
0.078 sec elapsed
2.099 sec elapsed
0.077 sec elapsed
0.774 sec elapsed
0.077 sec elapsed
2 sec elapsed
0.076 sec el