# Presence only data modelling
Build presence-only models, using presence and background data for training, and absence and pseudo-absence data for evaluation (pseudo-absences are used until enough true absences are available).

## Downloads and imports

In [16]:
install.packages(c("dismo","maptools","glmnet","maxnet","raster","sp","pryr","tune","tidyverse","tictoc","workflows","ROCR","MLmetrics"))

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘warp’, ‘DiceDesign’, ‘glue’, ‘tidyselect’, ‘pillar’, ‘lhs’, ‘globals’, ‘tidyr’, ‘ipred’, ‘furrr’, ‘slider’, ‘ellipsis’, ‘pROC’, ‘cli’, ‘dials’, ‘dplyr’, ‘generics’, ‘GPfit’, ‘hardhat’, ‘lifecycle’, ‘parsnip’, ‘recipes’, ‘rlang’, ‘rsample’, ‘tibble’, ‘vctrs’, ‘workflows’, ‘yardstick’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [1]:
# Attach the packages used by this notebook. The order is kept as-is because
# later packages mask same-named functions from earlier ones.
library(maxnet)
library(glmnet)
library(dismo)
library(tidyverse)   # packages for modeling and statistical analysis
library(tune)         # for hyperparameter tuning
library(tictoc)       # for timing
library(workflows)    # streamline process
library(parsnip)
library(ROCR)
library(MLmetrics)

Loading required package: Matrix
Loaded glmnet 4.1-2
Loading required package: raster
Loading required package: sp
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand()  masks Matrix::expand()
✖ tidyr::extract() masks raster::extract()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ tidyr::pack()    masks Matrix::pack

## Helper functions

In [2]:
# Replace every NA in each numeric (or logical) column with that column's mean.
#
# Args:
#   dataframe: a data.frame (or tibble) whose columns may contain NA values.
#
# Returns:
#   The same data frame with NAs in numeric/logical columns replaced by the
#   mean of the non-missing values of that column. Columns of any other type
#   are returned untouched. A column with no non-missing values ends up as NaN,
#   because the mean of an empty set is NaN.
replace_na_with_mean <- function(dataframe) {
    # seq_len() yields an empty sequence for a zero-column frame, whereas
    # 1:ncol() would iterate over c(1, 0) and fail.
    for (i in seq_len(ncol(dataframe))) {
        # [[i]] always extracts a plain vector, including for tibbles.
        column <- dataframe[[i]]

        # mean() is only meaningful for numeric/logical data; skipping other
        # types avoids a warning and a pointless NA-to-NA assignment.
        if (!(is.numeric(column) || is.logical(column))) {
            next
        }

        missing <- is.na(column)
        if (any(missing)) {
            column[missing] <- mean(column, na.rm = TRUE)
            dataframe[[i]] <- column
        }
    }
    dataframe
}

In [3]:
# Fit a maxnet model on `training_data` and score it on `test_data`.
#
# Args:
#   training_data: data frame with a `presence` column plus predictor columns.
#                  `presence` is handed to maxnet() as its response vector.
#   test_data:     data frame with the same layout. Only rows whose `presence`
#                  is 0 or 1 are scored; any other value (e.g. 2) is dropped.
#   classes:       feature-class string forwarded to maxnet.formula().
#   regmult:       regularization multiplier forwarded to maxnet().
#   plot_diagram:  currently unused; kept so existing callers keep working.
#
# Returns:
#   A one-row data.frame with columns `accuracy`, `f1` and `auc`. Accuracy and
#   F1 use a 0.5 threshold on the cloglog prediction.
#
# Side effects:
#   Prints the elapsed time via tictoc.
train_test_evaluate <- function(training_data, test_data, classes, regmult,
                                plot_diagram = FALSE) {
    tic()
    # Run toc() on every exit path so a failing fit does not leave a dangling
    # timer on the tictoc stack.
    on.exit(toc(), add = TRUE)

    if (!("presence" %in% names(training_data)) ||
        !("presence" %in% names(test_data))) {
        stop("training_data and test_data must both contain a 'presence' column",
             call. = FALSE)
    }

    # Predictors are every column except `presence`. drop = FALSE keeps a data
    # frame even when only one predictor remains.
    train_x <- training_data[, names(training_data) != "presence", drop = FALSE]

    ## Remove nans
    train_x <- replace_na_with_mean(train_x)

    ## Train model
    model_formula <- maxnet.formula(training_data$presence, train_x,
                                    classes = classes)
    maxent_model <- maxnet(training_data$presence, train_x, model_formula,
                           regmult = regmult)

    ## Ignore real absences for now - we eval on pseudo absences - ignore presence=2
    test_data <- test_data[test_data$presence %in% c("0", "1"), ]
    test_x <- test_data[, names(test_data) != "presence", drop = FALSE]

    # Remove nans (imputed with the test set's own column means)
    test_x <- replace_na_with_mean(test_x)

    # Test model
    test_pred <- predict(maxent_model, test_x, clamp = TRUE, type = "cloglog")
    pred_class <- ifelse(test_pred >= .5, 1, 0)

    # MLmetrics is inconsistent about argument order (AUC/Accuracy take y_pred
    # first, F1_Score takes y_true first), so every argument is named.
    auc <- AUC(y_pred = test_pred, y_true = test_data$presence)
    accuracy <- Accuracy(y_pred = pred_class, y_true = test_data$presence)
    # NOTE(review): `positive` is left at its default, which appears to pick
    # the first class level (likely 0, not presence = 1) - confirm which class
    # the F1 should describe.
    f1 <- F1_Score(y_true = test_data$presence, y_pred = pred_class)

    data.frame(accuracy = accuracy, f1 = f1, auc = auc)
}

## Build models

### Loading data 

In [4]:
# Read the preprocessed, already-scaled training and validation splits.
train <- read.csv(
  file = "../data/processed_and_scaled/scaled_train.csv",
  header = TRUE
)
validation <- read.csv(
  file = "../data/processed_and_scaled/scaled_val.csv",
  header = TRUE
)

In [5]:
# Summarise the split: the validation share of train + validation rows, and
# the share of validation rows labelled presence ("1") vs. pseudo-absence /
# background ("0"). local() keeps the intermediates out of the global workspace.
local({
  n_val <- nrow(validation)
  val_share <- n_val / (nrow(train) + n_val)
  presence_share <- nrow(validation[validation$presence == "1", ]) / n_val
  background_share <- nrow(validation[validation$presence == "0", ]) / n_val
  sprintf(
    "Validation is %s of train set. Presence points is %s, psuedo-absense/background is %s ",
    val_share, presence_share, background_share
  )
})

###  Hyperparam Search
Use the training and validation sets to find the best hyperparameters.

In [None]:
# Create search grid: every feature-class setting crossed with every
# regularization multiplier. stringsAsFactors = FALSE keeps `classes` as plain
# strings instead of a factor.
search_grid <- expand.grid(
  classes = c("default", "l", "lq"),
  regmult = seq(0.05, 1, by = 0.10),
  stringsAsFactors = FALSE
)
nrow(search_grid)

best_auc <- 0
# Loop through search grid and find best model and params
for (row in seq_len(nrow(search_grid))) {
  tic()

  # Get vars
  regmult <- search_grid$regmult[row]
  classes <- search_grid$classes[row]

  cat(sprintf("Running config %s out of %s - reg mult:%s classes:%s \n", row, nrow(search_grid), regmult, classes))
  result <- train_test_evaluate(train, validation, classes, regmult, plot_diagram = FALSE)

  # Keep a configuration only if it beats the best validation AUC seen so far.
  # best_auc has to be updated here, otherwise every run compares against 0
  # and the last configuration always wins. An NA AUC is skipped.
  if (!is.na(result$auc) && best_auc < result$auc) {
    best_auc <- result$auc
    best_result <- result
    best_class <- classes
    best_regmult <- regmult
    print("New best model")
    print(best_result)
  }

  flush.console()
  toc()
}

Retrain with the best hyperparameters found above and evaluate on each of the test sets, which were generated with different seeds.

In [None]:
# Final training set: validation and training rows combined.
training_data <- rbind(validation, train)

# Evaluate the best configuration on each test file (scaled_test_0.csv ...
# scaled_test_29.csv). Each iteration returns a one-row data frame; they are
# bound together once at the end instead of growing a data frame with rbind()
# inside the loop.
test_indices <- 0:29
per_split_results <- lapply(test_indices, function(i) {
    tic()
    test_filename <- sprintf("../data/processed_and_scaled/scaled_test_%s.csv", i)
    test_data <- read.csv(test_filename, header = TRUE)

    test_result <- train_test_evaluate(training_data, test_data, best_class, best_regmult, plot_diagram = FALSE)
    test_result[, "index"] <- i

    flush.console()
    toc()
    test_result
})

# Columns: accuracy, f1, auc, index - one row per test file.
all_results <- do.call(rbind, per_split_results)

In [None]:
# Put the test-set index first, then the metrics, and display the table.
result_columns <- c("index", "accuracy", "f1", "auc")
all_results <- all_results[result_columns]
all_results

In [None]:
# Save the per-test-set metrics to disk without row names.
write.csv(all_results, file = "maxent.csv", row.names = FALSE)