# Presence only data modelling
Build presence-only models: train on presence and background data, and evaluate on absence and pseudo-absence data (pseudo-absences are used until we have enough true absences). 

## Downloads and imports

In [16]:
install.packages(c("dismo","maptools","glmnet","maxnet","raster","sp","pryr","tune","tidyverse","tictoc","workflows","ROCR"))

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘warp’, ‘DiceDesign’, ‘glue’, ‘tidyselect’, ‘pillar’, ‘lhs’, ‘globals’, ‘tidyr’, ‘ipred’, ‘furrr’, ‘slider’, ‘ellipsis’, ‘pROC’, ‘cli’, ‘dials’, ‘dplyr’, ‘generics’, ‘GPfit’, ‘hardhat’, ‘lifecycle’, ‘parsnip’, ‘recipes’, ‘rlang’, ‘rsample’, ‘tibble’, ‘vctrs’, ‘workflows’, ‘yardstick’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [1]:
library(maxnet)
library(glmnet)
library(dismo)
library(tidyverse)   # packages for modeling and statistical analysis
library(tune)         # For hyperparemeter tuning
library(tictoc)       # for timimg
library(workflows)    # streamline process
library(parsnip)
library(ROCR)

Loading required package: Matrix
Loaded glmnet 4.1-2
Loading required package: raster
Loading required package: sp
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──
[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.4     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mtidyr[39m::[32mexpand()[39m  masks [34mMatrix[39m::expand()
[31m✖[39m [34mtidyr[39m::[32mextract()[39m masks [34mraster[39m::extract()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
[31m✖[39m [34mtidyr[39m::[32mpack()[39m    masks [34mMatrix[39m::pack

## Helper functions

In [2]:
#' Min-max rescale values into the interval [min, max].
#'
#' The observed minimum and maximum are taken over ALL values of `x` at once;
#' for a data frame this means every column shares one global range rather
#' than being scaled column by column.
#'
#' @param x Numeric vector, matrix or all-numeric data frame.
#' @param min Lower bound of the target interval (default 0).
#' @param max Upper bound of the target interval (default 1).
#' @param na.rm If TRUE (default), missing values are ignored when finding the
#'   observed range and stay NA in the output. If FALSE, any NA in `x` makes
#'   the whole result NA.
#' @return An object shaped like `x` with its values rescaled to [min, max].
normalize <- function(x, min = 0, max = 1, na.rm = TRUE) {
    # range() is used instead of min()/max() so the code does not depend on R
    # skipping the numeric arguments `min`/`max` during function lookup.
    observed <- range(x, na.rm = na.rm)
    lower <- observed[1]
    span <- observed[2] - lower

    if (!is.na(span) && span == 0) {
        # Constant input: avoid 0 / 0 = NaN and map every value to the lower
        # bound of the target interval.
        return((x - lower) + min)
    }

    (x - lower) / span * (max - min) + min
}

In [3]:
#' Impute missing values column by column with the column mean.
#'
#' @param dataframe A data frame (or tibble) of numeric columns.
#' @return `dataframe` with every NA replaced by the mean of the non-missing
#'   values of its column. Columns without NA are returned untouched. A column
#'   that is entirely NA has no mean and ends up filled with NaN.
replace_na_with_mean <- function(dataframe) {
    # seq_len() yields an empty sequence for a zero-column frame, whereas
    # 1:ncol() would iterate over c(1, 0) and fail.
    for (i in seq_len(ncol(dataframe))) {
        # [[ ]] always extracts a plain vector, for data frames and tibbles.
        column <- dataframe[[i]]
        is_missing <- is.na(column)
        if (any(is_missing)) {
            column[is_missing] <- mean(column, na.rm = TRUE)
            dataframe[[i]] <- column
        }
    }
    dataframe
}

In [4]:
#' Fit a maxnet model on training data and evaluate it on held-out data.
#'
#' Both data sets must contain a `presence` column; every other column is used
#' as a predictor. Predictors are mean-imputed with `replace_na_with_mean()`
#' and rescaled with `normalize()` before fitting and before evaluation.
#'
#' @param training_data Data frame of predictors plus a `presence` column.
#' @param test_data Data frame with the same layout. Only rows whose
#'   `presence` is "0" or "1" are evaluated; other values (e.g. 2 for real
#'   absences) are dropped.
#' @param classes Feature classes forwarded to `maxnet.formula()`.
#' @param regmult Regularisation multiplier forwarded to `maxnet()`.
#' @param plot_diagram If TRUE, plot the ROC curve of the evaluation.
#' @return The object returned by `evaluate()` for the held-out rows.
train_test_evaluate <- function(training_data, test_data, classes, regmult, plot_diagram) {
    tic()
    # Stop the timer on every exit path; a bare toc() at the end would be
    # skipped on error and leave a dangling entry on the tictoc stack.
    on.exit(toc(), add = TRUE)

    # Without this check a missing label column would silently select zero
    # predictor columns instead of failing.
    stopifnot(
        "presence" %in% names(training_data),
        "presence" %in% names(test_data)
    )

    # Generate and normalize train data.
    # drop = FALSE keeps a data frame even if a single predictor remains.
    train_x <- training_data[, names(training_data) != "presence", drop = FALSE]
    train_x <- replace_na_with_mean(train_x)
    train_x <- normalize(train_x)

    # Train model
    maxent_model <- maxnet(
        training_data$presence,
        train_x,
        maxnet.formula(training_data$presence, train_x, classes = classes),
        regmult = regmult
    )

    # Generate and normalize test data.
    # Ignore real absences for now - we evaluate on pseudo-absences only, so
    # rows with presence = 2 are dropped.
    test_data <- test_data[test_data$presence %in% c("0", "1"), ]
    test_x <- test_data[, names(test_data) != "presence", drop = FALSE]
    test_x <- replace_na_with_mean(test_x)
    # NOTE(review): the held-out predictors are rescaled with their own
    # min/max, not the range seen during training - confirm this is intended.
    test_x <- normalize(test_x)

    # Test model: presence rows first, pseudo-absence rows second.
    result_test <- evaluate(
        test_x[test_data$presence == "1", , drop = FALSE],
        test_x[test_data$presence == "0", , drop = FALSE],
        maxent_model
    )
    if (plot_diagram) {
        plot(result_test, "ROC")
    }
    result_test
}

## Loading data 

In [5]:
# Read the preprocessed train/validation and test splits
training_data <- read.csv(file = "../data/preprocessed_train_val_random.csv", header = TRUE)
test_data <- read.csv(file = "../data/preprocessed_test_random.csv", header = TRUE)

# Read the background data that was generated earlier
background_full <- read.csv(
  file = "../data/preprocessed_background_data_updated.csv_full.csv",
  header = TRUE
)

# Presence-only rows of the training split
train_presence <- training_data[training_data$presence == "1", ]

# Training set = presence rows stacked on top of the background rows
training_data <- rbind(train_presence, background_full)

In [6]:
# Ignore the first (index) column of both data sets.
# A negative index with drop = FALSE is used instead of 2:ncol(...): that range
# counts backwards (2, 1) on a one-column frame, and plain `[` collapses the
# result to a vector when only one column is left.
training_data <- training_data[, -1, drop = FALSE]
test_data <- test_data[, -1, drop = FALSE]

In [7]:
# Preview the first six rows of the combined training set
head(training_data, n = 6L)

AvgSurfT_inst_bucket_1,AvgSurfT_inst_bucket_2,AvgSurfT_inst_bucket_3,AvgSurfT_inst_bucket_4,AvgSurfT_inst_bucket_5,AvgSurfT_inst_bucket_6,AvgSurfT_inst_bucket_7,AvgSurfT_inst_bucket_8,AvgSurfT_inst_bucket_9,AvgSurfT_inst_bucket_10,...,SoilTMP10_40cm_inst_bucket_8,SoilTMP10_40cm_inst_bucket_9,SoilTMP10_40cm_inst_bucket_10,SoilTMP10_40cm_inst_bucket_11,SoilTMP10_40cm_inst_bucket_12,SoilTMP10_40cm_inst_bucket_13,SoilTMP10_40cm_inst_bucket_14,sand_0.5cm_mean,sand_5.15cm_mean,presence
301.2946,300.712,301.1873,303.945,300.6749,302.7356,304.7987,303.979,306.4119,308.0052,...,305.3548,306.8133,308.2266,308.759,309.1228,309.865,309.2894,0.5876362,0.5805887,1
301.2946,300.712,301.1873,303.945,300.6749,302.7356,304.7987,303.979,306.4119,308.0052,...,305.3548,306.8133,308.2266,308.759,309.1228,309.865,309.2894,0.5871003,0.5799854,1
301.2946,300.712,301.1873,303.945,300.6749,302.7356,304.7987,303.979,306.4119,308.0052,...,305.3548,306.8133,308.2266,308.759,309.1228,309.865,309.2894,0.5868201,0.5797687,1
299.7526,301.1066,300.7033,302.7293,301.4368,301.1487,303.9104,304.342,306.3383,308.3056,...,305.2167,307.0625,308.6653,308.9538,309.1529,310.0603,309.6583,0.5807767,0.5741047,1
299.7526,301.1066,300.7033,302.7293,301.4368,301.1487,303.9104,304.342,306.3383,308.3056,...,305.2167,307.0625,308.6653,308.9538,309.1529,310.0603,309.6583,0.5800296,0.573147,1
299.282,302.043,300.7356,302.7871,302.2794,301.1952,303.8422,304.9244,305.841,307.3593,...,305.3094,306.6612,307.7801,308.5342,308.8986,309.6441,309.5571,0.5868835,0.5801188,1


## Build models

### V1 - Train, validation, test split 
Version where we split the training data into train and validation sets for hyperparameter tuning. 

In [13]:
# Split the training data into a train part (75%) and a validation part (25%).
# The seed is fixed so the partition is reproducible.
set.seed(42)

num_training_samples <- nrow(training_data)
smp_size <- floor(num_training_samples * 0.75)

# Row numbers drawn without replacement for the train part
train_ind <- sample.int(num_training_samples, size = smp_size)

validation <- training_data[-train_ind, ]
train <- training_data[train_ind, ]

In [14]:
# Report the validation share of the split and its class balance
sprintf(
  "Validation is %s of train set. Presence points is %s, psuedo-absense/background is %s ",
  nrow(validation) / (nrow(train) + nrow(validation)),
  nrow(validation[validation$presence == "1", ]) / nrow(validation),
  nrow(validation[validation$presence == "0", ]) / nrow(validation)
)

###  Hyperparam Search

In [None]:
# Create search grid: every combination of feature classes and regularisation
# multiplier. stringsAsFactors = FALSE keeps `classes` a plain character value
# instead of a factor when it is printed and passed on to the model.
search_grid <- expand.grid(
  classes = c("default", "l", "p", "h", "t", "lq"),
  regmult = seq(0.5, 10, by = 1.5),
  stringsAsFactors = FALSE
)
n_configs <- nrow(search_grid)
n_configs

# Best configuration seen so far. The fitted model itself is not kept:
# train_test_evaluate() only returns the evaluation result, so the model is
# refit later from best_class / best_regmult.
best_auc <- 0
best_result <- NULL
best_class <- NULL
best_regmult <- NULL

# Loop through search grid and find best params
for (row in seq_len(n_configs)) {
  tic()

  # Get vars
  regmult <- search_grid$regmult[row]
  classes <- search_grid$classes[row]

  cat(sprintf("Running config %s out of %s - reg mult:%s classes:%s \n", row, n_configs, regmult, classes))
  result <- train_test_evaluate(train, validation, classes, regmult, plot_diagram = FALSE)

  # result@kappa holds one value per threshold, so report its maximum.
  # (Passing it as the second argument of print() would be read as `digits`.)
  cat(sprintf("AUC: %s, max kappa: %s \n", result@auc, max(result@kappa, na.rm = TRUE)))

  # isTRUE() keeps the loop running if a configuration yields an NA AUC.
  if (isTRUE(best_auc < result@auc)) {
    best_auc <- result@auc
    best_result <- result
    best_class <- classes
    best_regmult <- regmult
    print("New best model")
  }

  flush.console()
  toc()
}

Refit on the full training data using the best hyperparameters found on the validation set, then evaluate on the test set.

In [None]:
# Final run: fit on the full training data with the selected hyperparameters,
# evaluate on the test set and plot the ROC curve
train_test_evaluate(
  training_data = training_data,
  test_data = test_data,
  classes = best_class,
  regmult = best_regmult,
  plot_diagram = TRUE
)