# Presence only data modelling
Build presence-only models using presence and background data for training, and absence and pseudo-absence data for testing.

## Downloads and imports

In [None]:
# Install these packages if you aren't running through Docker
# install.packages(c("dismo","maxnet","tictoc","MLmetrics","dplyr"),repos = "http://cran.us.r-project.org")

In [None]:
library(maxnet)       
library(tictoc)          
library(MLmetrics)
library(dplyr)

## Helper functions

In [None]:
replace_na_with_mean <- function(dataframe) {
    # Replace the NA entries of every column with that column's mean.
    #
    # Args:
    #   dataframe: data.frame (or tibble) whose columns should be imputed.
    #
    # Returns:
    #   `dataframe` with the NAs of each imputed column replaced by the mean
    #   of that column's non-missing values. Character and factor columns are
    #   returned untouched, because a mean is undefined for them.

    # seq_len() rather than 1:ncol(): a zero-column input is returned as-is
    # instead of iterating over c(1, 0) and failing on a missing column.
    for (i in seq_len(ncol(dataframe))) {
        # `[[` always yields the column vector, including for tibbles.
        column <- dataframe[[i]]
        if (is.character(column) || is.factor(column)) {
            next
        }
        # A column that is entirely NA has mean NaN, so its entries become NaN.
        column[is.na(column)] <- mean(column, na.rm = TRUE)
        dataframe[[i]] <- column
    }
    return(dataframe)
}

In [None]:
train_test_evaluate <- function(training_data, test_data, classes, regmult, plot_diagram) {
    # Fit a maxnet model on `training_data` and score it on `test_data`.
    #
    # Args:
    #   training_data: data.frame with a `presence` column plus feature columns.
    #   test_data: data.frame with the same layout; only rows whose `presence`
    #     is 0 or 1 are used for scoring.
    #   classes: feature-class specification forwarded to maxnet.formula().
    #   regmult: regularisation multiplier forwarded to maxnet().
    #   plot_diagram: currently unused; kept so existing callers keep working.
    #
    # Returns:
    #   One-row data.frame with columns accuracy, f1 and auc.

    # Feature columns = everything except the label. A logical mask with
    # drop = FALSE (instead of -which(...)) still works when `presence` is
    # absent and keeps a data.frame when only one feature column remains.
    train_x <- training_data[, names(training_data) != "presence", drop = FALSE]

    ## Remove nans
    train_x <- replace_na_with_mean(train_x)

    ## Train model
    maxent_model_for_run <- maxnet(
        training_data$presence,
        train_x,
        maxnet.formula(training_data$presence, train_x, classes = classes),
        regmult = regmult
    )

    ## Ignore real absences for now - we eval on pseudo absences - ignore presence=2
    test_data <- test_data[test_data$presence %in% c("0", "1"), ]

    test_x <- test_data[, names(test_data) != "presence", drop = FALSE]

    # Remove nans. Note: test NAs are filled with the test set's own column
    # means, not the means computed on the training data.
    test_x <- replace_na_with_mean(test_x)

    # Test model
    test_pred <- predict(maxent_model_for_run, test_x, clamp = TRUE, type = "cloglog")
    # Threshold the predicted values at 0.5 once and reuse for both metrics.
    test_label <- ifelse(test_pred >= .5, 1, 0)

    auc <- AUC(y_pred = test_pred, y_true = test_data$presence)
    accuracy <- Accuracy(y_pred = test_label, y_true = test_data$presence)
    # NOTE(review): F1_Score is called without `positive`; confirm which class
    # MLmetrics treats as positive by default and that it is the intended one.
    f1 <- F1_Score(y_pred = test_label, y_true = test_data$presence)
    result_test <- data.frame(accuracy, f1, auc)

    return(result_test)
}

train_test_evaluate_existing_model <- function(maxent_model, test_data, plot_diagram) {
    # Score an already-fitted maxnet model on `test_data`, printing the
    # elapsed time via tic()/toc().
    #
    # Args:
    #   maxent_model: fitted model passed to predict().
    #   test_data: data.frame with a `presence` column plus feature columns;
    #     only rows whose `presence` is 0 or 1 are used for scoring.
    #   plot_diagram: currently unused; kept so existing callers keep working.
    #
    # Returns:
    #   One-row data.frame with columns accuracy, f1 and auc.
    tic()

    ## Ignore real absences for now - we eval on pseudo absences - ignore presence=2
    test_data <- test_data[test_data$presence %in% c("0", "1"), ]

    # Feature columns = everything except the label. A logical mask with
    # drop = FALSE (instead of -which(...)) still works when `presence` is
    # absent and keeps a data.frame when only one feature column remains.
    test_x <- test_data[, names(test_data) != "presence", drop = FALSE]

    # Remove nans (filled with the test set's own column means)
    test_x <- replace_na_with_mean(test_x)

    # Test model
    test_pred <- predict(maxent_model, test_x, clamp = TRUE, type = "cloglog")
    # Threshold the predicted values at 0.5 once and reuse for both metrics.
    test_label <- ifelse(test_pred >= .5, 1, 0)

    auc <- AUC(y_pred = test_pred, y_true = test_data$presence)
    accuracy <- Accuracy(y_pred = test_label, y_true = test_data$presence)
    # NOTE(review): F1_Score is called without `positive`; confirm which class
    # MLmetrics treats as positive by default and that it is the intended one.
    f1 <- F1_Score(y_pred = test_label, y_true = test_data$presence)
    result_test <- data.frame(accuracy, f1, auc)
    toc()
    return(result_test)
}

In [None]:
# Randomly sample (without replacement) a fraction of a data frame's rows.
#
# Args:
#   dataset: data.frame to sample from.
#   sample_percentage: fraction of rows to keep, e.g. 0.25 keeps
#     floor(0.25 * nrow(dataset)) rows.
#
# Returns:
#   data.frame holding the sampled rows, in sampled order.
sample_from_dataset <- function(dataset, sample_percentage) {
    num_rows <- nrow(dataset)
    smp_size <- floor(sample_percentage * num_rows)
    sampled_rows <- sample(seq_len(num_rows), size = smp_size)
    # drop = FALSE keeps a single-column dataset as a data.frame instead of
    # silently collapsing it to a bare vector.
    return(dataset[sampled_rows, , drop = FALSE])
}

# Build models

## Loading data

In [None]:
# Read the preprocessed, scaled training and validation splits from disk
train <- read.csv(
    "../data/presence_only/scaled_train.csv",
    header = TRUE
)
validation <- read.csv(
    "../data/presence_only/scaled_val.csv",
    header = TRUE
)


In [None]:
# Report the validation split's share of all rows (train + validation) and the
# fraction of validation rows with presence == 1 and presence == 0.
sprintf(
    "Validation is %s of train set. Presence points is %s, psuedo-absense/background is %s ",
    nrow(validation) / (nrow(train) + nrow(validation)),
    nrow(validation[validation$presence == "1", ]) / nrow(validation),
    nrow(validation[validation$presence == "0", ]) / nrow(validation)
)

## Hyperparam Search (**Optional**)
Use the train and validation sets to find the best hyperparameters.

#### Use the train and validation set to find good hyperparams (or a subset of train and validation if this takes too long)

In [None]:
# Only use some of the training data - the search takes too long with all of it.
# The subsample is random: call set.seed() beforehand for a repeatable search.
train_sample <- sample_from_dataset(train, 0.25)
validation_sample <- validation

print(nrow(train_sample))
print(ncol(train_sample))

# Create search grid. stringsAsFactors = FALSE keeps `classes` a plain
# character string; by default expand.grid() would turn it into a factor,
# which would then leak into `best_class`.
search_grid <- expand.grid(
  classes = c("l", "lq", "lqh", "h", "t", "p"),
  regmult = seq(0.25, 2, by = 0.25),
  stringsAsFactors = FALSE
)
nrow(search_grid)

# Start below any attainable AUC so the first valid result is always recorded
# and best_class / best_regmult are defined after the loop.
best_auc <- -Inf
# Loop through search grid and find best model and params
for (row in seq_len(nrow(search_grid))) {
  tic()

  # Get vars
  regmult <- search_grid$regmult[row]
  classes <- search_grid$classes[row]

  cat(sprintf("Running config %s out of %s - reg mult:%s classes:%s \n", row, nrow(search_grid), regmult, classes))
  result <- train_test_evaluate(train_sample, validation_sample, classes, regmult, plot_diagram = FALSE)
  print(result)
  # An NA/NaN AUC would make the comparison NA and abort the whole search,
  # so such configurations are skipped instead.
  if (!is.na(result$auc) && best_auc < result$auc) {
    best_result <- result
    best_class <- classes
    best_regmult <- regmult
    best_auc <- result$auc
    print("New best model")
    print(best_result)
  }

  flush.console()
  toc()
}

#### Use best params found on train & val sets, and use those on the test set

In [None]:
# Show the hyperparameters currently selected. best_class and best_regmult
# only exist once a cell that assigns them has been run.
print(best_class)
print(best_regmult)

# Run Best Configuration on Test Data slices

Use these params if you are not running hyperparam tuning.

In [None]:
# Default hyperparameters for the final model. Running this cell overwrites
# any best_class / best_regmult values assigned earlier in the session.
best_class <- "l"
best_regmult <- 1 

In [None]:
tic()
# Train the final model on train + validation combined
training_data <- rbind(validation, train)

# Alias of training_data (no column subsetting is applied here); the name is
# kept because the statements below refer to it.
training_data_subset_cols <- training_data

# Feature columns = everything except the label. A logical mask with
# drop = FALSE (instead of -which(...)) still works when `presence` is absent
# and keeps a data.frame when only one feature column remains.
train_x <- training_data_subset_cols[, names(training_data_subset_cols) != "presence", drop = FALSE]

## Remove nans
train_x <- replace_na_with_mean(train_x)

## Train model
maxent_model <- maxnet(
    training_data_subset_cols$presence,
    train_x,
    maxnet.formula(training_data_subset_cols$presence, train_x, classes = best_class),
    regmult = best_regmult
)
toc()

# Evaluate on one test file per seed (7, 14, ..., 700). Per-seed results are
# collected in a preallocated list and bound once at the end, instead of
# growing a data.frame with rbind() on every iteration.
seeds <- seq(7, 700, by = 7)
seed_results <- vector("list", length(seeds))
for (k in seq_along(seeds)) {
    tic()
    i <- seeds[k]
    test_filename <- sprintf("../data/test_data/test_scaled_seed_%s.csv", i)

    test_data <- read.csv(test_filename, header = TRUE)

    test_result <- train_test_evaluate_existing_model(maxent_model, test_data, plot_diagram = FALSE)
    test_result[, "seed"] <- i
    seed_results[[k]] <- test_result

    flush.console()
    toc()
}

# One row per seed with columns accuracy, f1, auc, seed
all_results <- do.call(rbind, seed_results)

print(all_results)
results_filename <- sprintf("../results/presence-only/maxent_%s_class_%s_reg.csv", best_class, best_regmult)
# Make sure the output directory exists so the long run is not lost to a
# missing-folder error at write time.
dir.create(dirname(results_filename), recursive = TRUE, showWarnings = FALSE)
write.csv(all_results, results_filename, row.names = FALSE)