# Pseudo-Absence Generation

In this notebook, pseudo-absence data are generated using the techniques described in [this paper](https://www.sciencedirect.com/science/article/abs/pii/S030438001500215X).


In [None]:
# install packages

# install.packages('comprehenr')
# install.packages("raster", dependencies=TRUE)
# install.packages('stringr')
# install.packages('ncdf4')
# install.packages('dplyr')

In [None]:
library(raster)
library(ncdf4)
library(rgdal)
library(comprehenr)
library(glue)
library(stringr)
library(mopa)
library(dplyr)
library(parallel)
library(MASS)

In [None]:
# Variable names read from the NASA files; each one is passed as `varname`
# when the files are stacked in aggregateSM_NASA().
env_profilers <- list("SoilMoi0_10cm_inst")

In [None]:
# Fetch the GADM level-0 polygons for one or more countries.
#
# Args:
#   country: a character string, a character vector, or a list of character
#            strings, each identifying one country for getData("GADM", ...).
#
# Returns:
#   The object returned by getData() for a single country, or the result of
#   combining the per-country objects with raster::bind() for several.
#
# Raises:
#   An error when `country` is neither a list nor a character vector, or when
#   it is empty.
get_countrySPDF <- function(country)
{
    if (!is.list(country) && !is.character(country)){
        stop("`country` must be a character vector or a list of character strings",
             call. = FALSE)
    }
    if (length(country) == 0){
        stop("`country` must name at least one country", call. = FALSE)
    }

    # One download per country; `[[1]]` unwraps list elements and is a no-op
    # for plain strings.
    polygons <- lapply(as.list(country), function(entry){
        getData("GADM", country=entry[[1]], level=0)
    })

    # Reduce() returns the single element untouched when only one country was
    # requested, so no bind() call happens in that case.
    Reduce(raster::bind, polygons)
}


# Clip a raster to the given country (or countries): first crop it to the
# extent of the country polygons, then mask it with the polygons themselves.
#
# Args:
#   baseRaster: raster object to clip.
#   country:    country specification understood by get_countrySPDF().
#
# Returns:
#   The cropped and masked raster.
cropCountry <- function(baseRaster, country)
{
    boundary <- get_countrySPDF(country)
    cropped_to_extent <- crop(baseRaster, extent(boundary))
    mask(cropped_to_extent, boundary)
}


# Average the NASA files of one month into three dekadal (roughly 10-day) layers
# per environmental variable.
#
# Files are looked up as
#   {basePath}/GLDAS_NOAH025_3H.A{year}{month, zero-padded}{day, zero-padded}*.nc4
# and grouped by day of month: 1-10 (dekad1), 11-20 (dekad2), 21-31 (dekad3).
#
# Args:
#   year, month:   calendar year and month of the files to aggregate.
#   basePath:      directory that contains the .nc4 files.
#   env_profilers: variable names, each passed as `varname` to stack().
#
# Returns:
#   A stack with one mean layer per variable and dekad, named
#   "<variable>_dekad1", "<variable>_dekad2", "<variable>_dekad3"
#   (all dekad1 layers first, then dekad2, then dekad3).
#
# Raises:
#   An error when no file matches one of the dekads.
aggregateSM_NASA <- function(year, month, basePath, env_profilers){
    base_name <- glue("{basePath}/GLDAS_NOAH025_3H.A{year}{str_pad(month, width=2, pad='0')}")
    dekad_days <- list(dekad1 = 1:10, dekad2 = 11:20, dekad3 = 21:31)

    # Mean layers of every variable for a single dekad.
    dekad_means <- function(dekad){
        files <- unlist(lapply(dekad_days[[dekad]], function(day){
            Sys.glob(glue("{base_name}{sprintf('%02d', day)}*.nc4"))
        }))
        # Fail with a readable message instead of an opaque stack() error.
        if (length(files) == 0){
            stop(sprintf("no files matching '%s' found for %s", base_name, dekad),
                 call. = FALSE)
        }
        layers <- stack(lapply(env_profilers, function(env_variable){
            calc(stack(files, varname=env_variable), mean)
        }))
        names(layers) <- paste0(env_profilers, "_", dekad)
        layers
    }

    # lapply() over a bare character vector yields an unnamed list, so the
    # stacks are handed to stack() positionally.
    do.call(stack, lapply(names(dekad_days), dekad_means))
}
                          
# Tag every observation with the country polygon it falls in and keep only
# the observations located inside the requested countries.
#
# Args:
#   country: country specification understood by get_countrySPDF(): a
#            character string/vector or a list of character strings.
#   data:    data.frame with coordinate columns `X` and `Y`.
#
# Returns:
#   The rows of `data` whose point lies inside one of the requested
#   countries, with an added `country` column taken from the polygons'
#   NAME_0 attribute.
subsetCountry <- function(country, data){
    countrySPDF <- get_countrySPDF(country)
    locs <- data.frame(x=data$X, y=data$Y)
    coordinates(locs) <- c("x","y")
    projection(locs) <- CRS("+proj=longlat +init=epsg:4326")
    projection(countrySPDF) <- CRS("+proj=longlat +init=epsg:4326")
    countryID <- over(locs, countrySPDF)
    data_subset <- data
    data_subset['country'] <- countryID$NAME_0

    # `%in%` never yields NA, so points outside every polygon (NA country) are
    # dropped instead of coming back as all-NA rows, and several requested
    # countries are matched correctly whether given as a list or a vector.
    inside <- data_subset$country %in% unlist(country)
    data_subset[inside, ]
}

# Turn generated points into the final table layout and attach date columns.
#
# Args:
#   pa_generation: data.frame with columns x, y, v and method.
#   data:          observations providing the year, month and day columns.
#
# Returns:
#   A data.frame with columns x, y, presence, method, year, month, day.
#   Rows with presence != 0 receive `data$day`; rows with presence == 0
#   receive a day drawn at random from 1..28.
finalize_data2 <- function(pa_generation, data){
    final_data <- data.frame(
        x = pa_generation$x,
        y = pa_generation$y,
        presence = pa_generation$v,
        method = pa_generation$method
    )
    final_data['year'] <- data$year
    final_data['month'] <- data$month
    final_data['day'] <- 0

    is_absence <- final_data$presence == 0
    final_data[!is_absence, 'day'] <- data$day

    # Absence rows have no observation date, so give them a random day.
    n_absence <- length(final_data[is_absence, 'day'])
    if (n_absence > 0){
        final_data[is_absence, 'day'] <- sampleInt(28, n_absence, replace=TRUE)
    }
    final_data
}
                             
# Read raster values at a set of point locations using bilinear
# interpolation.
#
# Args:
#   locs:        coordinates accepted by SpatialPoints().
#   raster_data: raster object to sample.
#
# Returns:
#   The values returned by extract() for the given points.
extract_values <- function(locs, raster_data){
    pts <- SpatialPoints(locs)
    extract(raster_data, pts, method='bilinear')
}
                             
# Keep only the data-frame elements of a list, e.g. to drop NULLs and error
# objects from a list of per-chunk results.
#
# Args:
#   datachunks: list of arbitrary objects.
#
# Returns:
#   An unnamed list holding, in their original order, the elements of
#   `datachunks` that are data frames.
clean_chunks <- function(datachunks){
    # inherits() also accepts data-frame subclasses (e.g. tibbles), for which
    # `class(x) == 'data.frame'` is a length > 1 condition inside `if`.
    is_frame <- function(chunk) inherits(chunk, "data.frame")
    unname(Filter(is_frame, datachunks))
}

In [None]:
# Load the clay, sand and silt rasters (0-5cm and 5-15cm mean layers) found
# in `basePath` and combine them into a single stack.
#
# Args:
#   basePath: directory containing the six *_mean.tif files.
#
# Returns:
#   A stack whose layers are, in order: clay 0-5cm, clay 5-15cm, sand 0-5cm,
#   sand 5-15cm, silt 0-5cm, silt 5-15cm.
get_soil_profile <- function(basePath) {
    tif_files <- c(
        'clay_0-5cm_mean.tif', 'clay_5-15cm_mean.tif',
        'sand_0-5cm_mean.tif', 'sand_5-15cm_mean.tif',
        'silt_0-5cm_mean.tif', 'silt_5-15cm_mean.tif'
    )
    layers <- lapply(tif_files, function(tif_file) {
        raster(glue('{basePath}/{tif_file}'))
    })
    stack(layers)
}

In [None]:
# Load the observations and derive calendar columns from STARTDATE
# (expected format: "YYYY/MM/DD hh:mm:ss").
dataset <- read.csv(file = 'Hoppers.csv')
date <- as.POSIXct(dataset[['STARTDATE']], format = "%Y/%m/%d %H:%M:%S")

# `yearmonth` stays a string (e.g. "200311"); the other parts are numeric.
dataset$yearmonth <- format(date, format="%Y%m")
dataset$year <- as.numeric(format(date, format="%Y"))
dataset$month <- as.numeric(format(date, format="%m"))
dataset$day <- as.numeric(format(date, format="%d"))

head(dataset)

In [None]:
# Temporal split: 2000-2014 for training/validation, 2015 onwards for testing.
# which() drops rows whose year is NA (e.g. an unparseable STARTDATE); a bare
# logical index would turn each of them into an all-NA row in both splits.
train_val <- dataset[which((dataset$year >= 2000) & (dataset$year < 2015)),]
test <- dataset[which(dataset$year >= 2015), ]

In [None]:
# Sanity check: years present in the train/validation split.
unique(train_val$year)

In [None]:
# Sanity check: years present in the test split.
unique(test$year)

In [None]:
# Names of all countries that the ccodes() table assigns to Africa.
countries <- ccodes()
african_countries <- countries$NAME[countries$continent == 'Africa']
print(african_countries)

In [None]:
# `country` and `soil_profile` are also read as globals by
# nasa_pa_generation(), so they must exist before it is called.
country <- as.list(african_countries)
# Soil rasters are loaded from a directory relative to the working directory.
soil_profile <- get_soil_profile('isric_soilprofile_datasets')
# Keep only the observations that fall inside the selected countries.
train_val_subset <- subsetCountry(country, train_val)
test_subset <- subsetCountry(country, test)

In [None]:
# Sanity check: countries represented in the train/validation subset.
unique(train_val_subset$country)

In [None]:
# Sanity check: countries represented in the test subset.
unique(test_subset$country)

In [None]:
# Build the presence / pseudo-absence table for one year-month.
#
# Reads these notebook-level objects: `basePath`, `env_profilers`, `country`,
# `country_data` and `soil_profile`; calls cropCountry(), aggregateSM_NASA(),
# finalize_data2() and extract_values().
#
# Records of `country_data` with LOCPRESENT == 1 are used as presences and
# those with LOCPRESENT == 2 as true absences. Four sampling methods are run
# in turn: 'random', 'ep_random', 'random+' and 'ep_random+'. When the month
# has at most `no_generation_limit` presences, each method simply contributes
# the presences themselves (v = 1) and no pseudo-absences are generated.
#
# Args:
#   yearmonth: number of the form YYYYMM (e.g. 200311).
#
# Returns:
#   A data.frame with the columns produced by finalize_data2() (x, y,
#   presence, method, year, month, day) followed by the values extracted from
#   the environmental rasters and from `soil_profile`. NULL when the
#   environmental rasters cannot be loaded, when the month has no presence
#   record, or when none of the sampling methods succeeded.
nasa_pa_generation <- function(yearmonth){
    print(yearmonth)
    # Months with at most this many presences are passed through unchanged.
    no_generation_limit <- 400
    buffer <- 0.083*3

    month <- yearmonth %% 100
    year <- yearmonth %/% 100

    # A failure to load the rasters is reported and ends this month's run.
    # (`next` is not valid here: the function body is not a loop.)
    env_data <- tryCatch(
        cropCountry(aggregateSM_NASA(year, month, basePath, env_profilers), country),
        error = function(e){
            print(paste('Error: ', conditionMessage(e)))
            NULL
        }
    )
    if (is.null(env_data)) return(NULL)

    data_subset <- country_data[(country_data$yearmonth == as.character(yearmonth)),]
    presence <- data_subset[(data_subset$LOCPRESENT == 1),]
    true_absence <- data_subset[(data_subset$LOCPRESENT == 2),]
    geo_locs <- data.frame(x=presence$X, y=presence$Y)
    geo_locs_len <- nrow(geo_locs)

    # Without presences nothing can be generated; finalize_data2() would fail
    # on the empty table, so stop here with an explicit "no result".
    if (geo_locs_len < 1) return(NULL)

    bg <- backgroundGrid(env_data)

    # Presences only, labelled with the method they stand in for.
    copy_presences <- function(method){
        out <- geo_locs
        out['v'] <- 1
        out['method'] <- method
        out
    }

    # Background restricted by one-class SVM profiling on the mean of the
    # three dekadal soil-moisture layers.
    profiled_background <- function(){
        dekad_layers <- c('SoilMoi0_10cm_inst_dekad1', 'SoilMoi0_10cm_inst_dekad2', 'SoilMoi0_10cm_inst_dekad3')
        profiled <- OCSVMprofiling(xy = geo_locs,
                                   varstack = calc(raster::subset(env_data, dekad_layers), mean),
                                   background = bg$xy)
        profiled$absence
    }

    # Background split into distance bands around the presences.
    radius_background <- function(background){
        backgroundRadius(xy = geo_locs, background = background,
                         start = buffer, by = buffer,
                         unit = "decimal degrees")
    }

    background_for <- function(method){
        switch(method,
               'random' = bg$xy,
               'random+' = radius_background(bg$xy),
               'ep_random' = profiled_background(),
               'ep_random+' = radius_background(profiled_background()))
    }

    # Sample pseudo-absences from `background` and flatten the first
    # realisation into an x/y/v table.
    sample_pseudo_absences <- function(background, method){
        generated <- pseudoAbsences(xy = geo_locs, background = background,
                                    exclusion.buffer = buffer,
                                    prevalence = 0.5, kmeans = FALSE)
        points <- generated$species1$PA01[[1]]
        out <- data.frame(x=points$x, y=points$y, v=points$v)
        out['method'] <- method
        out
    }

    success <- FALSE
    dataset <- list()
    # Best effort: if a method fails, the results of the methods that already
    # completed are kept and the remaining methods are skipped.
    tryCatch({
        for (method in c('random', "ep_random", 'random+', "ep_random+")){
            if (geo_locs_len <= no_generation_limit){
                method_data <- copy_presences(method)
            } else {
                method_data <- sample_pseudo_absences(background_for(method), method)
            }
            dataset[[length(dataset) + 1]] <- method_data
            success <- TRUE
        }
    }, error = function(e){
        print(paste('Error: ', conditionMessage(e)))
    })
    if (!success) return(NULL)

    generated_data <- finalize_data2(dplyr::bind_rows(dataset), presence)

    true_absence_geo_locs <- data.frame(x=true_absence$X, y=true_absence$Y)
    if (nrow(true_absence_geo_locs) > 0){
        true_absence_geo_locs['v'] <- 2
        true_absence_geo_locs['method'] <- 'true_absence'
        true_absence <- finalize_data2(true_absence_geo_locs, true_absence)
        final_data <- dplyr::bind_rows(generated_data, true_absence)
    } else {
        final_data <- generated_data
    }

    # Attach the environmental and soil values at every point.
    raster_data <- extract_values(data.frame(x=final_data$x, y=final_data$y), env_data)
    final_data <- cbind(final_data, raster_data)
    pf <- extract(soil_profile, data.frame(x=final_data$x, y=final_data$y))
    final_data <- dplyr::bind_cols(final_data, data.frame(pf))
    return(final_data)
}

In [None]:
# Directory holding the NASA NOAH .nc4 files; read as a global by
# nasa_pa_generation().
basePath <- "/mnt/disks/nasa/NASA"

### Generate All Data

### Test with a single yearmonth

In [None]:
# nasa_pa_generation() reads `country_data` from the global environment, so
# it has to be set before every run.
country_data <- train_val_subset
# Smoke test on a single month (November 2003).
# NOTE(review): `sample` shadows base::sample() for the rest of the session.
sample <- nasa_pa_generation(200311)

In [None]:
# Inspect the table generated for the single test month.
sample

In [None]:
# Check which generation methods produced rows for the test month.
unique(sample$method)

### Generate for test

In [None]:
# nasa_pa_generation() reads `country_data` from the global environment.
country_data <- test_subset
yearmonths <- as.numeric(unique(test_subset$yearmonth))

# One worker call per year-month; the function defined in this notebook is
# `nasa_pa_generation` (there is no `nasa_pa_generation2`).
system.time(
  dataframe_chunks1 <- mclapply(yearmonths, nasa_pa_generation, mc.cores = 7)
)

# Months that returned nothing or failed are dropped before combining.
test_gen <- dplyr::bind_rows(clean_chunks(dataframe_chunks1))

In [None]:
# Save the generated test set (write.csv also writes the row names by default).
write.csv(test_gen,'test_gen.csv')

### Generate for train

In [None]:
# nasa_pa_generation() reads `country_data` from the global environment.
country_data <- train_val_subset
yearmonths <- as.numeric(unique(train_val_subset$yearmonth))

# One worker call per year-month; the function defined in this notebook is
# `nasa_pa_generation` (there is no `nasa_pa_generation2`).
system.time(
  dataframe_chunks2 <- mclapply(yearmonths, nasa_pa_generation, mc.cores = 7)
)
# Months that returned nothing or failed are dropped before combining.
train_val_gen <- dplyr::bind_rows(clean_chunks(dataframe_chunks2))

In [None]:
# Save the generated train/validation set (write.csv also writes the row names by default).
write.csv(train_val_gen,'train_val_gen.csv')