# Pseudo-Absence Generation

In this notebook, pseudo-absence data are generated using the techniques described in this [paper](https://www.sciencedirect.com/science/article/abs/pii/S030438001500215X).


In [1]:
# install packages

# install.packages('comprehenr')
# install.packages("raster", dependencies=TRUE)
# install.packages('stringr')
# install.packages('ncdf4')
# install.packages('dplyr')

In [1]:
library(raster)
library(ncdf4)
library(rgdal)
library(comprehenr)
library(glue)
library(stringr)
library(mopa)
library(dplyr)
library(parallel)
library(MASS)

Loading required package: sp

Please note that rgdal will be retired by the end of 2023,
plan transition to sf/stars/terra functions using GDAL and PROJ
at your earliest convenience.

rgdal: version: 1.5-27, (SVN revision 1148)
Geospatial Data Abstraction Library extensions to R successfully loaded
Loaded GDAL runtime: GDAL 2.4.0, released 2018/12/14
Path to GDAL shared files: /usr/share/gdal
GDAL binary built with GEOS: TRUE 
Loaded PROJ runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
Path to PROJ shared files: (autodetected)
Linking to sp version:1.4-5


Attaching package: ‘glue’


The following object is masked from ‘package:raster’:

    trim



Attaching package: ‘dplyr’


The following object is masked from ‘package:glue’:

    collapse


The following objects are masked from ‘package:raster’:

    intersect, select, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, 

In [2]:
# Variables to read from every NetCDF file; each entry is passed as `varname`
# to stack() inside aggregateSM_NASA(). Only the uncommented entries are active;
# the commented ones are kept as a catalogue of the other available variables.
# NOTE: nasa_pa_generation() refers to the 'SoilMoi0_10cm_inst_dekad*' layers by
# name for environmental profiling, so 'SoilMoi0_10cm_inst' must stay enabled.
env_profilers <- list(
#     'Swnet_tavg', 
#     'Lwnet_tavg', 
#     'Qle_tavg', 
#     'Qh_tavg', 
#     'Qg_tavg', 
#     'Snowf_tavg', 
#     'Rainf_tavg', 
#     'Evap_tavg', 
#     'Qs_acc', 
#     'Qsb_acc', 
#     'Qsm_acc', 
#     'AvgSurfT_inst', 
#     'Albedo_inst', 
#     'SWE_inst', 
#     'SnowDepth_inst', 
    'SoilMoi0_10cm_inst'
#     'SoilMoi10_40cm_inst', 
#     'SoilTMP0_10cm_inst', 
#     'SoilTMP10_40cm_inst' 
#     'SoilTMP40_100cm_inst', 
#     'SoilTMP100_200cm_inst', 
#     'PotEvap_tavg', 
#     'ECanop_tavg', 
#     'Tveg_tavg', 
#     'ESoil_tavg', 
#     'RootMoist_inst', 
#     'CanopInt_inst', 
#     'Wind_f_inst', 
#     'Rainf_f_tavg', 
#     'Tair_f_inst',
#     'Qair_f_inst', 
#     'Psurf_f_inst'
#     'SWdown_f_tavg', 
#     'LWdown_f_tavg'
)

In [3]:
# Sanity check: number of active environmental variables.
length(env_profilers)

In [4]:
# Download GADM level-0 polygons for one or more countries and merge them into
# a single spatial object.
#
# Args:
#   country: a country identifier accepted by raster::getData("GADM", ...),
#            given either as a character vector or as a list of such values.
#            For list elements only the first entry of each element is used.
# Returns:
#   The polygons of the first country, with the polygons of every further
#   country appended via raster::bind().
# Raises:
#   An error when `country` is empty or is neither a list nor a character vector.
get_countrySPDF <- function(country)
{
    if (!(is.list(country) || is.character(country)) || length(country) == 0){
        stop("`country` must be a non-empty character vector or list of countries",
             call. = FALSE)
    }

    boundaries <- lapply(as.list(country), function(country_){
        getData("GADM", country=country_[[1]], level=0)
    })

    # Reduce() hands back the single download untouched when only one country
    # was requested, and chains raster::bind() left-to-right otherwise.
    countrySPDF <- Reduce(raster::bind, boundaries)
    return(countrySPDF)
}


# Restrict a raster to the given country or countries.
#
# Args:
#   baseRaster: raster object to clip.
#   country:    country specification forwarded to get_countrySPDF().
# Returns:
#   `baseRaster` cropped to the extent of the country polygons and then masked
#   with those polygons.
cropCountry <- function(baseRaster, country)
{
    boundary <- get_countrySPDF(country)
    within_extent <- crop(baseRaster, extent(boundary))
    mask(within_extent, boundary)
}


# Average the requested variables over the three dekads of one month.
#
# Files are located with the pattern
#   {basePath}/GLDAS_NOAH025_3H.A<year><2-digit month><2-digit day>*.nc4
# and grouped by day of month: 1-10 (dekad1), 11-20 (dekad2), 21-31 (dekad3).
#
# Args:
#   year, month:   calendar year and month to aggregate (month is zero-padded).
#   basePath:      folder containing the .nc4 files.
#   env_profilers: names of the variables to read (passed to stack() as
#                  `varname`).
# Returns:
#   A raster stack with one mean layer per variable and dekad, named
#   "<variable>_dekad1", "<variable>_dekad2" and "<variable>_dekad3", ordered
#   dekad by dekad.
# Raises:
#   An error naming the dekad when no file matches for it.
aggregateSM_NASA <- function(year, month, basePath, env_profilers){
    base_name <- glue("{basePath}/GLDAS_NOAH025_3H.A{year}{str_pad(month, width=2, pad='0')}")
    dekad_days <- list(dekad1 = 1:10, dekad2 = 11:20, dekad3 = 21:31)

    # Mean of every requested variable over all files of a single dekad.
    aggregate_dekad <- function(dekad_name){
        files <- unlist(lapply(dekad_days[[dekad_name]], function(day){
            Sys.glob(glue("{base_name}{sprintf('%02d', day)}*.nc4"))
        }))
        if (length(files) == 0){
            stop(paste0("No files found for ", dekad_name, " matching ", base_name, "*.nc4"),
                 call. = FALSE)
        }
        layers <- unname(lapply(env_profilers, function(env_variable){
            calc(stack(files, varname=env_variable), mean)
        }))
        dekad_stack <- stack(layers)
        names(dekad_stack) <- paste0(env_profilers, "_", dekad_name)
        dekad_stack
    }

    dekad1_stacks <- aggregate_dekad("dekad1")
    dekad2_stacks <- aggregate_dekad("dekad2")
    dekad3_stacks <- aggregate_dekad("dekad3")

    sm <- stack(dekad1_stacks, dekad2_stacks, dekad3_stacks)
    return(sm)
}
                          
# Keep only the observations that fall inside the requested countries.
#
# Args:
#   country: country specification forwarded to get_countrySPDF(). The final
#            filter compares against the polygons' NAME_0 attribute, so the
#            values must be spelled the way NAME_0 spells them.
#   data:    data frame with point coordinates in columns `X` and `Y`.
# Returns:
#   The matching rows of `data` with an extra `country` column holding the
#   NAME_0 value of the polygon each point falls in.
subsetCountry <- function(country, data){
    countrySPDF <- get_countrySPDF(country)
    locs <- data.frame(x=data$X, y=data$Y)
    coordinates(locs) <- c("x","y")
    projection(locs) <- CRS("+proj=longlat +init=epsg:4326")
    projection(countrySPDF) <- CRS("+proj=longlat +init=epsg:4326")
    countryID <- over(locs, countrySPDF)
    data_subset <- data
    data_subset['country'] <- countryID$NAME_0

    # Points outside every polygon get an NA country. `%in%` never returns NA,
    # so those rows are dropped; `==` would have produced NA row indices and
    # injected all-NA rows into the result.
    keep <- data_subset$country %in% unlist(country)
    data_subset <- data_subset[keep,]

    return(data_subset)
}

# Turn generated points into the final table layout and attach dates.
#
# Args:
#   pa_generation: data frame with columns `x`, `y`, `v` and `method`.
#   data:          observation rows supplying `year`, `month` and `day`.
# Returns:
#   A data frame with columns x, y, presence, method, year, month, day.
#   Rows whose presence is non-zero take their day from `data$day`; rows whose
#   presence is 0 get a day drawn with sampleInt(28, ..., replace=TRUE).
finalize_data2 <- function(pa_generation, data){
    final_data <- data.frame(
        x = pa_generation$x,
        y = pa_generation$y,
        presence = pa_generation$v,
        method = pa_generation$method
    )
    final_data['year'] <- data$year
    final_data['month'] <- data$month
    final_data['day'] <- 0

    is_absence <- final_data$presence == 0
    final_data[final_data$presence != 0, 'day'] <- data$day

    n_absence <- length(final_data[is_absence, 'day'])
    if (n_absence > 0){
        final_data[is_absence, 'day'] <- sampleInt(28, n_absence, replace=TRUE)
    }
    final_data
}
                             
# Sample raster values at point locations.
#
# Args:
#   locs:        coordinates accepted by SpatialPoints().
#   raster_data: raster object to sample from.
# Returns:
#   The result of extract() at those points with method='bilinear'.
extract_values <- function(locs, raster_data){
    sample_points <- SpatialPoints(locs)
    extract(raster_data, sample_points, method='bilinear')
}
                             
# Keep only the data-frame elements of a list of results.
#
# Anything that is not a data frame (NULL, error objects, other values) is
# dropped, which makes the output safe to hand to dplyr::bind_rows().
#
# Args:
#   datachunks: list of results, e.g. the return value of mclapply().
# Returns:
#   An unnamed list holding the data-frame elements in their original order.
clean_chunks <- function(datachunks){
    # inherits() also accepts data-frame subclasses such as tibbles, whose
    # class vector has several entries; comparing class() with `==` would give
    # a length > 1 condition there.
    is_frame <- function(chunk) inherits(chunk, 'data.frame')
    unname(Filter(is_frame, as.list(datachunks)))
}

In [5]:
# Load the soil rasters (clay, sand and silt; 0-5cm and 5-15cm files) found in
# `basePath` and combine them into a single stack.
#
# Args:
#   basePath: folder containing the six *_mean.tif files listed below.
# Returns:
#   A raster stack with the layers in the order clay, sand, silt, each with
#   the 0-5cm file first and the 5-15cm file second.
get_soil_profile <- function(basePath) {
    path_templates <- c(
        '{basePath}/clay_0-5cm_mean.tif',
        '{basePath}/clay_5-15cm_mean.tif',
        '{basePath}/sand_0-5cm_mean.tif',
        '{basePath}/sand_5-15cm_mean.tif',
        '{basePath}/silt_0-5cm_mean.tif',
        '{basePath}/silt_5-15cm_mean.tif'
    )
    layers <- lapply(path_templates, function(path_template){
        raster(glue(path_template))
    })
    stack(layers)
}

In [6]:
# Load the observations and derive calendar fields from STARTDATE.
dataset <- read.csv(file = 'Hoppers.csv')
# The timestamps carry a "+00" offset, so parse (and therefore format) them in
# UTC; relying on the session time zone would make the derived fields depend on
# the machine running the notebook.
date <- as.POSIXct(dataset[['STARTDATE']], format = "%Y/%m/%d %H:%M:%S", tz = "UTC")
dataset[['yearmonth']] <- format(date, format="%Y%m")
dataset[['year']] <- as.numeric(format(date, format="%Y"))
dataset[['month']] <- as.numeric(format(date, format="%m"))
dataset[['day']] <- as.numeric(format(date, format="%d"))
head(dataset)

Unnamed: 0_level_0,X,Y,OBJECTID,STARTDATE,TmSTARTDAT,FINISHDATE,TmFINISHDA,EXACTDATE,PARTMONTH,LOCNAME,⋯,CTLAPPVEHI,CTLAPPAIR,CTLAPPMECH,CTLAPPUNK,CTLESTKILL,CAT,yearmonth,year,month,day
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,⋯,<int>,<int>,<int>,<int>,<int>,<fct>,<chr>,<dbl>,<dbl>,<dbl>
1,37.333333,19.31667,1,1985/12/30 00:00:00+00,12:00,1985/12/30 00:00:00+00,12:00,No,Late,Khor Hambokeib,⋯,0,0,0,0,0,Hopper,198512,1985,12,30
2,37.35,19.23333,2,1985/12/30 00:00:00+00,12:00,1985/12/30 00:00:00+00,12:00,No,Late,Khor Handub,⋯,0,0,0,0,0,Hopper,198512,1985,12,30
3,-14.917778,16.95167,3,1985/10/18 00:00:00+00,12:00,1985/10/18 00:00:00+00,12:00,No,Middle,,⋯,0,0,0,0,0,Hopper,198510,1985,10,18
4,-0.38165,18.66083,4,1985/10/15 00:00:00+00,12:00,1985/10/15 00:00:00+00,12:00,Yes,,,⋯,0,0,0,0,0,Hopper,198510,1985,10,15
5,1.522778,20.93833,5,1985/11/06 00:00:00+00,12:00,1985/11/06 00:00:00+00,12:00,No,Early,,⋯,0,0,0,0,0,Hopper,198511,1985,11,6
6,4.216111,18.635,6,1985/11/15 00:00:00+00,12:00,1985/11/15 00:00:00+00,12:00,Yes,,,⋯,0,0,0,0,0,Hopper,198511,1985,11,15


In [7]:
# Chronological split: 2000-2014 for training/validation, 2015 onwards for testing.
train_val <- dataset[with(dataset, (year >= 2000) & (year < 2015)),]
test <- dataset[with(dataset, year >= 2015), ]

In [8]:
# Years present in the training/validation split.
unique(train_val$year)

In [9]:
# Years present in the test split.
unique(test$year)

In [10]:
# Names of all countries that ccodes() lists under the continent 'Africa'.
countries <- ccodes()
african_countries <- countries$NAME[countries$continent == 'Africa']
print(african_countries)

 [1] "Algeria"                          "Angola"                          
 [3] "Benin"                            "Botswana"                        
 [5] "Burkina Faso"                     "Burundi"                         
 [7] "Cameroon"                         "Cape Verde"                      
 [9] "Central African Republic"         "Chad"                            
[11] "Comoros"                          "Côte d'Ivoire"                   
[13] "Democratic Republic of the Congo" "Djibouti"                        
[15] "Egypt"                            "Equatorial Guinea"               
[17] "Eritrea"                          "Ethiopia"                        
[19] "French Southern Territories"      "Gabon"                           
[21] "Gambia"                           "Ghana"                           
[23] "Guinea-Bissau"                    "Guinea"                          
[25] "Kenya"                            "Lesotho"                         
[27] "Liberia"           

In [11]:
# `country` and `soil_profile` are read as globals by nasa_pa_generation().
country <- as.list(african_countries)
soil_profile <- get_soil_profile('isric_soilprofile_datasets')
# Restrict both splits to observations located inside the selected countries.
train_val_subset <- subsetCountry(country, train_val)
test_subset <- subsetCountry(country, test)

In [12]:
# Countries that actually occur in the training/validation subset.
unique(train_val_subset$country)

In [13]:
# Countries that actually occur in the test subset.
unique(test_subset$country)

In [14]:
# Build the presence / pseudo-absence table for one month of observations.
#
# Reads these globals: `basePath`, `env_profilers`, `country`, `country_data`
# and `soil_profile`.
#
# For each of the four methods ('random', 'ep_random', 'random+', 'ep_random+')
# the presence points of the month are taken from `country_data`
# (LOCPRESENT == 1). When the month has more than `no_generation_limit`
# presence points, pseudo-absences are sampled with mopa; otherwise only the
# presence points are emitted for that method. Rows with LOCPRESENT == 2 are
# appended with method 'true_absence'. Environmental and soil raster values are
# then attached to every row.
#
# Args:
#   yearmonth: numeric of the form YYYYMM, e.g. 200311.
# Returns:
#   A data frame with columns x, y, presence, method, year, month, day followed
#   by the extracted raster columns, or NULL when the environmental rasters
#   cannot be built, the first method fails, or the month has no usable rows.
#   If a later method fails, the methods completed before it are still returned.
nasa_pa_generation <- function(yearmonth){
    print(yearmonth)
    # pseudo-absences are only generated when a yearmonth has more than this many presence points
    no_generation_limit <- 400
    buffer_deg <- 0.083*3
    # Layers averaged to form the environmental profile for the 'ep_*' methods.
    profiling_layers <- c('SoilMoi0_10cm_inst_dekad1', 'SoilMoi0_10cm_inst_dekad2', 'SoilMoi0_10cm_inst_dekad3')

    month <- yearmonth %% 100
    year <- yearmonth %/% 100

    env_data <- tryCatch(
        cropCountry(aggregateSM_NASA(year, month, basePath, env_profilers), country),
        error = function(e){
            print(paste('Error: ', e))
            NULL
        }
    )
    # No environmental rasters for this month: nothing can be generated.
    if (is.null(env_data)) return(NULL)

    # which() drops NA comparisons instead of turning them into all-NA rows.
    data_subset <- country_data[which(country_data$yearmonth == as.character(yearmonth)),]
    presence <- data_subset[which(data_subset$LOCPRESENT == 1),]
    true_absence <- data_subset[which(data_subset$LOCPRESENT == 2),]
    geo_locs <- data.frame(x=presence$X, y=presence$Y)
    geo_locs_len <- nrow(geo_locs)
    bg <- backgroundGrid(env_data)

    # Points (presences plus, when enough data exist, pseudo-absences) for one method.
    generate_for_method <- function(method){
        if (geo_locs_len <= no_generation_limit){
            generated <- geo_locs
            generated['v'] <- 1
            generated['method'] <- method
            return(generated)
        }

        background <- bg$xy
        if (method %in% c("ep_random", "ep_random+")){
            # restrict the background using environmental profiling
            bg_profiled <- OCSVMprofiling(xy = geo_locs,
                            varstack = calc(raster::subset(env_data, profiling_layers), mean),
                            background = bg$xy)
            background <- bg_profiled$absence
        }
        if (method %in% c('random+', "ep_random+")){
            background <- backgroundRadius(xy = geo_locs, background = background,
                            start = buffer_deg, by = buffer_deg,
                            unit = "decimal degrees")
        }

        pa <- pseudoAbsences(xy = geo_locs, background = background,
                             exclusion.buffer = buffer_deg,
                             prevalence = 0.5, kmeans = FALSE)
        realization <- pa$species1$PA01[[1]]
        generated <- data.frame(x=realization$x, y=realization$y, v=realization$v)
        generated['method'] <- method
        generated
    }

    success <- FALSE
    dataset <- list()
    tryCatch({
        for (method in c('random', "ep_random", 'random+', "ep_random+")){
            if (geo_locs_len >= 1) {
                dataset[[length(dataset) + 1]] <- generate_for_method(method)
            }
            success <- TRUE
        }
    }, error = function(e){
        print(paste('Error: ', e))
    })
    if (!success) return(NULL)

    generated_data <- dplyr::bind_rows(clean_chunks(dataset))

    # Either part may be missing (no presences, or no true absences); each is
    # finalized only when it has rows because finalize_data2() needs some.
    parts <- list()
    if (nrow(generated_data) > 0){
        parts[[length(parts) + 1]] <- finalize_data2(generated_data, presence)
    }
    if (nrow(true_absence) > 0){
        true_absence_geo_locs <- data.frame(x=true_absence$X, y=true_absence$Y)
        true_absence_geo_locs['v'] <- 2
        true_absence_geo_locs['method'] <- 'true_absence'
        parts[[length(parts) + 1]] <- finalize_data2(true_absence_geo_locs, true_absence)
    }
    if (length(parts) == 0) return(NULL)
    final_data <- dplyr::bind_rows(parts)

    raster_data <- extract_values(data.frame(x=final_data$x, y=final_data$y), env_data)
    final_data <- cbind(final_data, raster_data)
    pf <- extract(soil_profile, data.frame(x=final_data$x, y=final_data$y))
    final_data <- dplyr::bind_cols(final_data, data.frame(pf))
    return(final_data)
}

In [15]:
# Folder searched by aggregateSM_NASA() for the GLDAS_NOAH025_3H.A*.nc4 files;
# read as a global by nasa_pa_generation().
basePath <- '/mnt/disks/nasa/NASA'

### Generate All Data

In [16]:
# Year-months (YYYYMM) that occur in the test subset.
as.numeric(unique(test_subset$yearmonth))

In [17]:
# Year-months (YYYYMM) that occur in the training/validation subset.
as.numeric(unique(train_val_subset$yearmonth))

In [18]:
# Number of observations per year-month in the training/validation subset.
table(train_val_subset$yearmonth)


200001 200002 200003 200004 200007 200008 200009 200010 200011 200012 200101 
    17     14     42      8     13     13     21    114    233     15     13 
200102 200105 200106 200107 200108 200109 200110 200111 200112 200201 200202 
    12      1      4      2      3      2     93     29     11      2      2 
200206 200207 200208 200209 200210 200211 200212 200301 200302 200303 200304 
     9      3      1     24     32     74     19      3      9      7     10 
200305 200306 200307 200308 200309 200310 200311 200312 200401 200402 200403 
     2      7      7     66    187    210    469    594    572    695    457 
200404 200405 200406 200407 200408 200409 200410 200411 200412 200501 200502 
  1251   2067   1305    165    176    542    545    368    124     24     32 
200503 200504 200505 200506 200507 200508 200509 200510 200511 200512 200601 
    18     92    114     37     34     16     39    178    444    299    114 
200602 200603 200604 200605 200606 200607 200608 200609 200610 

### Test with a single yearmonth

In [19]:
# nasa_pa_generation() reads `country_data` as a global, so it has to be set
# before the call.
country_data <- train_val_subset
# NOTE(review): `sample` shadows base::sample() for the rest of the session.
sample <- nasa_pa_generation(200311)

[1] 200311


[2021-11-26 09:03:01] Generating pseudo-absences for species 1

:::[2021-11-26 09:03:01] Realization 1

[2021-11-26 09:03:05] Generating pseudo-absences for species 1

:::[2021-11-26 09:03:05] Realization 1



[1] "creating background point-grids for species 1 out of 1"


[2021-11-26 09:03:20] Generating pseudo-absences for species 1

:::[2021-11-26 09:03:20] Realization 1

Background km30 is too small for sampling and will be ignored



[1] "creating background point-grids for species 1 out of 1"


[2021-11-26 09:03:36] Generating pseudo-absences for species 1

:::[2021-11-26 09:03:36] Realization 1

Background km30 is too small for sampling and will be ignored

Background km60 is too small for sampling and will be ignored



In [20]:
# Inspect the generated table for the single test month.
sample

x,y,presence,method,year,month,day,SoilMoi0_10cm_inst_dekad1,SoilMoi0_10cm_inst_dekad2,SoilMoi0_10cm_inst_dekad3,clay_0.5cm_mean,clay_5.15cm_mean,sand_0.5cm_mean,sand_5.15cm_mean,silt_0.5cm_mean,silt_5.15cm_mean
<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-12.7986111,18.53167,1,random,2003,11,1,8.952612,7.770489,7.451513,251,257,561,554,188,188
3.0352778,18.31944,1,random,2003,11,27,10.163652,9.715378,9.678137,228,239,559,550,213,212
3.1372222,18.15139,1,random,2003,11,28,10.239193,9.764887,9.660942,229,242,541,520,230,238
3.0925000,18.24028,1,random,2003,11,28,10.211503,9.747480,9.675087,231,240,563,553,207,207
-13.2069444,18.40611,1,random,2003,11,22,8.932872,7.770216,7.582310,231,245,578,563,191,193
-13.1730556,18.43667,1,random,2003,11,22,8.938970,7.777158,7.554843,261,280,540,524,199,196
-13.3230556,18.38778,1,random,2003,11,22,8.918937,7.750374,7.570520,213,219,585,583,201,198
-12.8500000,18.48139,1,random,2003,11,8,8.965901,7.792199,7.528761,249,250,550,543,201,207
-12.8391667,18.42028,1,random,2003,11,7,8.962402,7.791485,7.602509,241,249,559,547,200,203
-15.3494444,19.24917,1,random,2003,11,8,15.215944,12.305619,10.736181,188,195,573,569,239,235


In [21]:
# Methods for which rows were generated.
unique(sample$method)

In [22]:
# Rows produced by the 'random' method.
sample[sample$method == 'random',]

Unnamed: 0_level_0,x,y,presence,method,year,month,day,SoilMoi0_10cm_inst_dekad1,SoilMoi0_10cm_inst_dekad2,SoilMoi0_10cm_inst_dekad3,clay_0.5cm_mean,clay_5.15cm_mean,sand_0.5cm_mean,sand_5.15cm_mean,silt_0.5cm_mean,silt_5.15cm_mean
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,-12.7986111,18.53167,1,random,2003,11,1,8.952612,7.770489,7.451513,251,257,561,554,188,188
2,3.0352778,18.31944,1,random,2003,11,27,10.163652,9.715378,9.678137,228,239,559,550,213,212
3,3.1372222,18.15139,1,random,2003,11,28,10.239193,9.764887,9.660942,229,242,541,520,230,238
4,3.0925000,18.24028,1,random,2003,11,28,10.211503,9.747480,9.675087,231,240,563,553,207,207
5,-13.2069444,18.40611,1,random,2003,11,22,8.932872,7.770216,7.582310,231,245,578,563,191,193
6,-13.1730556,18.43667,1,random,2003,11,22,8.938970,7.777158,7.554843,261,280,540,524,199,196
7,-13.3230556,18.38778,1,random,2003,11,22,8.918937,7.750374,7.570520,213,219,585,583,201,198
8,-12.8500000,18.48139,1,random,2003,11,8,8.965901,7.792199,7.528761,249,250,550,543,201,207
9,-12.8391667,18.42028,1,random,2003,11,7,8.962402,7.791485,7.602509,241,249,559,547,200,203
10,-15.3494444,19.24917,1,random,2003,11,8,15.215944,12.305619,10.736181,188,195,573,569,239,235


In [23]:
# Rows produced by the 'random+' method.
sample[sample$method == 'random+',]

Unnamed: 0_level_0,x,y,presence,method,year,month,day,SoilMoi0_10cm_inst_dekad1,SoilMoi0_10cm_inst_dekad2,SoilMoi0_10cm_inst_dekad3,clay_0.5cm_mean,clay_5.15cm_mean,sand_0.5cm_mean,sand_5.15cm_mean,silt_0.5cm_mean,silt_5.15cm_mean
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1877,-12.7986111,18.53167,1,random+,2003,11,1,8.952612,7.770489,7.451513,251,257,561,554,188,188
1878,3.0352778,18.31944,1,random+,2003,11,27,10.163652,9.715378,9.678137,228,239,559,550,213,212
1879,3.1372222,18.15139,1,random+,2003,11,28,10.239193,9.764887,9.660942,229,242,541,520,230,238
1880,3.0925000,18.24028,1,random+,2003,11,28,10.211503,9.747480,9.675087,231,240,563,553,207,207
1881,-13.2069444,18.40611,1,random+,2003,11,22,8.932872,7.770216,7.582310,231,245,578,563,191,193
1882,-13.1730556,18.43667,1,random+,2003,11,22,8.938970,7.777158,7.554843,261,280,540,524,199,196
1883,-13.3230556,18.38778,1,random+,2003,11,22,8.918937,7.750374,7.570520,213,219,585,583,201,198
1884,-12.8500000,18.48139,1,random+,2003,11,8,8.965901,7.792199,7.528761,249,250,550,543,201,207
1885,-12.8391667,18.42028,1,random+,2003,11,7,8.962402,7.791485,7.602509,241,249,559,547,200,203
1886,-15.3494444,19.24917,1,random+,2003,11,8,15.215944,12.305619,10.736181,188,195,573,569,239,235


In [24]:
# Rows produced by the 'ep_random' method.
sample[sample$method == 'ep_random',]

Unnamed: 0_level_0,x,y,presence,method,year,month,day,SoilMoi0_10cm_inst_dekad1,SoilMoi0_10cm_inst_dekad2,SoilMoi0_10cm_inst_dekad3,clay_0.5cm_mean,clay_5.15cm_mean,sand_0.5cm_mean,sand_5.15cm_mean,silt_0.5cm_mean,silt_5.15cm_mean
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
939,-12.7986111,18.53167,1,ep_random,2003,11,1,8.952612,7.770489,7.451513,251,257,561,554,188,188
940,3.0352778,18.31944,1,ep_random,2003,11,27,10.163652,9.715378,9.678137,228,239,559,550,213,212
941,3.1372222,18.15139,1,ep_random,2003,11,28,10.239193,9.764887,9.660942,229,242,541,520,230,238
942,3.0925000,18.24028,1,ep_random,2003,11,28,10.211503,9.747480,9.675087,231,240,563,553,207,207
943,-13.2069444,18.40611,1,ep_random,2003,11,22,8.932872,7.770216,7.582310,231,245,578,563,191,193
944,-13.1730556,18.43667,1,ep_random,2003,11,22,8.938970,7.777158,7.554843,261,280,540,524,199,196
945,-13.3230556,18.38778,1,ep_random,2003,11,22,8.918937,7.750374,7.570520,213,219,585,583,201,198
946,-12.8500000,18.48139,1,ep_random,2003,11,8,8.965901,7.792199,7.528761,249,250,550,543,201,207
947,-12.8391667,18.42028,1,ep_random,2003,11,7,8.962402,7.791485,7.602509,241,249,559,547,200,203
948,-15.3494444,19.24917,1,ep_random,2003,11,8,15.215944,12.305619,10.736181,188,195,573,569,239,235


In [25]:
# Rows produced by the 'ep_random+' method.
sample[sample$method == 'ep_random+',]

Unnamed: 0_level_0,x,y,presence,method,year,month,day,SoilMoi0_10cm_inst_dekad1,SoilMoi0_10cm_inst_dekad2,SoilMoi0_10cm_inst_dekad3,clay_0.5cm_mean,clay_5.15cm_mean,sand_0.5cm_mean,sand_5.15cm_mean,silt_0.5cm_mean,silt_5.15cm_mean
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2815,-12.7986111,18.53167,1,ep_random+,2003,11,1,8.952612,7.770489,7.451513,251,257,561,554,188,188
2816,3.0352778,18.31944,1,ep_random+,2003,11,27,10.163652,9.715378,9.678137,228,239,559,550,213,212
2817,3.1372222,18.15139,1,ep_random+,2003,11,28,10.239193,9.764887,9.660942,229,242,541,520,230,238
2818,3.0925000,18.24028,1,ep_random+,2003,11,28,10.211503,9.747480,9.675087,231,240,563,553,207,207
2819,-13.2069444,18.40611,1,ep_random+,2003,11,22,8.932872,7.770216,7.582310,231,245,578,563,191,193
2820,-13.1730556,18.43667,1,ep_random+,2003,11,22,8.938970,7.777158,7.554843,261,280,540,524,199,196
2821,-13.3230556,18.38778,1,ep_random+,2003,11,22,8.918937,7.750374,7.570520,213,219,585,583,201,198
2822,-12.8500000,18.48139,1,ep_random+,2003,11,8,8.965901,7.792199,7.528761,249,250,550,543,201,207
2823,-12.8391667,18.42028,1,ep_random+,2003,11,7,8.962402,7.791485,7.602509,241,249,559,547,200,203
2824,-15.3494444,19.24917,1,ep_random+,2003,11,8,15.215944,12.305619,10.736181,188,195,573,569,239,235


### Generate for test

In [24]:
# nasa_pa_generation() reads `country_data` as a global; point it at the test
# subset before forking the workers.
country_data <- test_subset
yearmonths <- as.numeric(unique(test_subset$yearmonth))

# One job per year-month. The generator defined in this notebook is
# nasa_pa_generation(); no `nasa_pa_generation2` exists in this file.
system.time(
  dataframe_chunks1 <- mclapply(yearmonths, nasa_pa_generation, mc.cores = 7)
)

# Jobs that did not return a data frame are dropped by clean_chunks().
test_gen <- dplyr::bind_rows(clean_chunks(dataframe_chunks1))

“scheduled cores 3, 4 encountered errors in user code, all values of the jobs will be affected”


    user   system  elapsed 
6204.790 2109.601 1498.163 

In [25]:
# write.csv() does not create missing folders, so make sure the output folder
# exists before saving the result of the long-running generation step.
dir.create('v4_2', showWarnings = FALSE, recursive = TRUE)
write.csv(test_gen,'v4_2/test_gen_v4_2.csv')

### Generate for train

In [26]:
# nasa_pa_generation() reads `country_data` as a global; point it at the
# training/validation subset before forking the workers.
country_data <- train_val_subset
yearmonths <- as.numeric(unique(train_val_subset$yearmonth))

# One job per year-month. The generator defined in this notebook is
# nasa_pa_generation(); no `nasa_pa_generation2` exists in this file.
system.time(
  dataframe_chunks2 <- mclapply(yearmonths, nasa_pa_generation, mc.cores = 7)
)
# Jobs that did not return a data frame are dropped by clean_chunks().
train_val_gen <- dplyr::bind_rows(clean_chunks(dataframe_chunks2))

     user    system   elapsed 
15965.411  6760.839  3981.705 

In [27]:
# write.csv() does not create missing folders, so make sure the output folder
# exists before saving the result of the long-running generation step.
dir.create('v4_2', showWarnings = FALSE, recursive = TRUE)
write.csv(train_val_gen,'v4_2/train_val_gen_v4_2.csv')