# R SWAT benchmark for the new to.r.data.frame function (released in R SWAT 1.6.4)
# The CAS tables are generated in the public caslib by the script generate_sized_datasets_for_benchmarks.sas
## The Linux pmap command is run manually against the PID of this R session before and after each key program step:
### ps -ef | grep r_env
### sudo pmap \<PID\> | grep total

# PMAP = 1035304K

In [1]:
library(swat) 
library(ggplot2)
library(reshape2)
library(dplyr)

SWAT 1.6.3

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
#' Memory mapped by the current R process, as reported by `pmap`
#'
#' Runs the Linux `pmap` utility on the PID of this R session and reads the
#' figure printed on its "total" summary line (e.g. " total   1035304K").
#'
#' The figure is located by pattern instead of by fixed byte offsets, so it is
#' read correctly whatever its number of digits.
#'
#' @return A single numeric: the number that precedes the trailing "K" on the
#'   `pmap` total line, or `NA_real_` (with a warning) when `pmap` cannot be
#'   run or prints no total line.
get.ram.used.by.session.process <- function() {
  pmap_lines <- tryCatch(
    system(paste("pmap", Sys.getpid()), intern = TRUE),
    # system(intern = TRUE) raises an error when the command cannot be run;
    # treat that as "no output" and report it through the warning below.
    error = function(e) character(0)
  )

  total_line <- grep(
    "^[[:space:]]*total[[:space:]]+[0-9]+K",
    pmap_lines,
    value = TRUE
  )
  if (length(total_line) == 0L) {
    warning(
      "pmap returned no 'total' line for PID ", Sys.getpid(),
      call. = FALSE
    )
    return(NA_real_)
  }

  # The summary is the last line pmap prints.
  total_line <- total_line[length(total_line)]
  as.numeric(regmatches(total_line, regexpr("[0-9]+", total_line)))
}

In [3]:
# Run a CAS action through the `retrieve` member of `caz`.
#
# caz  : an object whose class is exactly 'CAS' or 'CASTable'.
# actn : the action name, passed positionally to `retrieve`.
# ...  : action parameters. When a name is repeated, the last value wins and
#        keeps the position of the first occurrence.
#
# Unless the caller supplies `_messageLevel` (or `_messagelevel`), it is set
# from the 'cas.message.level' option. For any other class of `caz` nothing is
# called and NULL is returned invisibly.
# NOTE(review): the CASTable branch calls `caz$retrieve` on the table object
# itself and passes `caz@conn` as first argument -- confirm this is intended.
casRetrieve_test <- function(caz, actn, ...) {
  supplied <- list(...)

  # Rebuild the argument list by name so duplicated keys collapse.
  args <- list()
  for (idx in seq_along(supplied)) {
    one <- supplied[idx]
    args[[names(one)]] <- one[[1]]
  }

  # Adds the default message level when the caller gave none.
  with_message_level <- function(a) {
    caller_set_level <- !is.null(a$`_messageLevel`) ||
      !is.null(a$`_messagelevel`)
    if (!caller_set_level) {
      a$`_messageLevel` <- as.character(getOption('cas.message.level'))
    }
    a
  }

  if (class(caz) == 'CAS') {
    call_args <- c(list(actn), with_message_level(args))
    return(do.call(caz$retrieve, call_args))
  }
  if (class(caz) == 'CASTable') {
    call_args <- c(list(caz@conn, actn), with_message_level(args))
    return(do.call(caz$retrieve, call_args))
  }
}

In [3]:
#' Convert a CAS Table to a R Data Frame (Download)
#'
#' Downloads the in-memory table that is referenced by
#' the CASTable object and stores it as a data.frame
#' in R. This function is used to download datasets from CAS.
#'
#' This benchmark copy also prints diagnostics while it runs: the process
#' memory reported by `get.ram.used.by.session.process()` at four checkpoints,
#' the size and dimensions of the result, the size, length and class of the
#' raw `table.fetch` response, and the size of every local object.
#'
#' @param ct The CASTable object to download.
#' @param obs Number of rows to download, by default 32768. Passed as the
#'   `to` bound of the `table.fetch` action (with `from = 1`).
#' @param commitSize Value passed as `maxRows` to the `table.fetch` action,
#'   by default 100000.
#'
#' @return Returns a data.frame object that contains
#'         a copy of the in-memory data, or `NULL` when the response
#'         contains no `Fetch` result.
#' @export
#' @rawRd % Copyright SAS Institute
#'
#' @examples
#' \dontrun{
#' rdf <- to.r.data.frame(CASTable)
#' }
to.r.data.frame <- function(ct, obs = 32768, commitSize = 100000) {
  # inherits() also accepts subclasses and never yields a length > 1 condition.
  if (!inherits(ct, "CASTable")) {
    stop("The first parameter must be a CASTable object")
  }

  # Local names are kept stable on purpose: the ls() report at the end of the
  # function lists them.
  tp <- gen.table.parm(ct)
  fv <- c(tp$vars, tp$computedVars)
  fv <- fv[fv != ""]
  # Drop the names listed in XcomputedVars that are not also in computedVars.
  if (sum(nchar(ct@XcomputedVars)) > 0) {
    for (Xcmp in ct@XcomputedVars) {
      if (!(Xcmp %in% ct@computedVars)) {
        fv <- fv[fv != Xcmp]
      }
    }
  }

  print(paste0("memory used 1 :", get.ram.used.by.session.process()))

  # sortby is only sent when the table parameters carry an ordering.
  if (length(tp$orderby) > 0) {
    res <- casRetrieve(ct@conn, 'table.fetch', table = tp, fetchVars = fv,
                       index = FALSE, from = 1, to = obs,
                       maxRows = commitSize, sortby = tp$orderby)
  } else {
    res <- casRetrieve(ct@conn, 'table.fetch', table = tp, fetchVars = fv,
                       index = FALSE, from = 1, to = obs,
                       maxRows = commitSize)
  }

  print(paste0("memory used 2 :", get.ram.used.by.session.process()))

  # Result pieces are looked up as 'Fetch', 'Fetch1', 'Fetch2', ...
  out <- list()
  for (i in seq_along(res$results)) {
    if (i == 1) {
      keyname <- 'Fetch'
    } else {
      keyname <- paste0('Fetch', i - 1)
    }
    # `[[` returns NULL for a missing key; `[` would always return a list and
    # the loop could never stop early.
    if (is.null(res$results[[keyname]])) {
      break
    }
    out[[i]] <- res$results[[keyname]]$df
  }

  print(paste0("memory used 3 :", get.ram.used.by.session.process()))

  out <- do.call('rbind', out)
  rownames(out) <- NULL

  print("Output object size =")
  print(object.size(out))
  print(dim(out))
  print("CASRetrieve Res object size =")
  print(object.size(res))
  print(length(res))
  print(class(res))
  print(paste0("memory used 4 :", get.ram.used.by.session.process()))
  # Size of each local object, smallest first.
  print(sort(vapply(
    ls(),
    function(x) as.numeric(object.size(get(x))),
    numeric(1)
  )))
  return(out)
}

In [5]:
# Memory checkpoint: pmap total for this R process at this point of the notebook.
get.ram.used.by.session.process()

In [4]:
# Set CAS_CLIENT_SSL_CA_LIST to the trusted-certificates PEM file
# (presumably read by the CAS client when it opens the connection -- confirm).
Sys.setenv(CAS_CLIENT_SSL_CA_LIST = "/opt/sas/viya/config/etc/SASSecurityCertificateFramework/cacerts/trustedcerts.pem")
# Create the CAS connection object used by every later cell (host, port 5570).
conn <- CAS('frasepviya35smp.cloud.com', 5570)

NOTE: Connecting to CAS and generating CAS action functions for loaded
      action sets...
NOTE: To generate the functions with signatures (for tab completion), set 
      options(cas.gen.function.sig=TRUE).


In [7]:
# Session options for the benchmark: metrics=TRUE turns on the per-action
# metrics NOTEs (real time, cpu time, memory), timeout is set to 1800 and
# 'casuser' becomes the active caslib.
cas.sessionProp.setSessOpt(conn, metrics=TRUE, timeout=1800, caslib='casuser')

NOTE: 'CASUSER(viyademo01)' is now the active caslib.
NOTE: Action 'sessionProp.setSessOpt' used (Total process time):
NOTE:       real time               0.000492 seconds
NOTE:       cpu time                0.000466 seconds (94.72%)
NOTE:       total nodes             1 (16 cores)
NOTE:       total memory            125.75G
NOTE:       memory                  280.44K (0.00%)


In [5]:
# Show the column metadata (name, type, lengths, format) of public.test_data2.
cas.table.columnInfo(conn,table=list(caslib="public", name="test_data2"))

Column,Label,ID,Type,RawLength,FormattedLength,Format,NFL,NFD
nums1,,1,double,8,12,,0,0
nums2,,2,double,8,12,,0,0
nums3,,3,double,8,12,,0,0
nums4,,4,double,8,12,,0,0
nums5,,5,double,8,12,,0,0
chars1,,6,char,30,30,,0,0
chars2,,7,char,30,30,,0,0
chars3,,8,char,30,30,,0,0
chars4,,9,char,30,30,,0,0
chars5,,10,char,30,30,,0,0


In [8]:
# Memory checkpoint: pmap total for this R process before the download test.
get.ram.used.by.session.process()

In [6]:
# First timed download: the two Sys.time() calls bracket the run.
Sys.time()
# Reference the CAS table test_data3 in the public caslib.
tbl <- defCasTable(conn, tablename="test_data3", caslib = "public")
# obs is passed as the `to` bound of table.fetch and commitSize as its maxRows
# (see to.r.data.frame above).
tblR <- to.r.data.frame(tbl,obs = 1000000000000,commitSize=1000)
Sys.time()

[1] "2022-02-14 17:34:59 UTC"

[1] "memory used 1 :1150956"
[1] "memory used 2 :10853368"
[1] "memory used 3 :11223120"
[1] "Output object size ="
0 bytes
NULL
[1] "CASRetrieve Res object size ="
8430383080 bytes
[1] 5
[1] "list"
[1] "memory used 4 :11231680"
       out commitSize          i        obs    keyname         fv         tp 
         0         56         56         56        120        792       1464 
        ct        res 
      4656 8430383080 


[1] "2022-02-14 17:37:11 UTC"

In [14]:
# Memory checkpoint: pmap total for this R process after the download test.
get.ram.used.by.session.process()

In [15]:
# Run the session.endSession action on the connection used by the benchmark.
cas.session.endSession(conn)

NOTE: Executing action 'session.endSession'.
NOTE: Action 'session.endSession' used (Total process time):
NOTE:       real time               0.000349 seconds
NOTE:       cpu time                0.000330 seconds (94.56%)
NOTE:       total nodes             1 (16 cores)
NOTE:       total memory            125.75G
NOTE:       memory                  227.16K (0.00%)


In [22]:
# Check the class of the object returned by the first download.
class(tblR)

In [9]:
# Inspect the structure (row count, column types, first values) of tblR.
str(tblR)

'data.frame':	1000000 obs. of  11 variables:
Formal class 'data.frame' [package ""] with 4 slots
  ..@ .Data    :List of 11
  .. ..$ : num  0.468 0.361 0.975 0.171 0.291 ...
  .. ..$ : num  0.15 0.158 0.909 0.187 0.475 ...
  .. ..$ : num  0.798 0.376 0.774 0.101 0.453 ...
  .. ..$ : num  0.6163 0.0405 0.1006 0.488 0.7491 ...
  .. ..$ : num  0.291 0.587 0.501 0.907 0.256 ...
  .. ..$ : chr  "WLBYy5d4ux2x8gkasJU26Qzns3dLlw" "AGjQjBAMA4" "GCSqGSIb3DQEBAQUAA4IBDwAw" "GjQjBAMA4GA1UdDwEB/wQEAwIBBjAP" ...
  .. ..$ : chr  "DAxMjgxMjAwMDBaMFcxCzAJBgNVBAY" "aMFcxCzAJBgNVBA" "zELMAkGA1UEBhM" "c1STE4U6G7weNLWLBYy5d4ux2x8gka" ...
  .. ..$ : chr  "rjsok6Vjk4bwY8iGlbKk3Fp1S4bInM" "z9SzANBgkqhkiG9w0BAQUFA" "jU/43d" "b2JhbFNpZ2" ...
  .. ..$ : chr  "/EqsLmVEQS98GPR4mdmzxzdzxtIK+6" "GA1UdDwEB/wQEAwIBBjAPBgNVHRMBA" "ugZitVtbNV4FpWi6cgKOOvyJBNPc1S" "IH" ...
  .. ..$ : chr  "2/7LqivjTFKDK1fPxsnCwrvQmeU79r" "gNVBAoTEEdsb2JhbFNpZ24gbnYtc2E" "TFKDK1fPxsnCwrvQmeU79" "KOOvyJBNPc1STE4U6G7weNLWLBYy5d" ...
  .. ..

In [10]:
# Second timed download of the same `tbl`, with the same obs and commitSize,
# stored separately as tblR2 so both results can be compared.
Sys.time()
tblR2 <- to.r.data.frame(tbl,obs = 1000000000000,commitSize=1000)
Sys.time()

[1] "2022-02-14 11:51:44 UTC"

[1] "memory used 1 :1955760"


NOTE: Executing action 'table.fetch'.
NOTE: Action 'table.fetch' used (Total process time):
NOTE:       real time               10.402604 seconds
NOTE:       cpu time                6.024946 seconds (57.92%)
NOTE:       total nodes             1 (16 cores)
NOTE:       total memory            125.75G
NOTE:       memory                  32.34M (0.03%)


[1] "memory used 2 :2954540"
[1] "memory used 3 :2954540"
[1] "Output object size ="
97484472 bytes
[1] 1000000      11
[1] "CASRetrieve Res object size ="
842984288 bytes
[1] 5
[1] "list"
[1] "memory used 4 :3291580"
commitSize          i        obs    keyname         fv         tp         ct 
        56         56         56        120        792       1464       4656 
       out        res 
  97484472  842984288 


[1] "2022-02-14 11:52:01 UTC"

In [11]:
# Inspect the structure of the second download for comparison with tblR.
str(tblR2)

'data.frame':	1000000 obs. of  11 variables:
Formal class 'data.frame' [package ""] with 4 slots
  ..@ .Data    :List of 11
  .. ..$ : num  0.468 0.361 0.975 0.171 0.291 ...
  .. ..$ : num  0.15 0.158 0.909 0.187 0.475 ...
  .. ..$ : num  0.798 0.376 0.774 0.101 0.453 ...
  .. ..$ : num  0.6163 0.0405 0.1006 0.488 0.7491 ...
  .. ..$ : num  0.291 0.587 0.501 0.907 0.256 ...
  .. ..$ : chr  "WLBYy5d4ux2x8gkasJU26Qzns3dLlw" "AGjQjBAMA4" "GCSqGSIb3DQEBAQUAA4IBDwAw" "GjQjBAMA4GA1UdDwEB/wQEAwIBBjAP" ...
  .. ..$ : chr  "DAxMjgxMjAwMDBaMFcxCzAJBgNVBAY" "aMFcxCzAJBgNVBA" "zELMAkGA1UEBhM" "c1STE4U6G7weNLWLBYy5d4ux2x8gka" ...
  .. ..$ : chr  "rjsok6Vjk4bwY8iGlbKk3Fp1S4bInM" "z9SzANBgkqhkiG9w0BAQUFA" "jU/43d" "b2JhbFNpZ2" ...
  .. ..$ : chr  "/EqsLmVEQS98GPR4mdmzxzdzxtIK+6" "GA1UdDwEB/wQEAwIBBjAPBgNVHRMBA" "ugZitVtbNV4FpWi6cgKOOvyJBNPc1S" "IH" ...
  .. ..$ : chr  "2/7LqivjTFKDK1fPxsnCwrvQmeU79r" "gNVBAoTEEdsb2JhbFNpZ24gbnYtc2E" "TFKDK1fPxsnCwrvQmeU79" "KOOvyJBNPc1STE4U6G7weNLWLBYy5d" ...
  .. ..

In [9]:
# object.size() of every object listed by ls() in this session, sorted in
# increasing order.
sort(sapply(ls(),function(x){object.size(get(x))}))

In [14]:
# Load arsenal and compare the two downloaded data frames with comparedf()
# to check that both runs returned the same variables and observations.
library(arsenal)
comparedf(tblR,tblR2)

“Setting class(x) to NULL;   result will no longer be an S4 object”

Compare Object

Function Call: 
comparedf(x = tblR, y = tblR2)

Shared: 11 non-by variables and 1000000 observations.
Not shared: 0 variables and 0 observations.

Differences found in 0/11 variables compared.
0 variables compared have non-identical attributes.

In [15]:
# In-memory size of the first download.
object.size(tblR)

194183952 bytes

In [16]:
# In-memory size of the second download, to compare with tblR.
object.size(tblR2)

97484472 bytes

## END OF NOTEBOOK