Commit

Merge pull request #20 from ibm-watson-data-lab/R_ibmos2spark_COS_Support

R Cloud Object Storage Support
bassel-zeidan committed Aug 29, 2017
2 parents 677b840 + c5eafb8 commit f857cca
Showing 9 changed files with 163 additions and 159 deletions.
8 changes: 4 additions & 4 deletions r/sparkr/DESCRIPTION
@@ -1,13 +1,13 @@
Package: ibmos2sparkR
Title: Loads Object Store data into Softlayer and Bluemix
-Version: 0.0.7
+Version: 0.0.8
Authors@R: person("Jim", "Crozier", email = "jim.crozier@ibm.com",
    role = c("aut", "cre"))
-Description: Loads data from Object Store in Softlayer and Bluemix
+Description: Loads data from Object Store in Softlayer and Bluemix and ObjectStorage
Depends:
    R (>= 3.1.0)
License: Apache
LazyData: true
RoxygenNote: 5.0.1
Imports:
    SparkR
2 changes: 2 additions & 0 deletions r/sparkr/NAMESPACE
@@ -1,6 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(CloudObjectStorage)
export(bluemix)
export(softlayer)
exportClasses(CloudObjectStorage)
exportClasses(bluemix)
exportClasses(softlayer)
101 changes: 78 additions & 23 deletions r/sparkr/R/osconfig.R
@@ -9,24 +9,24 @@ swifturl = function(name, container_name, object_name){


#' sparkcontext is a SparkContext object.
#'
#' name is a string that identifies this configuration. You can
#' use any string you like. This allows you to create
#' multiple configurations to different Object Storage accounts.
#' auth_url, username and password are string credentials for your
#' Softlayer Object Store
#' @export softlayer
#' @exportClass softlayer

softlayer <- setRefClass("softlayer",
    fields=list(name="character", container_name="character", object_name="character",
                sparkcontext='jobj', auth_url="character",
                tenant = "character", username="character", password="character"),
    methods=list(initialize =
        function( sparkcontext, name, auth_url, tenant, username, password,public=FALSE,
                  swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){

            .self$name = name
            prefix = paste("fs.swift2d.service" , name, sep =".")
            hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -41,21 +41,21 @@ softlayer <- setRefClass("softlayer",
            SparkR:::callJMethod(hConf, "set", paste(prefix, "use.get.auth", sep='.'), "true")
            invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "location-aware", sep='.'), FALSE))
            SparkR:::callJMethod(hConf, "set", paste(prefix, "password", sep='.'), password)
        },

        url = function(container_name, object_name){
            return(swifturl(name, container_name, object_name))}
    )
)

#' sparkcontext: a SparkContext object.
#'
#' credentials: a list with the following required keys:
#'
#' auth_url
#' project_id (or projectId)
#' user_id (or userId)
@@ -73,28 +73,28 @@ softlayer <- setRefClass("softlayer",
#' instances, the values for these credentials can be obtained
#' by clicking on the 'insert to code' link just below a data
#' source.
#' @export bluemix
#' @exportClass bluemix

bluemix <- setRefClass("bluemix",
    fields=list(name="character", credentials = "list",
                sparkcontext='jobj', public = "character"),
    methods=list(initialize =
        function(..., sparkcontext, name=NULL, credentials,
                 public=FALSE,swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){

            callSuper(...,credentials=credentials)

            if ( is.null(name)) name <<- credentials["name"][[1]]

            user_id = try( credentials['user_id'][[1]])
            if(class(user_id)=="try-error") user_id = credentials['userId'][[1]]

            tenant = try( credentials['project_id'][[1]])
            if(class(tenant)=="try-error") tenant = credentials['projectId'][[1]]

            .self$name = name
            prefix = paste("fs.swift2d.service" , name, sep =".")
            hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -108,8 +108,63 @@ bluemix <- setRefClass("bluemix",
            invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "public", sep='.'), public))
            #invisible(SparkR:::callJMethod(hConf, "setInt", paste(prefix, "http.port", sep='.'), 8080))
        },

        url = function( container_name, object_name){
            return(swifturl(name, container_name, object_name))}
    )
)

#' CloudObjectStorage is a class that is designed for IBM Cloud Object Storage (COS).
#' It sets up the Hadoop configuration for COS and provides the final file URL.
#'
#' sparkContext: a SparkContext object.
#'
#' credentials: a list with the following required keys:
#'   endpoint
#'   accessKey
#'   secretKey
#'
#' configurationName: a string that identifies the configuration that has
#'   been set. When using this from an IBM Spark service instance that
#'   is configured to connect to particular Bluemix object store
#'   instances, the values for these credentials can be obtained
#'   by clicking on the 'insert to code' link just below a data
#'   source.
#' @export CloudObjectStorage
#' @exportClass CloudObjectStorage
CloudObjectStorage <- setRefClass("CloudObjectStorage",
    fields=list(configName="character"),
    methods=list(
        initialize = function(..., sparkContext, credentials, configurationName){
            if (is.null(credentials["endpoint"][[1]])) {
                stop("Attribute endpoint in credentials is missing!")
            }

            if (is.null(credentials["accessKey"][[1]])) {
                stop("Attribute accessKey in credentials is missing!")
            }

            if (is.null(credentials["secretKey"][[1]])) {
                stop("Attribute secretKey in credentials is missing!")
            }

            .self$configName = configurationName
            prefix = "fs.s3d.service"
            hConf = SparkR:::callJMethod(sparkContext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", paste(prefix, "endpoint", sep='.'), credentials['endpoint'][[1]])
            SparkR:::callJMethod(hConf, "set", paste(prefix, "access.key", sep='.'), credentials['accessKey'][[1]])
            SparkR:::callJMethod(hConf, "set", paste(prefix, "secret.key", sep='.'), credentials['secretKey'][[1]])
        },

        getConfigName = function() {
            return (.self$configName)
        },

        url = function(bucketName, objectName){
            return(paste("s3d://", bucketName, ".service/", objectName, sep = ""))
        }
    )
)
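
For orientation, here is a minimal sketch of how the new class is meant to be driven from a SparkR session. It assumes `sc` is the SparkContext provided by the notebook environment, and the endpoint and key values are placeholders:

    library(ibmos2sparkR)

    # Placeholder credentials; real values come from the COS service credentials.
    creds <- list(
        endpoint = "https://s3-api.us-geo.objectstorage.softlayer.net",  # assumed example endpoint
        accessKey = "my-access-key",
        secretKey = "my-secret-key"
    )

    # Registers the endpoint and keys under the "fs.s3d.service" Hadoop prefix.
    cos <- CloudObjectStorage(sparkContext = sc, credentials = creds,
                              configurationName = "myCosConfig")

    # Builds "s3d://my-bucket.service/my_data.csv", ready for read.df.
    cos$url("my-bucket", "my_data.csv")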
70 changes: 47 additions & 23 deletions r/sparkr/README.md
@@ -1,23 +1,23 @@
# ibmos2sparkR

The package sets Spark Hadoop configurations for connecting to
IBM Bluemix Object Storage and Softlayer Account Object Storage instances. This package uses the new [stocator](https://github.com/SparkTC/stocator) driver, which implements the `swift2d` protocol, and is available
on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience).

Using the `stocator` driver connects your Spark executor nodes directly
to your data in object storage.
This is an optimized, high-performance method to connect Spark to your data. All IBM Apache Spark kernels
are instantiated with the `stocator` driver in the Spark kernel's classpath.
You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator)
and adding it to your local Apache Spark kernel's classpath.

This package expects a SparkContext instantiated by SparkR. It has been tested to work with
the IBM Spark service in R notebooks on IBM DSX, though it should work with other Spark installations
that utilize the [swift2d/stocator](https://github.com/SparkTC/stocator) protocol.
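
Outside DSX you would create that SparkContext yourself before using the package. A rough sketch for a local Spark 1.6-style session (in DSX notebooks `sc` is already provided, and the stocator jar must be on the local classpath):

    library(SparkR)
    sc <- sparkR.init(master = "local[*]", appName = "ibmos2sparkR-test")  # hypothetical app name
    sqlContext <- sparkRSQL.init(sc)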


## Installation

    library(devtools)
    devtools::install_url("https://github.com/ibm-cds-labs/ibmos2spark/archive/<version>.zip", subdir= "r/sparkr/")
@@ -27,15 +27,39 @@
where `version` should be a tagged release, such as `0.0.7`. (If you're daring,
## Usage

The usage of this package depends on *from where* your Object Storage instance was created. This package
is intended to connect to IBM's Object Storage instances obtained from Bluemix or Data Science Experience
(DSX) or from a separate account on IBM Softlayer. It also supports IBM Cloud Object Storage (COS). The
instructions below show how to connect to either type of instance.

The connection setup is essentially the same. But the difference for you is how you deliver the
credentials. If your Object Storage was created with Bluemix/DSX, with a few clicks on the side-tab
within a DSX Jupyter notebook, you can obtain your account credentials in the form of a list.
If your Object Storage was created with a Softlayer account, each part of the credentials will
be found as text that you can copy and paste into the example code below.

### Cloud Object Storage

    library(ibmos2sparkR)
    configurationName = "bluemixO123"

    # In DSX notebooks, the "insert to code" will insert this credentials list for you
    credentials <- list(
        accessKey = "123",
        secretKey = "123",
        endpoint = "https://s3-api.objectstorage.....net/"
    )

    cos <- CloudObjectStorage(sparkContext=sc, credentials=credentials, configurationName=configurationName)
    bucketName <- "bucketName"
    fileName <- "test.csv"
    url <- cos$url(bucketName, fileName)

    invisible(sparkR.session(appName = "SparkSession R"))

    df.data.1 <- read.df(url,
                         source = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat",
                         header = "true")
    head(df.data.1)
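
If your Spark service runs Spark 2.0 or later, the built-in csv source should also work in place of the fully qualified class name above (a shorter, untested equivalent):

    df.data.1 <- read.df(url, source = "csv", header = "true")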


### Bluemix / Data Science Experience

@@ -45,11 +45,11 @@ be found as text that you can copy and paste into the example code below.
    # In DSX notebooks, the "insert to code" will insert this credentials list for you
    creds = list(
        auth_url="https://identity.open.softlayer.com",
        region="dallas",
        project_id = "XXXXX",
        user_id="XXXXX",
        password="XXXXX")

    bmconfig = bluemix(sparkcontext=sc, name=configurationname, credentials = creds)

    container = "my_container"
@@ -67,24 +67,24 @@ be found as text that you can copy and paste into the example code below.
    library(ibmos2sparkR)
    configurationname = "softlayerOScon" # can be any name you like (allows for multiple configurations)

    slconfig = softlayer(sparkcontext=sc,
                         name=configurationname,
                         auth_url="https://identity.open.softlayer.com",
                         tenant = "XXXXX",
                         username="XXXXX",
                         password="XXXXX"
    )

    container = "my_container"
    object = "my_data.csv"

    data <- read.df(sqlContext, slconfig$url(container, object), source = "com.databricks.spark.csv", header = "true")

    # OR, for Spark >= 2.0.0

    data = read.df(slconfig$url(container, object), source = "com.databricks.spark.csv", header = "true")
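
For reference, `slconfig$url(container, object)` just assembles the URI that stocator resolves; a sketch of the equivalent string, assuming the `swift2d://<container>.<configname>/<object>` form used by the swift2d protocol:

    # equivalent to slconfig$url(container, object) for the configuration above (illustrative only)
    url <- paste0("swift2d://my_container.softlayerOScon/my_data.csv")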

## License

Copyright 2016 IBM Cloud Data Services

26 changes: 26 additions & 0 deletions r/sparkr/man/CloudObjectStorage-class.Rd


20 changes: 3 additions & 17 deletions r/sparkr/man/bluemix-class.Rd

