Commit

Merge pull request #20 from ibm-watson-data-lab/R_ibmos2spark_COS_Support

R Cloud Object Storage Support
bassel-zeidan committed Aug 29, 2017
2 parents 677b840 + c5eafb8 commit f857cca
Showing 9 changed files with 163 additions and 159 deletions.
8 changes: 4 additions & 4 deletions r/sparkr/DESCRIPTION
@@ -1,13 +1,13 @@
Package: ibmos2sparkR
Title: Loads Object Store data into Softlayer and Bluemix
-Version: 0.0.7
+Version: 0.0.8
Authors@R: person("Jim", "Crozier", email = "jim.crozier@ibm.com",
    role = c("aut", "cre"))
-Description: Loads data from Object Store in Softlayer and Bluemix
+Description: Loads data from Object Store in Softlayer and Bluemix and ObjectStorage
Depends:
    R (>= 3.1.0)
License: Apache
LazyData: true
RoxygenNote: 5.0.1
Imports:
    SparkR
2 changes: 2 additions & 0 deletions r/sparkr/NAMESPACE
@@ -1,6 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(CloudObjectStorage)
export(bluemix)
export(softlayer)
exportClasses(CloudObjectStorage)
exportClasses(bluemix)
exportClasses(softlayer)
101 changes: 78 additions & 23 deletions r/sparkr/R/osconfig.R
@@ -9,24 +9,24 @@ swifturl = function(name, container_name, object_name){


#' sparkcontext is a SparkContext object.
#'
#' name is a string that identifies this configuration. You can
#' use any string you like. This allows you to create
#' multiple configurations to different Object Storage accounts.
#' auth_url, username and password are string credentials for your
#' Softlayer Object Store
#' @export softlayer
#' @exportClass softlayer

softlayer <- setRefClass("softlayer",
    fields=list(name="character", container_name="character", object_name="character",
                sparkcontext='jobj', auth_url="character",
                tenant = "character", username="character", password="character"),
    methods=list(initialize =
        function( sparkcontext, name, auth_url, tenant, username, password,public=FALSE,
                  swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){

            .self$name = name
            prefix = paste("fs.swift2d.service" , name, sep =".")
            hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -41,21 +41,21 @@ softlayer <- setRefClass("softlayer",
            SparkR:::callJMethod(hConf, "set", paste(prefix, "use.get.auth", sep='.'), "true")
            invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "location-aware", sep='.'), FALSE))
            SparkR:::callJMethod(hConf, "set", paste(prefix, "password", sep='.'), password)
        },

        url = function(container_name, object_name){
            return(swifturl(name, container_name, object_name))}
    )
)

#' sparkcontext: a SparkContext object.
#'
#' credentials: a list with the following required keys:
#'
#' auth_url
#' project_id (or projectId)
#' user_id (or userId)
@@ -73,28 +73,28 @@ softlayer <- setRefClass("softlayer",
#' instances, the values for these credentials can be obtained
#' by clicking on the 'insert to code' link just below a data
#' source.
#' @export bluemix
#' @exportClass bluemix

bluemix <- setRefClass("bluemix",
    fields=list(name="character", credentials = "list",
                sparkcontext='jobj', public = "character"),
    methods=list(initialize =
        function(..., sparkcontext, name=NULL, credentials,
                 public=FALSE,swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){

            callSuper(...,credentials=credentials)

            if ( is.null(name)) name <<- credentials["name"][[1]]

            user_id = try( credentials['user_id'][[1]])
            if(class(user_id)=="try-error") user_id = credentials['userId'][[1]]

            tenant = try( credentials['project_id'][[1]])
            if(class(tenant)=="try-error") tenant = credentials['projectId'][[1]]

            .self$name = name
            prefix = paste("fs.swift2d.service" , name, sep =".")
            hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -108,8 +108,63 @@ bluemix <- setRefClass("bluemix",
            invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "public", sep='.'), public))
            #invisible(SparkR:::callJMethod(hConf, "setInt", paste(prefix, "http.port", sep='.'), 8080))
        },

        url = function( container_name, object_name){
            return(swifturl(name, container_name, object_name))}
    )
)

#' CloudObjectStorage is a class that is designed for IBM Cloud Object Storage (COS).
#' It sets up the Hadoop configuration for COS and provides the final file URL.
#'
#' sparkContext: a SparkContext object.
#'
#' credentials: a list with the following required keys:
#'   endpoint
#'   accessKey
#'   secretKey
#'
#' configurationName: a string that identifies the configuration that has
#'   been set. When using this from an IBM Spark service instance that
#'   is configured to connect to particular Bluemix object store
#'   instances, the values for these credentials can be obtained
#'   by clicking on the 'insert to code' link just below a data
#'   source.
#' @export CloudObjectStorage
#' @exportClass CloudObjectStorage
CloudObjectStorage <- setRefClass("CloudObjectStorage",
    fields=list(configName="character"),
    methods=list(
        initialize = function(..., sparkContext, credentials, configurationName){
            if (is.null(credentials["endpoint"][[1]])) {
                stop("Attribute endpoint in credentials is missing!")
            }

            if (is.null(credentials["accessKey"][[1]])) {
                stop("Attribute accessKey in credentials is missing!")
            }

            if (is.null(credentials["secretKey"][[1]])) {
                stop("Attribute secretKey in credentials is missing!")
            }

            .self$configName = configurationName
            prefix = "fs.s3d.service"
            hConf = SparkR:::callJMethod(sparkContext, "hadoopConfiguration")
            SparkR:::callJMethod(hConf, "set", paste(prefix, "endpoint", sep='.'), credentials['endpoint'][[1]])
            SparkR:::callJMethod(hConf, "set", paste(prefix, "access.key", sep='.'), credentials['accessKey'][[1]])
            SparkR:::callJMethod(hConf, "set", paste(prefix, "secret.key", sep='.'), credentials['secretKey'][[1]])
        },

        getConfigName = function() {
            return (.self$configName)
        },

        url = function(bucketName, objectName){
            return(paste("s3d://", bucketName, ".service/", objectName, sep = ""))
        }
    )
)
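
For orientation, here is a minimal sketch of how the new class is meant to be driven from a SparkR session. It assumes `sc` is the SparkContext provided by the notebook environment, and the endpoint and key values are placeholders:

    library(ibmos2sparkR)

    # Placeholder credentials; real values come from the COS service credentials.
    creds <- list(
        endpoint = "https://s3-api.us-geo.objectstorage.softlayer.net",  # assumed example endpoint
        accessKey = "my-access-key",
        secretKey = "my-secret-key"
    )

    # Registers the endpoint and keys under the "fs.s3d.service" Hadoop prefix.
    cos <- CloudObjectStorage(sparkContext = sc, credentials = creds,
                              configurationName = "myCosConfig")

    # Builds "s3d://my-bucket.service/my_data.csv", ready for read.df.
    cos$url("my-bucket", "my_data.csv")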
70 changes: 47 additions & 23 deletions r/sparkr/README.md
@@ -1,23 +1,23 @@
# ibmos2sparkR

The package sets Spark Hadoop configurations for connecting to
IBM Bluemix Object Storage and Softlayer Account Object Storage instances. This package uses the new [stocator](https://github.com/SparkTC/stocator) driver, which implements the `swift2d` protocol, and is available
on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience).

Using the `stocator` driver connects your Spark executor nodes directly
to your data in object storage.
This is an optimized, high-performance method to connect Spark to your data. All IBM Apache Spark kernels
are instantiated with the `stocator` driver in the Spark kernel's classpath.
You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator)
and adding it to your local Apache Spark kernel's classpath.

This package expects a SparkContext instantiated by SparkR. It has been tested to work with
the IBM Spark service in R notebooks on IBM DSX, though it should work with other Spark installations
that utilize the [swift2d/stocator](https://github.com/SparkTC/stocator) protocol.
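
Outside DSX you would create that SparkContext yourself before using the package. A rough sketch for a local Spark 1.6-style session (in DSX notebooks `sc` is already provided, and the stocator jar must be on the local classpath):

    library(SparkR)
    sc <- sparkR.init(master = "local[*]", appName = "ibmos2sparkR-test")  # hypothetical app name
    sqlContext <- sparkRSQL.init(sc)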


## Installation

    library(devtools)
    devtools::install_url("https://github.com/ibm-cds-labs/ibmos2spark/archive/<version>.zip", subdir= "r/sparkr/")
@@ -27,15 +27,39 @@
where `version` should be a tagged release, such as `0.0.7`. (If you're daring,
## Usage

The usage of this package depends on *from where* your Object Storage instance was created. This package
is intended to connect to IBM's Object Storage instances obtained from Bluemix or Data Science Experience
(DSX) or from a separate account on IBM Softlayer. It also supports IBM Cloud Object Storage (COS). The
instructions below show how to connect to either type of instance.

The connection setup is essentially the same. But the difference for you is how you deliver the
credentials. If your Object Storage was created with Bluemix/DSX, with a few clicks on the side-tab
within a DSX Jupyter notebook, you can obtain your account credentials in the form of a list.
If your Object Storage was created with a Softlayer account, each part of the credentials will
be found as text that you can copy and paste into the example code below.

### Cloud Object Storage

    library(ibmos2sparkR)
    configurationName = "bluemixO123"

    # In DSX notebooks, the "insert to code" will insert this credentials list for you
    credentials <- list(
        accessKey = "123",
        secretKey = "123",
        endpoint = "https://s3-api.objectstorage.....net/"
    )

    cos <- CloudObjectStorage(sparkContext=sc, credentials=credentials, configurationName=configurationName)
    bucketName <- "bucketName"
    fileName <- "test.csv"
    url <- cos$url(bucketName, fileName)

    invisible(sparkR.session(appName = "SparkSession R"))

    df.data.1 <- read.df(url,
                         source = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat",
                         header = "true")
    head(df.data.1)
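
If your Spark service runs Spark 2.0 or later, the built-in csv source should also work in place of the fully qualified class name above (a shorter, untested equivalent):

    df.data.1 <- read.df(url, source = "csv", header = "true")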


### Bluemix / Data Science Experience

@@ -45,11 +45,11 @@ be found as text that you can copy and paste into the example code below.
    # In DSX notebooks, the "insert to code" will insert this credentials list for you
    creds = list(
        auth_url="https://identity.open.softlayer.com",
        region="dallas",
        project_id = "XXXXX",
        user_id="XXXXX",
        password="XXXXX")

    bmconfig = bluemix(sparkcontext=sc, name=configurationname, credentials = creds)

    container = "my_container"
@@ -67,24 +67,24 @@ be found as text that you can copy and paste into the example code below.
    library(ibmos2sparkR)
    configurationname = "softlayerOScon" # can be any name you like (allows for multiple configurations)

    slconfig = softlayer(sparkcontext=sc,
                         name=configurationname,
                         auth_url="https://identity.open.softlayer.com",
                         tenant = "XXXXX",
                         username="XXXXX",
                         password="XXXXX"
    )

    container = "my_container"
    object = "my_data.csv"

    data <- read.df(sqlContext, slconfig$url(container, object), source = "com.databricks.spark.csv", header = "true")

    # OR, for Spark >= 2.0.0

    data = read.df(slconfig$url(container, object), source = "com.databricks.spark.csv", header = "true")
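
For reference, `slconfig$url(container, object)` just assembles the URI that stocator resolves; a sketch of the equivalent string, assuming the `swift2d://<container>.<configname>/<object>` form used by the swift2d protocol:

    # equivalent to slconfig$url(container, object) for the configuration above (illustrative only)
    url <- paste0("swift2d://my_container.softlayerOScon/my_data.csv")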

## License

Copyright 2016 IBM Cloud Data Services

26 changes: 26 additions & 0 deletions r/sparkr/man/CloudObjectStorage-class.Rd


20 changes: 3 additions & 17 deletions r/sparkr/man/bluemix-class.Rd

