# R on Vertex AI Workbench Instances

How to use R on a Vertex AI Workbench Instance

## Setup

1.  Create a new Vertex AI Workbench instance: [https://cloud.google.com/vertex-ai/docs/workbench/instances/introduction](https://cloud.google.com/vertex-ai/docs/workbench/instances/introduction)
2.  Open a terminal session from the Launcher     
3.  Enter the commands below in the terminal (one line at a time):
 
  
```sh
# Name of the conda environment; the same name is used for the kernel below
ENV_NAME=r-env
# Create the environment with the R packages and rpy2
conda create -n $ENV_NAME r-essentials r-base rpy2
# Switch into the new environment so the next command runs its python
conda activate $ENV_NAME
# Register a Jupyter kernel named after the environment for the current user
python -m ipykernel install --user --name=$ENV_NAME
```
  
4.  Return to the Launcher screen and click the R kernel    


## Install R packages

In [None]:
# Packages this notebook depends on directly. gargle is listed explicitly
# because it is attached with library(gargle) and provides token_fetch().
required_packages <- c(
  "bigrquery", "googleCloudStorageR", "gargle", "glue", "httr"
)
# Install only the packages that are not already in the library.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
install.packages(missing_packages)

## set constants 

### Email & Project ID

In [None]:
# Replace the placeholders with your own values, or leave them unchanged to
# fall back to the active gcloud configuration.
email <- "your-email@your-company-name.com"
project_id <- "your-project-id"

# Read a single property from the active gcloud configuration.
# Stops with a clear error when gcloud prints nothing usable for the property,
# so an empty value cannot leak into the bucket name or the auth call.
gcloud_config_value <- function(property) {
  value <- system(paste("gcloud config get-value", property), intern = TRUE)
  value <- value[nzchar(trimws(value))]
  if (length(value) != 1L) {
    stop(
      "Could not read '", property, "' from gcloud; set it manually instead.",
      call. = FALSE
    )
  }
  value
}

## try to set from gcloud if not specified
if (email == "your-email@your-company-name.com") {
  email <- gcloud_config_value("core/account")
}
if (project_id == "your-project-id") {
  project_id <- gcloud_config_value("project")
}

In [None]:
# Show the account that will be used when fetching the auth token.
print(email)

In [None]:
# Show the project ID that the resource names and API calls are based on.
print(project_id)

### Other

In [None]:
# Base name shared by the resources this notebook creates.
experiment_name <- "r-kernel-workbench"
# Staging bucket name: "<project_id>-<experiment_name>".
bucket_name <- paste(project_id, experiment_name, sep = "-")
# Dataset name: the experiment name with hyphens swapped for underscores.
dataset_name <- chartr("-", "_", experiment_name)
# Destination table, location for created resources, and staged file name.
table_name <- "californiahousing"
location <- "us-central1"
data_filename <- "california-housing-tabular-regression.csv"

## load R packages 

In [None]:
library(googleCloudStorageR)
library(bigrquery)
library(gargle)
library(glue)
# set when debugging
# options(gargle_verbosity = "debug") 

## Authenticate Clients

In [None]:
## A single cloud-platform scope is requested so that one token can be
## shared by both client packages (token based auth keeps things simple).
### GCS - https://code.markedmondson.me/googleCloudStorageR/articles/googleCloudStorageR.html#token-authentication-1
### BQ - https://bigrquery.r-dbi.org/reference/bq_auth.html
scope <- "https://www.googleapis.com/auth/cloud-platform"
token <- token_fetch(scopes = scope, email = email)

In [None]:
## authenticate with each service
# The same token is handed to the Cloud Storage and the BigQuery client.
gcs_auth(token = token)
bq_auth(token = token)

## Test connections

### List Google Cloud Storage Buckets

In [None]:
# buckets <- gcs_list_buckets(project_id)
# buckets

### List BigQuery datasets

In [None]:
# datasets <- as.data.frame(do.call(rbind, bq_project_datasets(project_id)))
# datasets

## Example workflow

* Get the public sample data (CSV) file from a public GCS bucket
* Create the GCS staging bucket, the BQ dataset, and the destination BQ table
* Load the data from GCS to BQ
* Query the BQ data
* Clean up / delete all created resources

## Create GCS bucket and BigQuery dataset

In [None]:
# Create a Cloud Storage bucket unless it can already be retrieved.
#
# Arguments:
#   project_id  - project in which the bucket is created.
#   bucket_name - name of the bucket to look up / create.
#   location    - location passed to the bucket creation call.
#
# Existence is probed by trying to retrieve the bucket. A failed lookup is
# treated as "missing", but the underlying error text is reported so that
# auth, permission or network problems are not mistaken for a missing bucket.
# Called for its side effects; progress is reported via message().
create_bucket_if_not_exists <- function(project_id, bucket_name, location) {
  bucket_exists <- tryCatch(
    {
      gcs_get_bucket(bucket_name)
      TRUE # no error: the bucket exists
    },
    error = function(e) {
      message(
        "Bucket ", bucket_name, " could not be retrieved (",
        conditionMessage(e), "); assuming it does not exist."
      )
      FALSE
    }
  )

  if (!bucket_exists) {
    # If the lookup failed for another reason, this call raises the real error.
    gcs_create_bucket(bucket_name,
                      projectId = project_id,
                      storageClass = "STANDARD",
                      location = location)
    message(paste("Bucket", bucket_name, "created successfully."))
  } else {
    message(paste("Bucket", bucket_name, "already exists."))
  }
}

In [None]:
# Make sure the staging bucket for this experiment exists before using it.
create_bucket_if_not_exists(project_id = project_id, 
                            bucket_name = bucket_name,
                            location = location)

In [None]:
# Create a BigQuery dataset unless it already exists.
#
# Arguments:
#   project_id - project that owns the dataset.
#   dataset_id - name of the dataset to look up / create.
#   location   - location passed to the dataset creation call.
#
# Called for its side effects; the outcome is reported via message().
create_dataset_if_not_exists <- function(project_id, dataset_id, location) {
  dataset_ref <- bq_dataset(project_id, dataset_id)

  if (bq_dataset_exists(dataset_ref)) {
    message(
      paste("Dataset", dataset_id, "already exists in project", project_id)
    )
  } else {
    bq_dataset_create(dataset_ref, location = location)
    message(
      paste("Dataset", dataset_id, "created successfully in project", project_id)
    )
  }
}

In [None]:
# Make sure the destination dataset for this experiment exists.
create_dataset_if_not_exists(project_id = project_id, 
                             dataset_id = dataset_name, 
                             location = location)

In [None]:
## set global bucket first so we don't need to in future api calls
gcs_global_bucket(bucket_name)
# List the bucket's objects; no bucket argument is passed here.
gcs_list_objects()


In [None]:
## download the public sample CSV, addressed by its full gs:// URI
data_uri <- "gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv"
# NOTE(review): later cells call summary()/head() on data_raw, so the CSV is
# presumably parsed into a table on download - confirm.
data_raw <- gcs_get_object(object_name = data_uri)

In [None]:
## inspect data to sanity check: summary first, then the first rows
summary(data_raw)
head(data_raw)

## upload file to our bucket

In [None]:
# Upload the downloaded data under data_filename; no bucket is named here.
# NOTE(review): predefinedAcl = "bucketLevel" presumably defers access
# control to the bucket's own settings - confirm.
gcs_upload(data_raw,
           name = data_filename,
           predefinedAcl = "bucketLevel")

In [None]:
## list objects again to confirm the upload
gcs_list_objects()

In [None]:
## Create new empty table
### build the reference (project, dataset, table) for the table to create
bq_table_to_create <- bq_table(project_id, dataset_name, table_name)
# Print the reference so the target can be checked before creating it.
bq_table_to_create

In [None]:
### table schema: one STRING field per column of the housing CSV
# Every column gets the same type, so the field specs are generated from the
# column names instead of being spelled out one by one.
bq_fields_to_create <- as_bq_fields(
  lapply(
    c(
      "longitude",
      "latitude",
      "housing_median_age",
      "total_rooms",
      "total_bedrooms",
      "population",
      "households",
      "median_income",
      "median_house_value"
    ),
    function(column_name) list(name = column_name, type = "STRING")
  )
)


In [None]:
## execute create table using the reference and schema defined above
bq_table_create(bq_table_to_create,
                bq_fields_to_create)

In [None]:
## load the staged CSV from our GCS bucket into the BQ table
# nskip = 1 skips the first line of the file (presumably the header row);
# WRITE_TRUNCATE replaces any rows already in the table.
bq_table_load(
  bq_table_to_create,
  source_uris = paste0("gs://", bucket_name, "/", data_filename),
  source_format = "CSV",
  nskip = 1,
  create_disposition = "CREATE_IF_NEEDED",
  write_disposition = "WRITE_TRUNCATE"
)

In [None]:
## build a query that previews up to 100 rows of the loaded table
query <- paste0(
  "SELECT * FROM `", project_id, ".", dataset_name, ".", table_name,
  "` LIMIT 100"
)


In [None]:
# Submit the query under project_id and keep the result for downloading.
bq_query_results <- bq_project_query(project_id, query)

In [None]:
# Download the query result into the R session.
bq_data <- bq_table_download(bq_query_results)

In [None]:
# Preview the first rows of the downloaded result.
head(bq_data)

## cleanup


### BQ - delete dataset and all underlying tables

In [None]:
# Delete the dataset; delete_contents = TRUE also removes the tables in it.
bq_dataset_delete(bq_dataset(project_id, dataset_name), delete_contents = TRUE)

### GCS - Delete bucket and all objects within

In [None]:
# Delete the bucket; force_delete = TRUE also removes the objects inside it.
gcs_delete_bucket(bucket_name, force_delete = TRUE)