In [0]:
%r

In [0]:
%r
library(SparkR)
library(sparklyr)
library(dplyr) 

In [0]:
%r
# Open the sparklyr connection that the later cells pass around as `sc`.
sc <- spark_connect(method = "databricks")

To upload a file to DBFS (Databricks File System), open the Workspace and select:

File > Upload data to DBFS

The uploaded file is accessible to everyone who has access to the workspace.

In [0]:
%r
# Read books.json into Spark and register the result as table "jsonTable".
# The multiLine option is passed through to the Spark JSON reader.
jsonDF <- spark_read_json(
  sc = sc,
  name = "jsonTable",
  path = "/FileStore/tables/books.json",
  options = list(multiLine = TRUE),
  # Explicit column types for the table.
  columns = c(
    author = "character",
    country = "character",
    imageLink = "character",
    language = "character",
    link = "character",
    pages = "integer",
    title = "character",
    year = "integer"
  )
)



In [0]:
%r
# Preview the first rows of the books table.
jsonDF %>% head()

In [0]:
%r
# Display jsonDF through the `show` generic.
# NOTE(review): jsonDF comes from spark_read_json (sparklyr), not from SparkR,
# so this presumably falls through to the default print method -- confirm the
# output is what was intended.
show(jsonDF)

In [0]:
%r
# Pull every row of the books table from Spark into a local R data frame.
jsonDF %>% collect()

In [0]:
%r  
 # Count the books per author, then keep only the row for Chinua Achebe.
 jsonDF %>%
   group_by(author) %>%
   count() %>%
   filter(author == "Chinua Achebe") %>%
   arrange(desc(n))
  


In [0]:
%r  
# Count the books per author, largest count first.
jsonDF %>%
  group_by(author) %>%
  count() %>%
  arrange(desc(n))

In [0]:
%r  
# Add a `today` column filled by current_timestamp().
withDate <- mutate(jsonDF, today = current_timestamp())

withDate %>% collect()

In [0]:
%r  
# Derive the month and the year of the `today` column.
# NOTE(review): jsonDF already has a `year` column (read from the JSON file),
# so this mutate replaces it with the year of `today` -- confirm that
# overwriting the original value is intended.
withMMyyyy <- mutate(
  withDate,
  month = month(today),
  year = year(today)
)

withMMyyyy %>%
  select(c("author", "title", "month", "year")) %>%
  collect()

In [0]:
%r  
# Format `today` with the pattern "yyyy-MM-dd" and take the day of the month
# from that formatted value. Despite the variable name, no Unix timestamp is
# computed here.
withUnixTimestamp <- mutate(
  withMMyyyy,
  formatted_date = date_format(today, "yyyy-MM-dd"),
  day = dayofmonth(formatted_date)
)

withUnixTimestamp %>%
  select(c("title", "formatted_date", "day")) %>%
  collect()




In [0]:
%r
# Build a small two-row SparkR DataFrame and register it as the temporary view
# "timestampTable", which the next cell reads back through sparklyr.
# NOTE(review): this assumes SparkR and the sparklyr connection `sc` share one
# Spark session (as on Databricks); otherwise the view will not be visible.
#
# The SparkR functions are namespace-qualified because dplyr and sparklyr are
# attached after SparkR and mask several of its exports; qualifying the calls
# keeps this cell independent of the attach order.
withTimestampDF <- SparkR::createDataFrame(
  data.frame(
    author = c("Chinua Achebe", "Hans Christian Andersen"),
    country = c("Nigeria", "Denmark"),
    language = c("English", "Danish"),
    # Integer literals, so the column is an integer like `year` in jsonDF
    # instead of a double.
    year = c(1958L, 1836L),
    # Keep the text columns as character vectors on R versions before 4.0.
    stringsAsFactors = FALSE
  )
)

# Create (or replace) the temporary view over the DataFrame.
SparkR::createOrReplaceTempView(withTimestampDF, viewName = "timestampTable")


In [0]:
%r  
# Read the "timestampTable" table through sparklyr and bring it into R.
sc %>%
  spark_read_table(name = "timestampTable") %>%
  collect()


In [0]:
%r  
# Copy the local iris data frame to Spark as table "iris", overwriting any
# existing table of that name. Later cells refer to the copied column as
# Sepal_Length.
irisDF <- sdf_copy_to(sc = sc, x = iris, name = "iris", overwrite = TRUE)

# Collect the Spark table back into R using the "row-wise" implementation.
sdf_collect(irisDF, impl = "row-wise")

In [0]:
%r  
# Load the iris dataset into the workspace.
data(list = "iris")

In [0]:
%r  
# Preview the first six rows of the local iris data frame.
head(iris, n = 6L)

In [0]:
%r  
# For each species, compute the 25th, 50th, 75th and 100th percentiles of
# Sepal_Length with percentile_approx.
quantileDF <- irisDF %>%
  group_by(Species) %>%
  summarize(
    quantile_25th = percentile_approx(Sepal_Length, 0.25),
    quantile_50th = percentile_approx(Sepal_Length, 0.50),
    quantile_75th = percentile_approx(Sepal_Length, 0.75),
    quantile_100th = percentile_approx(Sepal_Length, 1.0)
  )

quantileDF %>% collect()

In [0]:
%r  
# Print the requested quantiles of Sepal_Length for the virginica rows only.
print(
  sdf_quantile(
    probabilities = c(0.25, 0.5, 0.75, 1.0),
    column = "Sepal_Length",
    x = filter(irisDF, Species == "virginica")
  )
)

In [0]:
%r  
# Close the sparklyr connection held in `sc`.
spark_disconnect(sc = sc)