# Sparklyr

Load libraries:

In [2]:
# Select the Sparkling Water version for rsparkling; this option is set here,
# before rsparkling is attached in the next cell.
options(rsparkling.sparklingwater.version = "2.1.0")

In [3]:
# Attach the Spark and H2O interfaces, then dplyr for the data pipeline.
# The attach order is kept as-is because it determines which package's
# functions mask the others.
library(sparklyr)
library(rsparkling)
library(h2o)
library(dplyr)

# Start from sparklyr's default configuration and override the gateway port.
config <- spark_config()
config$sparklyr.gateway.port <- 8881

# Open the connection to the Spark master at s01:7077.
sc <- spark_connect(
  master = "spark://s01:7077",
  config = config
)


----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


Attaching package: ‘h2o’

The following objects are masked from ‘package:stats’:

    cor, sd, var

The following objects are masked from ‘package:base’:

    ||, &&, %*%, apply, as.factor, as.numeric, colnames, colnames<-,
    ifelse, %in%, is.character, is.factor, is.numeric, log, log10,
    log1p, log2, round, signif, trunc


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



# Flights Dataset

Load the flights Parquet file from HDFS:

In [4]:
# Read the Parquet file into Spark and register it under the table name
# "flights".
flights <- spark_read_parquet(
  sc,
  name = "flights",
  path = "/data/2000.parquet"
)

In [5]:
# Display the Spark table that was just loaded.
flights

Source:   query [5.683e+06 x 29]
Database: spark connection master=spark://s01:7077 app=sparklyr local=FALSE

# A tibble: 5.683e+06 x 29
    Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
   <int> <int>      <int>     <int>   <int>      <int>   <int>      <int>
 1  2000     3         10         5    1027       1024    1633       1639
 2  2000     3         11         6    1035       1024    1701       1639
 3  2000     3         12         7    1026       1024    1639       1639
 4  2000     3         13         1    1041       1024    1644       1639
 5  2000     3         14         2    1035       1024    1654       1639
 6  2000     3         15         3    1107       1024    1715       1639
 7  2000     3         16         4    1028       1024    1710       1639
 8  2000     3         17         5    1035       1024    1658       1639
 9  2000     3         18         6    1150       1024    1807       1639
10  2000     3         19         7    1129      

Create a logical attribute indicating whether the departure delay was longer than 15 minutes; if it is TRUE, the flight counts as delayed:

In [24]:
# Build the modelling table and split it into training and test partitions.
partitions3 <- flights %>%
  # Keep only the columns used further down.
  select(
    Month, DayofMonth, DayOfWeek, DepTime, UniqueCarrier,
    DepDelay, Origin, Dest, Distance
  ) %>%
  # Drop rows that contain missing values.
  na.omit() %>%
  # Flag flights whose departure delay exceeds 15 minutes.
  mutate(Delayed = DepDelay > 15) %>%
  # Index the three string columns into new columns UC, From and To.
  ft_string_indexer(input_col = "UniqueCarrier", output_col = "UC") %>%
  ft_string_indexer(input_col = "Origin", output_col = "From") %>%
  ft_string_indexer(input_col = "Dest", output_col = "To") %>%
  # 70/30 split with a fixed seed.
  sdf_partition(training = 0.7, test = 0.3, seed = 1099)

* Dropped 187490 rows with 'na.omit' (5683047 => 5495557)


Check the curated data that will be passed to the machine-learning models:

In [27]:
# Display the training/test partitions created by the split above.
partitions3

$training
Source:   query [3.846e+06 x 13]
Database: spark connection master=spark://s01:7077 app=sparklyr local=FALSE

# A tibble: 3.846e+06 x 13
   Month DayofMonth DayOfWeek DepTime UniqueCarrier DepDelay Origin  Dest
   <int>      <int>     <int>   <int>         <chr>    <int>  <chr> <chr>
 1     3          1         3      40            HP        0    LAS   CMH
 2     3          1         3      45            UA      140    ORD   DEN
 3     3          1         3      55            DL        0    SLC   CVG
 4     3          1         3      56            DL       -5    PHX   CVG
 5     3          1         3     128            US       -2    LAS   CLT
 6     3          1         3     526            DL       -4    DEN   CVG
 7     3          1         3     537            AA       -3    ABQ   DFW
 8     3          1         3     549            DL       -1    BOS   CVG
 9     3          1         3     550            DL        0    LEX   CVG
10     3          1         3     550  

In [28]:
# Convert both Spark partitions to H2O frames. The strict version check is
# switched off for both conversions.
training <- as_h2o_frame(
  sc,
  partitions3$training,
  strict_version_check = FALSE
)
test <- as_h2o_frame(
  sc,
  partitions3$test,
  strict_version_check = FALSE
)

“
Your H2O cluster version is too old (4 months and 11 days)!
“
Your H2O cluster version is too old (4 months and 11 days)!
Please download and install the latest version from http://h2o.ai/download/”

# Logistic Regression

In [29]:
# Fit a GLM with lambda search on the training frame. No `family` is passed;
# the model summary printed in the next cell reports family = gaussian with
# an identity link.
# NOTE(review): the section is titled "Logistic Regression" -- a binomial
# family on a categorical response is presumably what was intended; confirm
# before relying on these predictions as probabilities.
glm_model <- h2o.glm(
  x = c(
    "Month", "DayofMonth", "DayOfWeek", "DepTime",
    "UC", "From", "To", "Distance"
  ),
  y = "Delayed",
  training_frame = training,
  lambda_search = TRUE
)



In [30]:
# Show the fitted model's details, training metrics and scoring history.
summary(glm_model)

Model Details:

H2ORegressionModel: glm
Model Key:  GLM_model_R_1497252297018_2 
GLM Model: summary
    family     link                                regularization
1 gaussian identity Elastic Net (alpha = 0.5, lambda = 1.764E-5 )
                                                                 lambda_search
1 nlambda = 100, lambda.max = 0.1764, lambda.min = 1.764E-5, lambda.1se = -1.0
  number_of_predictors_total number_of_active_predictors number_of_iterations
1                          8                           8                    0
                                  training_frame
1 frame_rdd_220_b806f63db9ea22cd2209b3a111104de1

H2ORegressionMetrics: glm
** Reported on training data. **

MSE:  0.1483828
RMSE:  0.3852049
MAE:  0.2972921
RMSLE:  0.2706393
Mean Residual Deviance :  0.1483828
R^2 :  0.05669767
Null Deviance :604981.3
Null D.o.F. :3845998
Residual Deviance :570680.3
Residual D.o.F. :3845990
AIC :3576482





Scoring History: 
            timestamp   duration iterati

Check prediction quality:

In [31]:
# Score the test frame with the fitted GLM.
pred <- h2o.predict(glm_model, newdata = test)
# Convert the predictions from an H2O frame to a Spark DataFrame
# (strict version check disabled, as in the earlier conversions).
predicted <- as_spark_dataframe(sc, pred, strict_version_check = FALSE)



“
Your H2O cluster version is too old (4 months and 11 days)!
Please download and install the latest version from http://h2o.ai/download/”

In [33]:
# Extract the observed 'Delayed' values of the test partition: select the
# column in Spark, collect it into local R memory, then take it as a vector.
actual <- collect(select(partitions3$test, Delayed))[["Delayed"]]

In [34]:
# Combine the predicted values and the observed 'Delayed' values into one
# local data.frame.
# NOTE(review): this pairs rows by position, which assumes the predictions
# come back in the same row order as the collected test partition -- confirm.
data <- data.frame(predicted = predicted, actual = actual)

# data.frame() does not set the column names properly here, so assign them
# explicitly.
names(data) <- c("predicted", "actual")

In [35]:
# Display the predicted values next to the actual labels.
data

predicted,actual
-0.008328269,FALSE
0.008605790,FALSE
-0.026688491,FALSE
-0.010840626,FALSE
0.020674521,FALSE
0.011347967,FALSE
-0.008663529,FALSE
0.026300316,FALSE
0.011671731,FALSE
0.035242296,FALSE


In [41]:
# Decision threshold: the midpoint between the smallest and largest predicted
# value. The variable name's spelling is kept because later cells use it.
desision_point <- min(data$predicted) +
  (max(data$predicted) - min(data$predicted)) / 2

In [42]:
# Display the computed decision threshold.
desision_point

In [45]:
# Binarise the predictions: 1 when above the decision threshold, otherwise 0.
data$pr_bin <- if_else(
  condition = data$predicted > desision_point,
  true = 1,
  false = 0
)

In [46]:
# Display the data again, now including the binarised prediction column.
data

predicted,actual,pr_bin
-0.008328269,FALSE,0
0.008605790,FALSE,0
-0.026688491,FALSE,0
-0.010840626,FALSE,0
0.020674521,FALSE,0
0.011347967,FALSE,0
-0.008663529,FALSE,0
0.026300316,FALSE,0
0.011671731,FALSE,0
0.035242296,FALSE,0


In [48]:
# Share of rows whose binarised prediction is 1.
sum(data[["pr_bin"]]) / nrow(data)

# K-Means

In [50]:
# List the column names of the H2O training frame; the k-means call in the
# next cell picks its input columns by position.
names(training)

In [58]:
# Cluster the training frame into 5 groups using columns 6 and 9, which the
# cluster-centre output below labels as depdelay and distance.
kmeans_model <- h2o.kmeans(
  x = c(6, 9),
  k = 5,
  seed = 1,
  training_frame = training
)



In [59]:
# Print the coordinates of each cluster center, one row per cluster.
h2o.centers(kmeans_model)

depdelay,distance
1.487323,761.6374
1009.734127,963.1508
264.655266,919.3419
124.801706,832.6068
47.692978,778.6291


In [60]:
# Print per-centroid statistics: cluster size and within-cluster sum of
# squares.
h2o.centroid_stats(kmeans_model)

centroid,size,within_cluster_sum_of_squares
1,3303294,3428910.69
2,504,35232.31
3,17338,93860.25
4,107991,195480.21
5,416872,522278.92
