# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


_almond_ 

In [3]:
import $ivy.`org.apache.spark::spark-sql:3.0.0` // Spark 3.0.0 is pinned here; a 2.x version can be substituted instead
import org.apache.spark.sql._

import $ivy.$                                   // Or use any other 2.x version here

import org.apache.spark.sql._

In [4]:
import org.apache.log4j.{Level, Logger}
// Set the log4j logger named "org" to Level.OFF.
// Presumably this keeps Spark's (org.apache.*) log messages out of the notebook output.
Logger.getLogger("org").setLevel(Level.OFF)

import org.apache.log4j.{Level, Logger}


### [Chapter 8](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch08.html#executor_memory_layout)
> Optimizing and Tuning Spark for Efficiency

In [5]:
// In Scala
import org.apache.spark.sql.SparkSession

/**
 * Prints every runtime configuration entry of the given session to stdout.
 *
 * Each entry is written as `key -> value` followed by an extra blank line.
 *
 * @param session the SparkSession whose runtime configuration is dumped
 */
def printConfigs(session: SparkSession) = {
  session.conf.getAll.foreach { case (key, value) =>
    println(s"$key -> $value\n")
  }
}

/**
 * Builds (or reuses) a local SparkSession, prints its runtime configuration,
 * then sets `spark.sql.shuffle.partitions` to the SparkContext's default
 * parallelism and prints the configuration again.
 *
 * @param args command-line arguments; not read by this function
 */
def main(args: Array[String]): Unit = {
  // Create a session
  val spark = SparkSession.builder
    .config("spark.sql.shuffle.partitions", 5)
    .config("spark.executor.memory", "2g")
    .master("local[*]")
    .appName("SparkConfig")
    .getOrCreate()

  printConfigs(spark)
  // Change the shuffle partition count on the live session through spark.conf
  spark.conf.set("spark.sql.shuffle.partitions",
    spark.sparkContext.defaultParallelism)
  println(" ****** Setting Shuffle Partitions to Default Parallelism")
  printConfigs(spark)
}

import org.apache.spark.sql.SparkSession


defined function printConfigs
defined function main

In [6]:
// Run the entry point defined above with no command-line arguments
main(Array.empty[String])

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties


spark.driver.host -> 623c4a2181fc

spark.driver.port -> 36607

spark.app.name -> SparkConfig

spark.executor.id -> driver

spark.master -> local[*]

spark.executor.memory -> 2g

spark.app.id -> local-1595367206220

spark.sql.shuffle.partitions -> 5

 ****** Setting Shuffle Partitions to Default Parallelism
spark.driver.host -> 623c4a2181fc

spark.driver.port -> 36607

spark.app.name -> SparkConfig

spark.executor.id -> driver

spark.master -> local[*]

spark.executor.memory -> 2g

spark.app.id -> local-1595367206220

spark.sql.shuffle.partitions -> 8



### [Chapter 8](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch08.html#executor_memory_layout)
> A Family of Spark Joins

In [10]:
//Shuffle Sort Merge Join
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SaveMode
import scala.util.Random

  /**
   * Runs `f` once and prints the elapsed time, in seconds, to stdout.
   *
   * Curried so the code under measurement can be passed as a trailing block:
   * `benchmark("join") { ... }`.
   *
   * @param name label included in the printed message
   * @param f    by-name block to time; evaluated exactly once
   */
  def benchmark(name: String)(f: => Unit): Unit = {
    val startTime = System.nanoTime
    f
    // nanoTime differences are in nanoseconds; divide by 1e9 to report seconds
    val elapsedSeconds = (System.nanoTime - startTime).toDouble / 1000000000
    println(s"Time taken in $name: $elapsedSeconds seconds")
  }

  /**
   * Entry point for the shuffle sort-merge join example.
   *
   * Builds a session with broadcast joins disabled (threshold -1) and
   * sort-merge join preferred, generates a users and an orders DataFrame,
   * joins them on `users_id === uid`, and prints sample rows and the plan.
   *
   * @param args command-line arguments; not read by this function
   */
  def main(args: Array[String]): Unit = {

    // NOTE(review): no .master(...) is set on this builder — it appears to rely on
    // an already-created session or an externally supplied master; confirm.
    val spark = SparkSession.builder
        .appName("SortMergeJoin")
        .config("spark.sql.codegen.wholeStage", true)
        .config("spark.sql.join.preferSortMergeJoin", true)
        .config("spark.sql.autoBroadcastJoinThreshold", -1)
        .config("spark.sql.defaultSizeInBytes", 100000)
        .config("spark.sql.shuffle.partitions", 16)
        .getOrCreate()

    import spark.implicits._

    // lookup tables for states and items purchased
    val states = Map(0 -> "AZ", 1 -> "CO", 2 -> "CA", 3 -> "TX", 4 -> "NY", 5 -> "MI")
    val items = Map(0 -> "SKU-0", 1 -> "SKU-1", 2 -> "SKU-2", 3 -> "SKU-3", 4 -> "SKU-4", 5 -> "SKU-5")
    // fixed seed so the generated data is the same on every run
    val rnd = new scala.util.Random(42)

    // create dataframes
    // NOTE(review): nextInt(5) yields 0..4, so key 5 ("MI" / "SKU-5") is never
    // picked — confirm whether that is intended.
    val usersDF = (0 to 100000).map(id => (id, s"user_${id}", s"user_${id}@databricks.com",
                                           states(rnd.nextInt(5))))
          .toDF("uid", "login", "email", "user_state")
    val ordersDF = (0 to 100000).map(r => (r, r, rnd.nextInt(100000), 10 * r * 0.2d,
                                           states(rnd.nextInt(5)), items(rnd.nextInt(5))))
          .toDF("transaction_id", "quantity", "users_id", "amount", "state", "items")

    usersDF.show(10)
    ordersDF.show(10)

    // do a Join
    val usersOrdersDF = ordersDF.join(usersDF, $"users_id" === $"uid")
    usersOrdersDF.show(10, false)
    usersOrdersDF.cache()
    usersOrdersDF.explain()
    // usersOrdersDF.explain("formatted")
    // uncomment to view the Spark UI; otherwise the program terminates and shuts down the UI
    // Thread.sleep(200000000)
  }


import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.types._

import org.apache.spark.sql.functions._

import org.apache.spark.sql.SaveMode

import scala.util.Random

  // curried function to benchmark any code or function

defined function benchmark
defined function main

In [9]:
// Run the sort-merge join entry point with no command-line arguments
main(Array.empty[String])

+---+------+--------------------+----------+
|uid| login|               email|user_state|
+---+------+--------------------+----------+
|  0|user_0|user_0@databricks...|        AZ|
|  1|user_1|user_1@databricks...|        TX|
|  2|user_2|user_2@databricks...|        TX|
|  3|user_3|user_3@databricks...|        NY|
|  4|user_4|user_4@databricks...|        AZ|
|  5|user_5|user_5@databricks...|        AZ|
|  6|user_6|user_6@databricks...|        AZ|
|  7|user_7|user_7@databricks...|        TX|
|  8|user_8|user_8@databricks...|        NY|
|  9|user_9|user_9@databricks...|        TX|
+---+------+--------------------+----------+
only showing top 10 rows

+--------------+--------+--------+------+-----+-----+
|transaction_id|quantity|users_id|amount|state|items|
+--------------+--------+--------+------+-----+-----+
|             0|       0|   78432|   0.0|   AZ|SKU-0|
|             1|       1|   84041|   2.0|   AZ|SKU-3|
|             2|       2|   34657|   4.0|   NY|SKU-1|
|             3|    

In [11]:
//OPTIMIZING THE SHUFFLE SORT MERGE JOIN

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.storage.StorageLevel._
import org.apache.spark.sql.SaveMode
import scala.util.Random


  /**
   * Runs `f` once and prints the elapsed time, in seconds, to stdout.
   *
   * Curried so the code under measurement can be passed as a trailing block:
   * `benchmark("join") { ... }`.
   *
   * @param name label included in the printed message
   * @param f    by-name block to time; evaluated exactly once
   */
  def benchmark(name: String)(f: => Unit): Unit = {
    val startTime = System.nanoTime
    f
    // nanoTime differences are in nanoseconds; divide by 1e9 to report seconds
    val elapsedSeconds = (System.nanoTime - startTime).toDouble / 1000000000
    println(s"Time taken in $name: $elapsedSeconds seconds")
  }

  /**
   * Entry point for the bucketed sort-merge join example.
   *
   * Generates a users and an orders DataFrame, writes each as a Parquet table
   * bucketed into 8 buckets on its join column (`uid` / `users_id`), caches
   * the tables, reads them back and joins them on `users_id === uid`, then
   * prints sample rows and the plan.
   *
   * @param args command-line arguments; not read by this function
   */
  def main(args: Array[String]): Unit = {

    // NOTE(review): no .master(...) is set on this builder — it appears to rely on
    // an already-created session or an externally supplied master; confirm.
    val spark = SparkSession.builder
        .appName("SortMergeJoinBucketed")
        .config("spark.sql.codegen.wholeStage", true)
        .config("spark.sql.join.preferSortMergeJoin", true)
        .config("spark.sql.autoBroadcastJoinThreshold", -1)
        .config("spark.sql.defaultSizeInBytes", 100000)
        .config("spark.sql.shuffle.partitions", 16)
        .getOrCreate()

    import spark.implicits._

    // lookup tables for states and items purchased
    val states = Map(0 -> "AZ", 1 -> "CO", 2 -> "CA", 3 -> "TX", 4 -> "NY", 5 -> "MI")
    val items = Map(0 -> "SKU-0", 1 -> "SKU-1", 2 -> "SKU-2", 3 -> "SKU-3", 4 -> "SKU-4", 5 -> "SKU-5")
    // fixed seed so the generated data is the same on every run
    val rnd = new scala.util.Random(42)

    // create dataframes
    // NOTE(review): nextInt(5) yields 0..4, so key 5 ("MI" / "SKU-5") is never
    // picked — confirm whether that is intended.
    val usersDF = (0 to 100000).map(id => (id, s"user_${id}", s"user_${id}@databricks.com",
                                           states(rnd.nextInt(5))))
          .toDF("uid", "login", "email", "user_state")
    val ordersDF = (0 to 100000).map(r => (r, r, rnd.nextInt(100000), 10 * r * 0.2d,
                                           states(rnd.nextInt(5)), items(rnd.nextInt(5))))
          .toDF("transaction_id", "quantity", "users_id", "amount", "state", "items")

    // cache them on Disk only so we can see the difference in size in the storage UI
    usersDF.persist(DISK_ONLY)
    ordersDF.persist(DISK_ONLY)

    // let's create eight buckets for each DataFrame, on their respective join columns
    // create bucket and table for uid
    spark.sql("DROP TABLE IF EXISTS UsersTbl")
    usersDF.orderBy(asc("uid"))
      .write.format("parquet")
      .mode(SaveMode.Overwrite)
      // equal to number of cores I have on my laptop
      .bucketBy(8, "uid")
      .saveAsTable("UsersTbl")
    // create bucket and table for users_id
    spark.sql("DROP TABLE IF EXISTS OrdersTbl")
    ordersDF.orderBy(asc("users_id"))
      .write.format("parquet")
      .bucketBy(8, "users_id")
      .mode(SaveMode.Overwrite)
      .saveAsTable("OrdersTbl")
    // cache tables in memory so that we can see the difference in size in the storage UI
    spark.sql("CACHE TABLE UsersTbl")
    spark.sql("CACHE TABLE OrdersTbl")
    // NOTE(review): the DataFrame returned here is discarded and no action is
    // invoked on it, so nothing is displayed — add .show() if output was intended.
    spark.sql("SELECT * from OrdersTbl LIMIT 20")
    // read data back in
    val usersBucketDF = spark.table("UsersTbl")
    val ordersBucketDF = spark.table("OrdersTbl")
    // Now do the join on the bucketed DataFrames
    val joinUsersOrdersBucketDF = ordersBucketDF.join(usersBucketDF, $"users_id" === $"uid")
    joinUsersOrdersBucketDF.show(false)
    joinUsersOrdersBucketDF.explain()
    //joinUsersOrdersBucketDF.explain("formatted")

    // uncomment to view the Spark UI; otherwise the program terminates and shuts down the UI
    // Thread.sleep(200000000)
  }


import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.types._

import org.apache.spark.sql.functions._

import org.apache.spark.storage.StorageLevel._

import org.apache.spark.sql.SaveMode

import scala.util.Random


  // curried function to benchmark any code or function

defined function benchmark
defined function main

In [12]:
// Run the bucketed sort-merge join entry point with no command-line arguments
main(Array.empty[String])

+--------------+--------+--------+--------+-----+-----+---+--------+-----------------------+----------+
|transaction_id|quantity|users_id|amount  |state|items|uid|login   |email                  |user_state|
+--------------+--------+--------+--------+-----+-----+---+--------+-----------------------+----------+
|85775         |85775   |13      |171550.0|AZ   |SKU-2|13 |user_13 |user_13@databricks.com |CA        |
|79730         |79730   |14      |159460.0|AZ   |SKU-0|14 |user_14 |user_14@databricks.com |CO        |
|5436          |5436    |18      |10872.0 |CA   |SKU-3|18 |user_18 |user_18@databricks.com |TX        |
|47648         |47648   |38      |95296.0 |NY   |SKU-3|38 |user_38 |user_38@databricks.com |TX        |
|77507         |77507   |38      |155014.0|NY   |SKU-2|38 |user_38 |user_38@databricks.com |TX        |
|50588         |50588   |46      |101176.0|CA   |SKU-1|46 |user_46 |user_46@databricks.com |CA        |
|10811         |10811   |67      |21622.0 |NY   |SKU-0|67 |user_