# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 _almond_ 

This notebook contains the examples from [Chapter 6 of the book Learning Spark](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch06.html).

In [12]:
import $ivy.`org.apache.spark::spark-sql:3.0.0` // Or use any other Spark version here (this pins 3.0.0)
import org.apache.spark.sql._

[32mimport [39m[36m$ivy.$                                   // Or use any other 2.x version here
[39m
[32mimport [39m[36morg.apache.spark.sql._[39m

In [13]:
import org.apache.log4j.{Level, Logger}
// Set the log4j level of the "org" logger to OFF
// (presumably to keep Spark's own log output out of the notebook cells)
Logger.getLogger("org").setLevel(Level.OFF)

[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m

In [14]:
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession


// Get (or reuse) a SparkSession named "AuthorsAges" that runs with master "local[*]"
val spark = {
  val sessionBuilder = SparkSession.builder()
  sessionBuilder
    .appName("AuthorsAges")
    .master("local[*]")
    .getOrCreate()
}

import spark.implicits._

[32mimport [39m[36morg.apache.spark.sql.functions.avg
[39m
[32mimport [39m[36morg.apache.spark.sql.SparkSession


// Create a DataFrame using SparkSession
[39m
[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@4ef4f545
[32mimport [39m[36mspark.implicits._[39m

### [Chapter 6](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch06.html)
> Scala Case Classes for Datasets

In [15]:
// Scala
// Record type used to read the bloggers data as a typed Dataset.
// All scalar fields are declared as String; Campaigns is an array of strings.
case class Bloggers(
  Id: String,
  First: String,
  Last: String,
  Url: String,
  Hits: String,
  Campaigns: Array[String]
)

defined [32mclass[39m [36mBloggers[39m

In [16]:
// Location of the bloggers JSON file, relative to the notebook
val bloggers = "../databricks-datasets/learning-spark-v2/blogs.json"

// Read the file as multi-line JSON and view the rows as Bloggers instances
val bloggersDS = {
  val jsonReader = spark.read
    .format("json")
    .option("multiline", "true")
    .option("path", bloggers)
  jsonReader.load().as[Bloggers]
}

bloggersDS.show(3)

+--------------------+------+----+---+-----+---------+-----------------+
|           Campaigns| First|Hits| Id| Last|Published|              Url|
+--------------------+------+----+---+-----+---------+-----------------+
| [twitter, LinkedIn]| Jules|4535|  1|Damji| 1/4/2016|https://tinyurl.1|
| [twitter, LinkedIn]|Brooke|8908|  2|Wenig| 5/5/2018|https://tinyurl.2|
|[web, twitter, FB...| Denny|7659|  3|  Lee| 6/7/2019|https://tinyurl.3|
+--------------------+------+----+---+-----+---------+-----------------+
only showing top 3 rows



[36mbloggers[39m: [32mString[39m = [32m"../databricks-datasets/learning-spark-v2/blogs.json"[39m
[36mbloggersDS[39m: [32mDataset[39m[[32mBloggers[39m] = [Campaigns: array<string>, First: string ... 5 more fields]

### [Chapter 6](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch06.html)
> Creating Sample Data

In [19]:
//Scala
import scala.util.Random._

// Our case class for the Dataset:
//   uid   - sequential record id
//   uname - "user-" followed by 5 random alphanumeric characters
//   usage - random value in [0, 1000)
case class Usage(uid: Int, uname: String, usage: Int)

// Fixed seed so the generated sample data is reproducible between runs
val r = new scala.util.Random(42)

// Create exactly 1000 instances of the Usage class (uid 0 through 999).
// `until` excludes the upper bound; the previous `0 to 1000` was inclusive
// and produced 1001 records.
// This generates data on the fly
val data = for (i <- 0 until 1000)
  yield Usage(i, s"user-${r.alphanumeric.take(5).mkString}", r.nextInt(1000))

[32mimport [39m[36mscala.util.Random._

// Our case class for the Dataset
[39m
defined [32mclass[39m [36mUsage[39m
[36mr[39m: [32mscala[39m.[32mutil[39m.[32mRandom[39m = scala.util.Random@65aa839a
[36mdata[39m: [32mcollection[39m.[32mimmutable[39m.[32mIndexedSeq[39m[[32mUsage[39m] = [33mVector[39m(
  [33mUsage[39m([32m0[39m, [32m"user-Gpi2C"[39m, [32m525[39m),
  [33mUsage[39m([32m1[39m, [32m"user-DgXDi"[39m, [32m502[39m),
  [33mUsage[39m([32m2[39m, [32m"user-M66yO"[39m, [32m170[39m),
  [33mUsage[39m([32m3[39m, [32m"user-xTOn6"[39m, [32m913[39m),
  [33mUsage[39m([32m4[39m, [32m"user-3xGSz"[39m, [32m246[39m),
  [33mUsage[39m([32m5[39m, [32m"user-2aWRN"[39m, [32m727[39m),
  [33mUsage[39m([32m6[39m, [32m"user-EzZY1"[39m, [32m65[39m),
  [33mUsage[39m([32m7[39m, [32m"user-ZlZMZ"[39m, [32m935[39m),
  [33mUsage[39m([32m8[39m, [32m"user-VjxeG"[39m, [32m756[39m),
  [33mUsage[39m([32m9[39m, [3

In [20]:
// Turn the generated Usage records into a typed Dataset[Usage] and preview ten rows
val dsUsage = data.toDS()
dsUsage.show(10)

+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
|  3|user-xTOn6|  913|
|  4|user-3xGSz|  246|
|  5|user-2aWRN|  727|
|  6|user-EzZY1|   65|
|  7|user-ZlZMZ|  935|
|  8|user-VjxeG|  756|
|  9|user-iqf1P|    3|
+---+----------+-----+
only showing top 10 rows



[36mdsUsage[39m: [32mDataset[39m[[32mUsage[39m] = [uid: int, uname: string ... 1 more field]

### [Chapter 6](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch06.html)
> Transforming Sample Data

In [21]:
// In Scala
import scala.util.Random._
import org.apache.spark.sql.functions._

// Our case class for the Dataset:
//   uid   - sequential record id
//   uname - "user-" followed by 5 random alphanumeric characters
//   usage - random value in [0, 1000)
case class Usage(uid: Int, uname: String, usage: Int)

// Fixed seed so the generated sample data is reproducible between runs
val r = new scala.util.Random(42)

// Create exactly 1000 instances of the Usage class (uid 0 through 999).
// `until` excludes the upper bound; the previous `0 to 1000` was inclusive
// and produced 1001 records.
// This generates data on the fly
val data = for (i <- 0 until 1000)
  yield Usage(i, s"user-${r.alphanumeric.take(5).mkString}", r.nextInt(1000))


[32mimport [39m[36mscala.util.Random._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._

// Our case class for the Dataset
[39m
defined [32mclass[39m [36mUsage[39m
[36mr[39m: [32mscala[39m.[32mutil[39m.[32mRandom[39m = scala.util.Random@1dc77ea7
[36mdata[39m: [32mcollection[39m.[32mimmutable[39m.[32mIndexedSeq[39m[[32mUsage[39m] = [33mVector[39m(
  [33mUsage[39m([32m0[39m, [32m"user-Gpi2C"[39m, [32m525[39m),
  [33mUsage[39m([32m1[39m, [32m"user-DgXDi"[39m, [32m502[39m),
  [33mUsage[39m([32m2[39m, [32m"user-M66yO"[39m, [32m170[39m),
  [33mUsage[39m([32m3[39m, [32m"user-xTOn6"[39m, [32m913[39m),
  [33mUsage[39m([32m4[39m, [32m"user-3xGSz"[39m, [32m246[39m),
  [33mUsage[39m([32m5[39m, [32m"user-2aWRN"[39m, [32m727[39m),
  [33mUsage[39m([32m6[39m, [32m"user-EzZY1"[39m, [32m65[39m),
  [33mUsage[39m([32m7[39m, [32m"user-ZlZMZ"[39m, [32m935[39m),
  [33mUsage[39m([32m8[39m, [32m"user-Vj

In [22]:
// Turn the generated Usage records into a typed Dataset[Usage] and preview three rows
val ds = data.toDS()
ds.show(3)


+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
+---+----------+-----+
only showing top 3 rows



[36mds[39m: [32mDataset[39m[[32mUsage[39m] = [uid: int, uname: string ... 1 more field]

In [23]:
// Keep records whose usage exceeds 100 and show the ten largest usages first
ds.filter(_.usage > 100)
  .orderBy(col("usage").desc)
  .show(10)

+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|605|user-NL6c4|  999|
|113|user-nnAXr|  999|
|561|user-5n2xY|  999|
|634|user-L0wci|  999|
|805|user-LX27o|  996|
|345|user-QKrVb|  996|
| 26|user-CJY3C|  996|
| 49|user-xPBrB|  993|
|681|user-QwV36|  992|
|626|user-63wkI|  992|
+---+----------+-----+
only showing top 10 rows



In [24]:
// In Scala
// Compute a value per record with a conditional lambda:
// usage above 750 is multiplied by 0.15, anything else by 0.50
ds.map { u =>
    val rate = if (u.usage > 750) .15 else .50
    u.usage * rate
  }
  .show(5, false)

+------+
|value |
+------+
|262.5 |
|251.0 |
|85.0  |
|136.95|
|123.0 |
+------+
only showing top 5 rows



In [25]:

// Usage record extended with one additional field, cost
case class UsageCost(
  uid: Int,
  uname: String,
  usage: Int,
  cost: Double
)

/** Computes the cost for a usage record.
  *
  * Usage above 750 is multiplied by 0.15; any other usage is multiplied by 0.50.
  *
  * @param u the usage record to price
  * @return a UsageCost carrying u's uid, uname and usage plus the computed cost
  */
def computeUserCostUsage(u: Usage): UsageCost = {
  val cost = u.usage match {
    case heavy if heavy > 750 => heavy * 0.15
    case other                => other * 0.50
  }
  UsageCost(u.uid, u.uname, u.usage, cost)
}

// Apply computeUserCostUsage to every record of the original Dataset and preview five rows
ds.map(computeUserCostUsage(_)).show(5)

+---+----------+-----+------+
|uid|     uname|usage|  cost|
+---+----------+-----+------+
|  0|user-Gpi2C|  525| 262.5|
|  1|user-DgXDi|  502| 251.0|
|  2|user-M66yO|  170|  85.0|
|  3|user-xTOn6|  913|136.95|
|  4|user-3xGSz|  246| 123.0|
+---+----------+-----+------+
only showing top 5 rows



defined [32mclass[39m [36mUsageCost[39m
defined [32mfunction[39m [36mcomputeUserCostUsage[39m