In [None]:
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")//String array 
val words = spark.sparkContext.parallelize(myCollection, 2)//To create an RDD from a collection, you will need to use the 
                                                 // parallelize method on a SparkContext (within a SparkSession).

In [None]:
val keyvalue=words.map(word => (word.toLowerCase, 1)) // from RDD to key-value RDD
//keyvalue = Array((spark,1), (the,1), (definitive,1), (guide,1), (:,1), (big,1), (data,1), (processing,1),...

#### keyBy 

In [None]:
val keyword = words.keyBy(word => word.toLowerCase.toSeq(0).toString)// a different way for key-value rdd
//keyword = Array((s,Spark), (t,The), (d,Definitive), (g,Guide), (:,:), (b,Big), (d,Data), (p,Processing),....

In [None]:
val keyMap=keyword.mapValues(word => word.toUpperCase).collect()
//(s,SPARK) (t,THE) (d,DEFINITIVE) (g,GUIDE) (:,:) (b,BIG) (d,DATA) (p,PROCESSING) .....

In [None]:
val keyFlatMap=keyword.flatMapValues(word => word.toUpperCase).collect()
//(s,S) (s,P) (s,A) (s,R) (s,K) (t,T) (t,H) (t,E) (d,D) (d,E) (d,F) (d,I) (d,N) (d,I) (d,T) (d,I)

In [None]:
keyword.keys.collect() //Array(s, t, d, g, :, b, d, p, m, s)

In [None]:
keyword.values.collect()//Array(Spark, The, Definitive, Guide, :, Big, Data, Processing, Made, Simple)

In [None]:
keyword.lookup("s")//WrappedArray(Spark, Simple)

#### Sample

In [None]:
val distinctChars = words.flatMap(word => word.toLowerCase.toSeq).distinct.collect()//Array(d, p, t, b, h, n, f, v, :, r

In [None]:
words.map(word => (word.toLowerCase.toSeq(0), word)).sampleByKey(true, sampleMap, 6L).collect()
//Array((s,Spark), (t,The), (d,Definitive), (g,Guide), (:,:))

#### Aggregations 

In [None]:
val chars = words.flatMap(word => word.toLowerCase.toSeq)//(s, p, a, r, k, t, h, e, d, e, f,....
val KVcharacters = chars.map(letter => (letter, 1))//(s,1), (p,1), (a,1), (r,1), (k,1), (t,1), (h,1), (e,1), ....

#### countByKey

In [None]:
val timeout = 1000L //milliseconds
val confidence = 0.95
KVcharacters.countByKey()//Map(e -> 7, s -> 4, n -> 2, t -> 3, u -> 1, f -> 1, a -> 4, m -> 2, i -> 7, v ..
KVcharacters.countByKeyApprox(timeout, confidence)//(final: Map(e -> [7.000, 7.000], s -> [4.000, 4.000], n -> [2.000, 2.000]..

#### groupByKey

In [None]:
def addFunc(left:Int, right:Int) = left + right
KVcharacters.groupByKey().map(row => (row._1, row._2.reduce(addFunc))).collect()//Array((d,4), (p,3), (t,3), (b,1)..
KVcharacters.reduceByKey(addFunc).collect() //Array((d,4), (p,3), (t,3), (b,1),

#### aggregate 

In [None]:
//aggregate function requires a null and start value and then requires you to specify two different functions. The first
//aggregate function(max in this case) executes within partitions, the second aggregate function(add in this case) executes
//on the first's result  in driver. (this second may cause outOofMemory).
val nums = sc.parallelize(1 to 30, 1)// Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
def maxFunc(left:Int, right:Int) = math.max(left, right)
def addFunc(left:Int, right:Int) = left + right
println(nums.aggregate(0)(maxFunc, addFunc))// if there is only one partition the second will have only one value to add
  //30                                           // so the result will be the mx value only.

#### treeAggregate 

In [None]:
//treeAggregate  does the same thing as aggregate (at the user level) but does so in a different way. It basically “pushes down” 
//some of the subaggregations (creating a tree from executor to executor) before performing the final aggregation on the driver. 
//Having multiple levels can help you to ensure that the driver does not run out of memory in the process of the aggregation.
//These tree-based implementations are often to try to improve stability in certain operations:
val depth = 3
nums.treeAggregate(0)(maxFunc, addFunc, depth)

#### aggregateByKey

In [None]:
KVcharacters.aggregateByKey(0)(addFunc, maxFunc).collect()// same as above for each key
//Array((d,2), (p,2), (t,2), (b,1), (h,1), (n,1), (f,1), (v,1), (:,1), (r,1), (l,1), (s,3)....

#### combineByKey

In [None]:
//KVcharacters=>(s,1), (p,1), (a,1), (r,1), (k,1), (t,1), (h,1), (e,1),
val valToCombiner = (value:Int) => List(value)// (value, 1) =>(key, (value, 1))
val mergeValuesFunc = (vals:List[Int], valToAppend:Int) => valToAppend :: vals // (key, (value, 1)) =>(key, (total, count)) in every partition
val mergeCombinerFunc = (vals1:List[Int], vals2:List[Int]) => vals1 ::: vals2//(key, (total, count)) => (key, (totalAcrossAllPartitions, countAcrossAllPartitions))
val outputPartitions = 6
KVcharacters.combineByKey(valToCombiner,mergeValuesFunc,mergeCombinerFunc,outputPartitions).collect()

#### foldByKey

In [None]:
KVcharacters.foldByKey(0)(addFunc).collect() //merges the values for each key with neutral “zero value,” 0 for +, or 1 for *
//Array((d,4), (p,3), (t,3), (b,1), (h,1), (n,2), (f,1), (v,1), (:,1),

In [None]:
mergeCombinerFunc(mergeValuesFunc(valToCombiner(10),5),mergeValuesFunc(valToCombiner(10),5))

In [None]:
CoGroups

In [None]:
//CoGroups give you the ability to group together up to three key–value RDDs together in Scala
import scala.util.Random
val distinctChars = words.flatMap(word => word.toLowerCase.toSeq).distinct
val charRDD = distinctChars.map(c => (c, new Random().nextDouble()))
val charRDD2 = distinctChars.map(c => (c, new Random().nextDouble()))
val charRDD3 = distinctChars.map(c => (c, new Random().nextDouble()))
charRDD.cogroup(charRDD2, charRDD3).take(5)

#### coalesce

In [None]:
//coalesce effectively collapses partitions on the same worker in order to avoid a shuffle of the data when repartitioning.
words.coalesce(1).getNumPartitions // 1

#### repartition

In [None]:
//The repartition operation allows you to repartition your data up or down but performs a shuffle across nodes in the process.
words.repartition(10) // gives us 10 partitions

#### repartitionAndSortWithinPartitions

In [None]:
This operation gives you the ability to repartition as well as specify the ordering of each one of those
output partitions.

#### Custom Partitioning

In [None]:
val df = spark.read
.option("header", "true")
.option("inferSchema", "true")
.csv("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/retail-data/all/")
val rdd = df.coalesce(10).rdd

In [None]:
import org.apache.spark.HashPartitioner
val keyedRDD = rdd.keyBy(row => row(6).asInstanceOf[Int].toDouble)
keyedRDD.partitionBy(new HashPartitioner(10)).take(10)
val groupRDD=keyedRDD.groupByKey()

In [None]:
rdd.map(r => r(0)).take(1).foreach(println) //groupByKey
keyedRDD.take(1).foreach(println)//(17850.0,[536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom])
groupRDD.take(5).foreach(println)//
println(keyedRDD.count)//541909
println(groupRDD.count)//4373

In [None]:
import org.apache.spark.Partitioner
class DomainPartitioner extends Partitioner {
def numPartitions = 3
def getPartition(key: Any): Int = {
val customerId = key.asInstanceOf[Double].toInt
if (customerId == 17850.0 || customerId == 12583.0 || customerId == 13927.0  || customerId == 12853.0) {
return 0
} else {
return new java.util.Random().nextInt(2) + 1
}
}
}

In [9]:
keyedRDD
.partitionBy(new DomainPartitioner).map(_._1).glom().map(_.toSet.toSeq.length)
.take(5)

Array(4, 4298, 4298)