In [0]:
# Create a simple RDD: build a ten-row range with spark.range(10) and expose
# it as an RDD through the .rdd attribute (the elements are Row objects).
spark.range(10).rdd


In [0]:
# The RDD built from a DataFrame holds Row objects; map over it and take
# position 0 of each Row to extract the plain "id" value.
spark.range(10).toDF("id").rdd.map(lambda row: row[0])


In [0]:
# Create a DataFrame from an RDD with the toDF method on the RDD,
# then render it with display().
display(spark.range(10).rdd.toDF())


id
0
1
2
3
4
5
6
7
8
9


To create an RDD from a collection:
-  You need to use the parallelize method on a SparkContext (within a SparkSession). This turns a single-node collection into a parallel collection.
-  When creating this parallel collection, you can also explicitly state the number of partitions into which you would like to distribute the collection.

In [0]:
# Split the book title on single spaces to get a local list of words,
# then distribute that list across 2 partitions as an RDD.
myCollection = (
  "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
)
words = spark.sparkContext.parallelize(myCollection, numSlices=2)



In [0]:
# Name this RDD so that it shows up in the Spark UI under the given name
words.setName("myWords")
words.name() # myWords



In [0]:
 # Load the text file at this path as an RDD through SparkContext.textFile
 spark.sparkContext.textFile("/FileStore/tables/sample_libsvm_data.txt")


In [0]:
# Load the same path through SparkContext.wholeTextFiles instead of textFile.
# NOTE(review): wholeTextFiles is presumably intended for whole-file
# (path, content) records, typically over a directory - confirm that a single
# file path is what is wanted here.
spark.sparkContext.wholeTextFiles("/FileStore/tables/sample_libsvm_data.txt")

In [0]:
# Remove duplicate words, then count how many unique words remain
words.distinct().count()

In [0]:
def startsWithS(individual):
  """Tell whether the given string begins with an uppercase "S"."""
  begins_with_s = individual.startswith("S")
  return begins_with_s


# Quick sanity check of the predicate on a plain Python string.
print(startsWithS("Studious"))


In [0]:
# Apply the startsWithS predicate to every element of the RDD and bring the
# matching words back to the driver.
words.filter(startsWithS).collect()


In [0]:
# Turn each word into a 3-tuple: (the word, its first letter, whether it
# begins with "S"). The function is defined fully inline as a lambda.
words2 = words.map(
  lambda token: (token, token[0], token.startswith("S"))
)



In [0]:
# Filter on the Boolean stored at position 2 of each record (the
# "begins with S" flag built in words2) and take up to five matches.

words2.filter(lambda record: record[2]).take(5)


In [0]:
# flatMap requires that the output of the mapped function be an iterable that
# can be expanded; list() turns each word into its individual characters.

words.flatMap(list).take(5)


In [0]:
# Sort by word length from longest to shortest: negating the length makes the
# ascending sort put the longest words first.

words.sortBy(lambda word: -len(word)).take(10)


In [0]:
# Randomly split the words RDD into two RDDs using equal weights. A fixed seed
# is passed so that the split is the same every time the notebook is re-run.
fiftyFiftySplit = words.randomSplit([0.5, 0.5], seed=100)


In [0]:
# Reduce the numbers 1..20 to their sum by supplying a function that takes two
# values and combines them into one.

spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y) # 210

In [0]:
# use function to reduce to single value 
# This reducer is a good example because you can get one of two outputs. 
# Because the reduce operation on the partitions is not deterministic, you can have either "Definitive" or "Processing" (both of length 10) as the "left" word.
# This means that sometimes you can end up with one, whereas other times you end up with the other


def wordLengthReducer(leftWord, rightWord):
  """Pick the longer of two words.

  leftWord is returned only when it is strictly longer; on a tie (or when
  rightWord is longer) rightWord is returned.
  """
  return leftWord if len(leftWord) > len(rightWord) else rightWord

# Collapse the whole RDD to a single word by repeatedly keeping the longer of two words
words.reduce(wordLengthReducer)


In [0]:
# Count the number of elements in the RDD
words.count()

countApprox
- an approximation of the count method  - but it must execute within a timeout 
- can return incomplete results if it exceeds the timeout
- The confidence is the probability that the error bounds of the result will contain the true value
- if countApprox were called repeatedly with confidence 0.9, we would expect 90% of the results to contain the true count 
- The confidence must be in the range [0,1], or an exception will be thrown

In [0]:


# Approximate count: give up after the timeout (in milliseconds) and report a
# result at the requested confidence level.
timeoutMilliseconds = 400
confidence = 0.95
words.countApprox(timeout=timeoutMilliseconds, confidence=confidence)

In [0]:
# Approximate number of distinct words, with a relative standard deviation of 0.05
words.countApproxDistinct(relativeSD=0.05)

In [0]:
# PySpark's countApproxDistinct accepts only a relative standard deviation. The
# two-argument (precision, sparse precision) form belongs to the Scala/Java API
# and raises a TypeError when called from Python. A relativeSD of 0.26
# approximates the accuracy of precision p=4 (about 1.054 / sqrt(2**4)).
words.countApproxDistinct(relativeSD=0.26)

In [0]:
# Count how many times each distinct word occurs in the RDD
words.countByValue()

In [0]:
# countByValueApprox(timeout, confidence) exists only in the Scala/Java RDD API;
# the PySpark RDD does not expose it, so calling it raises AttributeError.
# Fall back to the exact per-value count, which is available from Python.
words.countByValue()

In [0]:
# `1 to 20` is Scala syntax and a SyntaxError in Python; range(1, 21) is the
# equivalent inclusive range. Both results are returned as one tuple because a
# cell renders only its last expression.
(
  spark.sparkContext.parallelize(range(1, 21)).max(),
  spark.sparkContext.parallelize(range(1, 21)).min(),
)  # (20, 1)

In [0]:
# Parameters for the random sample taken at the end of this cell.
withReplacement=True
numberToTake=6
randomSeed=100

# A cell renders only its last expression, so the intermediate results are
# printed explicitly instead of being computed and silently discarded.
print(words.take(5))         # first five elements
print(words.takeOrdered(5))  # five smallest elements in ascending order
print(words.top(5))          # five largest elements in descending order
words.takeSample(withReplacement,numberToTake,randomSeed)

In [0]:
# Inspect the storage level currently set on the words RDD
words.getStorageLevel()


mapPartitions
- The previous command revealed that Spark operates on a per-partition basis when it comes to actually executing code
-  You also might have noticed earlier that the return signature of a map function on an RDD is actually MapPartitionsRDD
- This is because map is just a row-wise alias for mapPartitions, which makes it possible for you to map an individual partition - represented as an iterator
-  That’s because physically on the cluster we operate on each partition individually - not a specific row
- Needs a return value to work properly
 
 
 A simple example creates the value “1” for every partition in our data, and the sum of the following expression will count the number of partitions we have:

In [0]:
# Emit a single 1 per partition and sum them: the total is the number of
# partitions (words was parallelized into 2 partitions).
words.mapPartitions(lambda part: [1]).sum() # 2


foreachPartition
-  iterates over all the partitions of the data - but the function has no return value
- great for doing something with each partition like writing it out to a database
- This is how many data source connectors are written

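You can create your own text file source, if you want, by writing each partition's output to the temp directory under a random ID. (Note: the cell that follows demonstrates mapPartitionsWithIndex rather than foreachPartition.)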

In [0]:
def indexedFunc(partitionIndex, withinPartIterator):
  """Label every element of one partition with that partition's index.

  Returns a list with one "partition: <index> => <element>" string per
  element yielded by withinPartIterator.
  """
  template = "partition: {} => {}"
  labelled = []
  for element in withinPartIterator:
    labelled.append(template.format(partitionIndex, element))
  return labelled
# Run indexedFunc once per partition (it receives the partition index and an
# iterator over that partition's elements) and collect the labelled strings.
words.mapPartitionsWithIndex(indexedFunc).collect()


glom
-  takes every partition in your dataset and converts them to arrays
- can be useful if you’re going to collect the data to the driver and want to have an array for each partition 
- can cause serious stability issues if you have large partitions or a large number of partitions
- in those cases it is easy to crash the driver


In the following example, you can see that we get two partitions and each word falls into one partition each:

In [0]:
# Distribute two words over 2 partitions, then use glom() to turn each
# partition into a list before collecting everything to the driver.
spark.sparkContext.parallelize(["Hello", "World"], 2).glom().collect()
# [['Hello'], ['World']]
