In [None]:
import findspark
# findspark.init() has to run before pyspark is imported (it makes the local
# Spark installation importable), so the pyspark import stays below it.
findspark.init()
from pyspark import SparkConf, SparkContext
# Use getOrCreate so that re-running this cell reuses the live context instead
# of failing with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("transformation 1")
)

### groupBy(f, numPartitions=None, partitionFunc=portable_hash)

Return an RDD of grouped items.

In [None]:
# Bucket the integers by their remainder modulo 3 and turn every bucket into
# a plain list so the grouped values are printable.
data = sc.parallelize([1, 1, 2, 3, 4, 5, 5, 6, 6, 6])
grouped = data.groupBy(lambda value: value % 3)
result = grouped.mapValues(list).collect()
print(result)

### groupByKey(numPartitions=None, partitionFunc=portable_hash)

Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions.

**Note:** If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using reduceByKey or aggregateByKey will provide much better performance.

In [None]:
# Group the values under each key, then report how many values each key has.
pairs = [("a", 1), ("b", 2), ("c", 3), ("a", 3)]
data = sc.parallelize(pairs)
counts = data.groupByKey().mapValues(len)
print(counts.collect())

### GroupByKey vs. ReduceByKey

In [None]:
# Key every integer in 0..9 by its remainder modulo 3, then add up the values
# that share a key with reduceByKey.
numbers = sc.parallelize(range(10))
data = numbers.map(lambda n: (n % 3, n))
data.reduceByKey(lambda total, value: total + value).collect()

In [None]:
# Same per-key totals as the reduceByKey cell, obtained by grouping first and
# summing each key's group of values afterwards.
data.groupByKey().mapValues(sum).collect()

### sortByKey(ascending=True, numPartitions=None, keyfunc=lambda x: x)

Sorts this RDD, which is assumed to consist of (key, value) pairs.

In [None]:
# Order the (key, value) pairs by key, largest key first.
data = sc.parallelize([("b", 1), ("a", 1), ("c", 1), ("a", 1)])
data.sortByKey(ascending=False).collect()

### sortBy(keyfunc, ascending=True, numPartitions=None)
Sorts this RDD by the given keyfunc.

In [None]:
# Sort the numbers by their own value in descending order.
data = sc.parallelize([4, 3, 2, 4, 2, 7, 9, 4, 5, 2])
data.sortBy(keyfunc=lambda item: item, ascending=False).collect()

### coalesce(numPartitions, shuffle=False)
Return a new RDD that is reduced into numPartitions partitions.

In [None]:
# Start from 15 integers spread over 4 partitions.
data = sc.parallelize(range(15), 4)
# With the default shuffle=False, coalesce can only lower the partition count;
# requesting more partitions than the RDD already has leaves it unchanged.
# Merge down to 2 partitions so the reduction is actually visible; glom()
# turns each partition into a list.
data.coalesce(2).glom().collect()

### repartition(numPartitions)
Return a new RDD that has exactly numPartitions partitions.

Can increase or decrease the level of parallelism in this RDD. Internally, this uses a shuffle to redistribute data. ***If you are decreasing the number of partitions in this RDD, consider using coalesce, which can avoid performing a shuffle.***

In [None]:
# Redistribute `data` into exactly 2 partitions and list each partition's contents.
data.repartition(numPartitions=2).glom().collect()

### sample(withReplacement, fraction, seed=None)
Return a sampled subset of this RDD.

Parameters
* withReplacement – can elements be sampled multiple times (replaced when sampled out)
* fraction – expected size of the sample as a fraction of this RDD’s size. Without replacement: probability that each element is chosen; fraction must be in \[0, 1\]. With replacement: expected number of times each element is chosen; fraction must be >= 0.
* seed – seed for the random number generator

In [None]:
# Sample `data` with replacement at an expected rate of 0.5 draws per element,
# using a fixed seed for the random number generator.
data.sample(withReplacement=True, fraction=0.5, seed=13).collect()

### distinct(numPartitions=None)
Return a new RDD containing the distinct elements in this RDD.

In [None]:
# Drop the repeated values and keep one copy of each element.
values = [1, 1, 2, 3, 4, 5, 6, 2, 3, 5]
data = sc.parallelize(values)
data.distinct().collect()