In [3]:
import findspark
findspark.init()  # locate the Spark installation so that `import pyspark` works
import pyspark

# getOrCreate() reuses an already running context, so re-running this cell in the
# same kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Lab1"))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/15 12:33:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Spark - Pair RDD - Basic Key/Value Transformations

## Working with Key/Value Pairs

Spark provides special operations on RDDs containing key/value pairs. These RDDs
are called pair RDDs

Table 4-1. Transformations on one pair RDD (example: [(1, 2), (3, 4), (3, 6)])

In [4]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [5]:
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

In [7]:
python_dict = {'lo que quiera, pero tipo escalar de python':['lo','que','quiera']}

In [8]:
python_dict

{'lo que quiera, pero tipo escalar de python': ['lo', 'que', 'quiera']}

In [9]:
('lo que quiera, pero tipo escalar de python',['lo','que','quiera'])

('lo que quiera, pero tipo escalar de python', ['lo', 'que', 'quiera'])

In [10]:
rdd2 = sc.parallelize([('lo que quiera, pero tipo escalar de python',['lo','que','quiera'])])

In [11]:
rdd2.collect()

[('lo que quiera, pero tipo escalar de python', ['lo', 'que', 'quiera'])]

---

In [12]:
# A list is unhashable, so Python refuses it as a dict key (unlike Spark, which
# accepts any object as the first element of a pair). The error is the point of
# this cell, so it is caught and displayed instead of aborting "Run All".
try:
    python_dict2 = {[1,2,3]:['lo','que','quiera']}
except TypeError as error:
    dict_key_error = str(error)
    print("TypeError: {}".format(dict_key_error))

TypeError: unhashable type: 'list'

In [13]:
rdd3 = sc.parallelize([([1,2,3],['lo','que','quiera'])])

In [14]:
rdd3.collect()

[([1, 2, 3], ['lo', 'que', 'quiera'])]

In [15]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [16]:
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

## reduceByKey(func) 

Combine values with the same key. rdd.reduceByKey((x, y) => x + y) 

[(1,2), (3, 10)]


In [17]:
rdd.reduceByKey(lambda x, y : x + y).collect()

                                                                                

[(1, 2), (3, 10)]

## countByKey()

In [18]:
rdd.countByKey()

defaultdict(int, {1: 1, 3: 2})

## countByValue()

In [19]:
rdd.countByValue()

defaultdict(int, {(1, 2): 1, (3, 4): 1, (3, 6): 1})

## groupByKey()

Group values with the same key.
rdd.groupByKey() 

[(1,[2]),(3, [4,6])]

In [20]:
rdd_iterator = rdd.groupByKey()

In [21]:
rdd_iterator

PythonRDD[15] at RDD at PythonRDD.scala:53

In [22]:
rdd_iterator.collect()

[(1, <pyspark.resultiterable.ResultIterable at 0x7f168c2710d0>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7f168c2713d0>)]

What you're getting back is an object which allows you to iterate over the results. You can turn the results of groupByKey into a list by calling list() on the values, e.g.

In [23]:
rdd.groupByKey().map(lambda x : (x[0], list(x[1]))).collect()

[(1, [2]), (3, [4, 6])]

In [24]:
rdd.groupByKey().map(lambda x : (x[0], min(x[1]))).collect()

[(1, 2), (3, 4)]

In [25]:
rdd.groupByKey().map(lambda x : (x[0], max(x[1]))).collect()

[(1, 2), (3, 6)]

# Ejercicio - Como hacer que la desviación standard se lo calcule Spark.

In [26]:
import numpy as np

In [27]:
rdd.groupByKey().map(lambda x : (x[0], np.std(list(x[1])))).collect()

[(1, 0.0), (3, 1.0)]

---

Another option is to use: mapValues

In [28]:
rdd.groupByKey().mapValues(list).collect()

[(1, [2]), (3, [4, 6])]

In [29]:
# Sum the grouped values of every key; mapValues leaves the key untouched.
rdd.groupByKey().mapValues(sum).collect()

[(1, 2), (3, 10)]

## combineByKey(createCombiner, mergeValue, mergeCombiners, partitioner)

Combine values with the same key using a different result type.



In [7]:
# Build a (sum, count) pair for every key.
rdd.combineByKey((lambda x: (x,1)),  # createCombiner: first value seen for a key -> (value, 1)
 (lambda x, y: (x[0] + y, x[1] + 1)),  # mergeValue: fold one more value into (sum, count)
 (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()  # mergeCombiners: add two (sum, count) pairs

                                                                                

[(1, (2, 1)), (3, (10, 2))]

In [12]:
# Experiment: mergeValue and mergeCombiners only pair up their two arguments
# instead of accumulating, so the result exposes the raw (value, 1) combiners.
rdd.combineByKey((lambda x: (x,1)),
 (lambda x, y: (x, y)),
 (lambda x, y: (x, y))).collect()

[(1, (2, 1)), (3, ((4, 1), (6, 1)))]

In [13]:
# Experiment: mergeValue accumulates (sum, count), but mergeCombiners still just
# pairs the two combiners it receives instead of adding them.
rdd.combineByKey((lambda x: (x,1)),
 (lambda x, y: (x[0] + y, x[1] + 1)),
 (lambda x, y: (x, y))).collect()

[(1, (2, 1)), (3, ((4, 1), (6, 1)))]

In [14]:
# Complete version: all three functions maintain a (sum, count) pair per key.
rdd.combineByKey((lambda x: (x,1)),
 (lambda x, y: (x[0] + y, x[1] + 1)),
 (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()

[(1, (2, 1)), (3, (10, 2))]

## mapValues(func) 
Apply a function to each value of a pair RDD without changing the key.

rdd.mapValues(x => x+1) 

[(1, 3), (3,5), (3,7)]

In [31]:
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

In [32]:
rdd.mapValues(lambda x : x+1).collect()

[(1, 3), (3, 5), (3, 7)]

## flatMapValues(func) 

Apply a function that returns an iterator to each value of a pair RDD, and for each element returned, produce a key/value entry with the old key. Often used for tokenization.

### flatMap & flatMapValues explained in example


In [33]:
rdd1 = sc.parallelize([2, 3, 4])
rdd1.map(lambda x: (1, x)).collect()

[(1, 2), (1, 3), (1, 4)]

In [34]:
rdd1.flatMap(lambda x: range(1, x)).collect()

[1, 1, 2, 1, 2, 3]

In [35]:
rdd2 = sc.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])])
rdd2.collect()

[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]

In [36]:
rdd2.flatMapValues(lambda value:value).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

es lo contrario de: rdd.groupByKey().mapValues(list).collect()

## keys() 

Return an RDD of just the keys.

rdd.keys() 

[1, 3,3]

In [37]:
rdd.keys().collect()

[1, 3, 3]

## values() 

Return an RDD of just the values.

rdd.values() 

[2, 4,6]


In [38]:
rdd.values().collect()

[2, 4, 6]

## sortByKey() 

Return an RDD sorted by the key.

rdd.sortByKey() 

[(1,2), (3,4), (3,6)]

In [39]:
rdd.sortByKey().collect()

[(1, 2), (3, 4), (3, 6)]

Sort by key with descending order

In [40]:
rdd.sortByKey(False).collect()

[(3, 4), (3, 6), (1, 2)]

---

##  Transformations on two pair RDDs (rdd = [(1, 2), (3, 4), (3, 6)] other = [(3, 9)])

In [41]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [42]:
other = sc.parallelize([(3, 9)])

In [43]:
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

In [44]:
other.collect()

[(3, 9)]

<img src="SQL.png" align="center">

## subtractByKey - SQL: Left Excluding Join

Remove elements with a key present in the other RDD.

rdd.subtractByKey(other) 

[(1, 2)]

In [45]:
rdd.subtractByKey(other).collect()

[(1, 2)]

## join - SQL - Inner Join

Perform an inner join between two RDDs.

rdd.join(other) 

[(3, (4, 9)), (3,(6, 9))]

In [46]:
rdd.join(other).collect()

[(3, (4, 9)), (3, (6, 9))]

## rightOuterJoin 

Perform a join between two RDDs where the key must be present in the other RDD (every key of `other` is kept; keys missing from `rdd` get `None` on the left side).


rdd.rightOuterJoin(other) 

[(3, (4, 9)), (3, (6, 9))]


In [47]:
rdd.rightOuterJoin(other).collect()

[(3, (4, 9)), (3, (6, 9))]

## leftOuterJoin 

Perform a join between two RDDs where the key must be present in the first RDD (every key of `rdd` is kept; keys missing from `other` get `None` on the right side).

rdd.leftOuterJoin(other) 

[(1, (2, None)), (3, (4, 9)), (3, (6, 9))]


In [48]:
rdd.leftOuterJoin(other).collect()

[(1, (2, None)), (3, (4, 9)), (3, (6, 9))]

---

# Spark - Pair RDD - Reduce Actions 

## Aggregation

When datasets are described in terms of key/value pairs, it is common to want to aggregate statistics across all elements with the same key. We have looked at the fold(), combine(), and reduce() actions on basic RDDs, and similar per-key transformations exist on pair RDDs. Spark has a similar set of operations that combines values that have the same key. These operations return RDDs and thus are transformations rather than actions.


## reduceByKey

### Per-key average with reduceByKey() and mapValues() in Python

In [49]:
rdd = sc.parallelize([('a', 2), ('c', 4), ('c', 6)])
other = sc.parallelize([('c', 9)])

In [50]:
rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).collect()

[('a', (2, 1)), ('c', (10, 2))]

transformations - map (individual para cada fila) / reduce (verticalmente entre filas - en toda la columna) / action (consolida la respuesta en un objeto de Python, deja de ser algo de Spark)

In [51]:
rdd.collect()

[('a', 2), ('c', 4), ('c', 6)]

In [52]:
# Per-key average: pair every value with a count of 1, add up (sum, count)
# per key, then divide the sum by the count.
(
    rdd.mapValues(lambda value: (value, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
    .collect()
)

[('a', 2.0), ('c', 5.0)]

### Word count in Python

In [21]:
lines = sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [23]:
lines.take(10)

['linea 1 Python', 'linea 2 Python', 'linea 3 Spark']

In [24]:
lines.flatMap(lambda x: x.split(" ")).collect()

['linea', '1', 'Python', 'linea', '2', 'Python', 'linea', '3', 'Spark']

In [25]:
lines.map(lambda x: x.split(" ")).collect()

[['linea', '1', 'Python'], ['linea', '2', 'Python'], ['linea', '3', 'Spark']]

In [27]:
words = lines.flatMap(lambda x: x.split(" "))
words.collect()

['linea', '1', 'Python', 'linea', '2', 'Python', 'linea', '3', 'Spark']

In [31]:
help('a'.isnumeric)

Help on built-in function isnumeric:

isnumeric() method of builtins.str instance
    Return True if the string is a numeric string, False otherwise.
    
    A string is numeric if all characters in the string are numeric and there is at
    least one character in the string.



In [28]:
words.filter(lambda x: not x.isnumeric()).collect()

['linea', 'Python', 'linea', 'Python', 'linea', 'Spark']

In [42]:
words2 = words.filter(lambda x: not x.isnumeric()).map(lambda x: (x, 1))

In [43]:
type(words.filter(lambda x: not x.isnumeric()).map(lambda x: (x, 1)))

pyspark.rdd.PipelinedRDD

In [45]:
type(words2)

pyspark.rdd.PipelinedRDD

In [46]:
words2.reduceByKey(lambda x, y: x + y).collect()

[('Python', 2), ('Spark', 1), ('linea', 3)]

In [39]:
# Word count in one chain: drop numeric tokens, emit (word, 1) pairs and add
# up the ones of every word.
(
    words.filter(lambda token: not token.isnumeric())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('Python', 2), ('Spark', 1), ('linea', 3)]

In [47]:
# Same word-count pipeline, kept as an RDD (no action is triggered here).
result = (
    words.filter(lambda token: not token.isnumeric())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [48]:
result.collect()

[('Python', 2), ('Spark', 1), ('linea', 3)]

## combineByKey() 

combineByKey() is the most general of the per-key aggregation functions. 

### Per-key average using combineByKey() in Python

In [77]:
rdd = sc.parallelize([('a', 2), ('c', 4), ('c', 6)])

In [78]:
rdd.collect()

[('a', 2), ('c', 4), ('c', 6)]

In [79]:
rdd.map(lambda x: (x, 1)).collect()

[(('a', 2), 1), (('c', 4), 1), (('c', 6), 1)]

(LLAVE, VALOR, UNO-INICIALIZADOR)


rdd.combineByKey(INICIALIZADOR - (VALOR_ORIGINAL, 1),
 FUNCIÓN_DE_COMBINACIÓN_DE_FILAS_DENTRO_DE_UNA_MISMA_PARTICIÓN,
 FUNCIÓN_DE_MERGE_COMO_FUSIONAR_PARTICIONES)

In [83]:
# Aggregate every key into a (sum, count) pair.
sumCount = rdd.combineByKey(
    lambda value: (value, 1),                                      # createCombiner
    lambda acc, value: (acc[0] + value, acc[1] + 1),               # mergeValue
    lambda left, right: (left[0] + right[0], left[1] + right[1]),  # mergeCombiners
)


In [84]:
sumCount.collect()

[('c', (10, 2)), ('a', (2, 1))]

In [89]:
sumCount.map(lambda keyValue: keyValue[1][1]).collect()

[2, 1]

In [90]:
# Turn each (sum, count) pair into the per-key average; the key is kept as is.
sumCount.mapValues(lambda sum_count: sum_count[0] / sum_count[1]).collect()

[('c', 5.0), ('a', 2.0)]

In [91]:
# Per-key average in a single chain: build (sum, count) with combineByKey,
# then divide the sum by the count for every key.
(
    rdd.combineByKey(
        lambda value: (value, 1),
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
    .collect()
)

[('c', 5.0), ('a', 2.0)]

### reduceByKey() with custom parallelism in Python

When performing aggregations or grouping operations, we can ask Spark to use a specific number of partitions. Spark will always try to __infer a sensible default value__ based on the size of your cluster, but in some cases __you will want to tune__ the level of parallelism for better performance.

In [49]:
data = [("a", 3), ("b", 4), ("a", 1)]

### Default parallelism

In [50]:
sc.parallelize(data).reduceByKey(lambda x, y: x + y).collect()

[('b', 4), ('a', 4)]

### Custom parallelism

In [51]:
sc.parallelize(data).reduceByKey(lambda x, y: x + y, 10).collect()

[('b', 4), ('a', 4)]

---

# Actions Available on Pair RDDs

## countByKey() 

Count the number of elements for each key.

rdd.countByKey() 

{1: 1, 3: 2}


In [55]:
count_by_key_dict = rdd.countByKey()

In [56]:
count_by_key_dict.keys()

dict_keys(['a', 'c'])

In [57]:
for key in count_by_key_dict.keys():
    print("key={}, value={}".format(key,count_by_key_dict[key]))

key=a, value=1
key=c, value=2


In [61]:
for key, value in count_by_key_dict.items():
    print("key={}, value={}".format(key,value))

key=a, value=1
key=c, value=2


## collectAsMap() 

Collect the result as a map to provide easy lookup.

rdd.collectAsMap() 

{'a': 2, 'c': 6}

In [67]:
rdd.collectAsMap()

{'a': 2, 'c': 6}

In [65]:
# Value of the first pair. first() fetches a single element instead of
# collecting the whole RDD to the driver just to index into it.
rdd.first()[1]

2

In [68]:
rdd.collectAsMap()['a']

2

## lookup(key) 

Return all values associated with the provided key.

rdd.lookup(3) 

[4, 6]

In [69]:
rdd.collect()

[('a', 2), ('c', 4), ('c', 6)]

In [71]:
rdd.lookup('c') 

[4, 6]

In [73]:
type(rdd.lookup('a'))

list

---

## Exercise: filter keys which are associated to at least 2 values - mantener las llaves que tienen por lo menos 2 repeticiones, borrar las que no se repiten.

In [74]:
# Sample (key, value) pairs for the exercise: some keys repeat, others occur once.
my_rdd = sc.parallelize([
    ('key1', '1'), ('key2', '1'), ('key1', '2'), ('key2', '3'),
    ('key4', '1'), ('key1', '4'), ('key4', '1'), ('key6', '2'),
    ('key7', '4'), ('key8', '5'), ('key9', '6'), ('key10', '7'),
])

In [75]:
my_rdd.collect()

[('key1', '1'),
 ('key2', '1'),
 ('key1', '2'),
 ('key2', '3'),
 ('key4', '1'),
 ('key1', '4'),
 ('key4', '1'),
 ('key6', '2'),
 ('key7', '4'),
 ('key8', '5'),
 ('key9', '6'),
 ('key10', '7')]

In [55]:
# create a key-value RDD
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5), (4, 2), (4, 10)])


# define the three accumulator functions (named functions instead of lambdas
# bound to names, so each one carries its own documentation)
def createCombiner(x):
    """Start the accumulator for a key: (sum, count) = (first value, 1)."""
    return (x, 1)


def mergeValue(acc, x):
    """Fold one more value into an existing (sum, count) accumulator."""
    return (acc[0] + x, acc[1] + 1)


def mergeCombiners(acc1, acc2):
    """Add two (sum, count) accumulators of the same key together."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# apply the combineByKey method
combined_rdd = rdd.combineByKey(createCombiner, mergeValue, mergeCombiners)

# print the result
print(combined_rdd.collect())


[(1, (2, 1)), (3, (10, 2)), (4, (17, 3))]


---