### PySpark map function(del tutorial [pyspark.RDD.map](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.map))

In [1]:
# map
# sc is the SparkContext; parallelize builds an RDD from the list it is given.
x = sc.parallelize([1, 2, 3])
# Pair every element with its square: one output element per input element.
y = x.map(lambda n: (n, n * n))
# collect copies the elements of the RDD into a local list.
print(x.collect())
print(y.collect())

[1, 2, 3]
[(1, 1), (2, 4), (3, 9)]


### PySpark flatMap function(del tutorial [pyspark.RDD.flatMap](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.flatMap))

In [4]:
# flatMap
# Returns a new RDD by first applying a function to every element of this RDD
# and then flattening the results into a single sequence.
x = sc.parallelize([1, 2, 3])
y = x.flatMap(lambda n: (n, n * 100, n * n))
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 100, 1, 2, 200, 4, 3, 300, 9]


### PySpark mapPartitions function(del tutorial [pyspark.RDD.mapPartitions](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mapPartitions))

In [5]:
# mapPartitions
# Returns a new RDD by applying a function to each partition of this RDD.

def f(iterator):
    """Yield a single value: the sum of all elements of one partition."""
    partition_total = sum(iterator)
    yield partition_total

x = sc.parallelize([1, 2, 3], 2)
y = x.mapPartitions(f)
# glom groups the elements of each partition so the partition layout is visible.
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1], [5]]


### PySpark mapPartitionsWithIndex function(del tutorial [pyspark.RDD.mapPartitionsWithIndex](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mapPartitionsWithIndex))

In [6]:
# mapPartitionsWithIndex
# Returns a new RDD by applying a function to each partition of this RDD
# while keeping track of the index of the original partition.

def f(partitionIndex, iterator):
    """Yield one (partition index, sum of that partition's elements) pair."""
    partition_total = sum(iterator)
    yield (partitionIndex, partition_total)

x = sc.parallelize([1, 2, 3], 2)
y = x.mapPartitionsWithIndex(f)
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]


### PySpark getNumPartitions function(del tutorial [pyspark.RDD.getNumPartitions](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.getNumPartitions))

In [9]:
# getNumPartitions
# Returns the number of partitions of the RDD.
x = sc.parallelize([1, 2, 3], 2)
# Show the partition layout first, then the partition count.
print(x.glom().collect())
y = x.getNumPartitions()
print(y)

[[1], [2, 3]]
2


### PySpark filter function(del tutorial [pyspark.RDD.filter](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.filter))

In [10]:
# filter
# Returns a new RDD containing only the elements that satisfy a condition.
x = sc.parallelize([1, 2, 3])
# Keep the odd elements (non-zero remainder when divided by 2).
y = x.filter(lambda n: n % 2 != 0)
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]


### PySpark distinct function(del tutorial [pyspark.RDD.distinct](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.distinct))

In [11]:
# distinct
# Returns a new RDD containing the distinct elements of this RDD.
x = sc.parallelize(['A', 'A', 'B'])
print(x.collect())
y = x.distinct()
print(y.collect())

['A', 'A', 'B']
['B', 'A']


### PySpark union function(del tutorial [pyspark.RDD.union](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.union))

In [12]:
# union
# Returns the union of 2 RDDs (duplicates are kept, as the output shows).
x = sc.parallelize(['A', 'A', 'B'])
print(x.collect())
y = sc.parallelize(['D', 'C', 'A'])
print(y.collect())
z = x.union(y)
print(z.collect())

['A', 'A', 'B']
['D', 'C', 'A']
['A', 'A', 'B', 'D', 'C', 'A']


### PySpark intersection function(del tutorial [pyspark.RDD.intersection](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.intersection))

In [14]:
# intersection
# Returns the intersection (elements present in both sets) of 2 RDDs,
# without duplicate elements.
x = sc.parallelize(['A', 'A', 'B'])
print(x.collect())
y = sc.parallelize(['A', 'C', 'D'])
print(y.collect())
z = x.intersection(y)
print(z.collect())

['A', 'A', 'B']
['A', 'C', 'D']
['A']


### PySpark sortByKey function(del tutorial [pyspark.RDD.sortByKey](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sortByKey))

In [15]:
# sortByKey
# Sorts an RDD of (key, value) pairs by key; called with no arguments here,
# which gives ascending key order in the output.
x = sc.parallelize([('B', 1), ('A', 2), ('C', 3)])
print(x.collect())
y = x.sortByKey()
print(y.collect())

[('B', 1), ('A', 2), ('C', 3)]
[('A', 2), ('B', 1), ('C', 3)]


### PySpark sortBy function(del tutorial [pyspark.RDD.sortBy](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sortBy))

In [16]:
# sortBy
# Sorts the RDD by the key computed for each element by the given function.

def keyGen(val):
    """Sort key for one element: its first character."""
    first_char = val[0]
    return first_char

x = sc.parallelize(['Cat', 'Apple', 'Bat'])
y = x.sortBy(keyGen)
print(y.collect())

['Apple', 'Bat', 'Cat']


### PySpark glom function(del tutorial [pyspark.RDD.glom](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.glom))

In [17]:
# glom
# Groups the elements of each partition into a list, one list per partition.
x = sc.parallelize(['C', 'B', 'A'], 2)
print(x.collect())
y = x.glom()
print(y.collect())

['C', 'B', 'A']
[['C'], ['B', 'A']]


### PySpark cartesian function(del tutorial [pyspark.RDD.cartesian](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.cartesian))

In [18]:
# cartesian
# Returns the Cartesian product of 2 RDDs.
x = sc.parallelize(['A', 'B'])
print(x.collect())
y = sc.parallelize(['C', 'D'])
print(y.collect())
z = x.cartesian(y)
print(z.collect())

['A', 'B']
['C', 'D']
[('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D')]


### PySpark groupBy function(del tutorial [pyspark.RDD.groupBy](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.groupBy))

In [19]:
# groupBy
# Returns an RDD of the elements grouped under the key computed for each one.
x = sc.parallelize([1, 2, 3])
# Even elements are grouped under 'B', all others under 'A'.
y = x.groupBy(lambda n: 'B' if n % 2 == 0 else 'A')
print(x.collect())
# Each group is a nested iterable; turn it into a list so it prints readably.
print([(key, list(members)) for key, members in y.collect()])

[1, 2, 3]
[('B', [2]), ('A', [1, 3])]


### PySpark collect function(del tutorial [pyspark.RDD.collect](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.collect))

In [20]:
# collect
# Returns a list containing all the elements of the RDD.
x = sc.parallelize([1, 2, 3])
print(x)  # x is the distributed RDD, so this prints its description
y = x.collect()
print(y)  # y is an ordinary local list (not distributed)

ParallelCollectionRDD[79] at parallelize at PythonRDD.scala:475
[1, 2, 3]


### PySpark reduce function(del tutorial [pyspark.RDD.reduce](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.reduce))

In [22]:
# reduce
# Combines all the elements of the RDD into a single value with a
# two-argument function.
x = sc.parallelize([1, 2, 3])
print(x.collect())
y = x.reduce(lambda acc, item: acc + item)  # running sum of the elements
print(y)

[1, 2, 3]
6


### PySpark max function(del tutorial [pyspark.RDD.max](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.max))

In [24]:
# max
# Returns the largest item in this RDD.
x = sc.parallelize([1, 3, 2])
print(x.collect())
y = x.max()
print(y)

[1, 3, 2]
3


### PySpark min function(del tutorial [pyspark.RDD.min](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.min))

In [25]:
# min
# Returns the smallest item in this RDD.
x = sc.parallelize([1, 3, 2])
print(x.collect())
y = x.min()
print(y)

[1, 3, 2]
1


### PySpark sum function(del tutorial [pyspark.RDD.sum](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sum))

In [26]:
# sum
# Adds up all the elements of this RDD.
x = sc.parallelize([1, 3, 2])
print(x.collect())
y = x.sum()
print(y)

[1, 3, 2]
6


### PySpark count function(del tutorial [pyspark.RDD.count](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.count))

In [27]:
# count
# Returns the number of elements in this RDD.
x = sc.parallelize([1, 3, 2])
print(x.collect())
y = x.count()
print(y)

[1, 3, 2]
3


### PySpark mean function(del tutorial [pyspark.RDD.mean](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mean))

In [31]:
# mean
# Computes the arithmetic mean of the elements of this RDD.
x = sc.parallelize([1, 3, 2])
print(x.collect())
y = x.mean()
print(y)

[1, 3, 2]
2.0


### PySpark countByValue function(del tutorial [pyspark.RDD.countByValue](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.countByValue))

In [32]:
# countByValue
# Returns the count of each unique value in this RDD as a dictionary that
# maps value -> number of occurrences.
x = sc.parallelize([1, 3, 1, 2, 3])
print(x.collect())
y = x.countByValue()
print(y)

[1, 3, 1, 2, 3]
defaultdict(<type 'int'>, {1: 2, 2: 1, 3: 2})


### PySpark top function(del tutorial [pyspark.RDD.top](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.top))

In [33]:
# top
# Returns the `num` largest elements of the RDD, largest first.
x = sc.parallelize([1, 3, 1, 2, 3])
print(x.collect())
y = x.top(3)
print(y)

[1, 3, 1, 2, 3]
[3, 3, 2]


### PySpark takeOrdered function(del tutorial [pyspark.RDD.takeOrdered](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.takeOrdered))

In [35]:
# takeOrdered
# Returns the `num` smallest elements of the RDD, in ascending order.
x = sc.parallelize([1, 3, 1, 2, 3])
print(x.collect())
y = x.takeOrdered(2)
print(y)

[1, 3, 1, 2, 3]
[1, 1]


### PySpark take function(del tutorial [pyspark.RDD.take](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.take))

In [36]:
# take
# Returns the first `num` elements of the RDD, in their existing order.
x = sc.parallelize([1, 3, 1, 2, 3])
print(x.collect())
y = x.take(3)
print(y)

[1, 3, 1, 2, 3]
[1, 3, 1]


### PySpark first function(del tutorial [pyspark.RDD.first](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.first))

In [37]:
# first
# Returns the first element of this RDD.
x = sc.parallelize([1, 3, 1, 2, 3])
print(x.collect())
y = x.first()
print(y)

[1, 3, 1, 2, 3]
1


### PySpark collectAsMap function(del tutorial [pyspark.RDD.collectAsMap](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.collectAsMap))

In [38]:
# collectAsMap
# Returns the (key, value) pairs of this RDD as a local dictionary.
x = sc.parallelize([('C', 3), ('A', 1), ('B', 2)])
print(x.collect())
y = x.collectAsMap()
print(y)

[('C', 3), ('A', 1), ('B', 2)]
{'A': 1, 'C': 3, 'B': 2}


### PySpark keys function(del tutorial [pyspark.RDD.keys](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.keys))

In [39]:
# keys
# Returns an RDD holding only the key of each (key, value) pair.
x = sc.parallelize([('C', 3), ('A', 1), ('B', 2)])
print(x.collect())
y = x.keys()
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
['C', 'A', 'B']


### PySpark values function(del tutorial [pyspark.RDD.values](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.values))

In [40]:
# values
# Returns an RDD holding only the value of each (key, value) pair.
x = sc.parallelize([('C', 3), ('A', 1), ('B', 2)])
print(x.collect())
y = x.values()
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
[3, 1, 2]


### PySpark reduceByKey function(del tutorial [pyspark.RDD.reduceByKey](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.reduceByKey))

In [41]:
# reduceByKey
# Merges the values that share a key using a two-argument function.
x = sc.parallelize([('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)])
print(x.collect())
y = x.reduceByKey(lambda total, value: total + value)  # per-key sum
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('B', 3), ('A', 12)]


### PySpark countByKey function(del tutorial [pyspark.RDD.countByKey](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.countByKey))

In [42]:
# countByKey
# Counts how many elements exist for each key and returns the result as a
# dictionary that maps key -> count.
x = sc.parallelize([('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)])
print(x.collect())
y = x.countByKey()
print(y)

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
defaultdict(<type 'int'>, {'A': 3, 'B': 2})


### PySpark join function(del tutorial [pyspark.RDD.join](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.join))

In [43]:
# join
# Pairs up the values of both RDDs that share a key; keys present in only
# one of the RDDs ('C' and 'D' here) do not appear in the result.
x = sc.parallelize([('C', 4), ('B', 3), ('A', 2), ('A', 1)])
print(x.collect())
y = sc.parallelize([('A', 8), ('B', 7), ('A', 6), ('D', 5)])
print(y.collect())
z = x.join(y)
print(z.collect())

[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('B', (3, 7)), ('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6))]


### PySpark leftOuterJoin function(del tutorial [pyspark.RDD.leftOuterJoin](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.leftOuterJoin))

In [44]:
# leftOuterJoin
# Like join, but every key of x is kept; a key with no match in y
# ('C' here) is paired with None.
x = sc.parallelize([('C', 4), ('B', 3), ('A', 2), ('A', 1)])
print(x.collect())
y = sc.parallelize([('A', 8), ('B', 7), ('A', 6), ('D', 5)])
print(y.collect())
z = x.leftOuterJoin(y)
print(z.collect())

[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('B', (3, 7)), ('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('C', (4, None))]


### PySpark rightOuterJoin function(del tutorial [pyspark.RDD.rightOuterJoin](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.rightOuterJoin))

In [45]:
# rightOuterJoin
# Like join, but every key of y is kept; a key with no match in x
# ('D' here) is paired with None.
x = sc.parallelize([('C', 4), ('B', 3), ('A', 2), ('A', 1)])
print(x.collect())
y = sc.parallelize([('A', 8), ('B', 7), ('A', 6), ('D', 5)])
print(y.collect())
z = x.rightOuterJoin(y)
print(z.collect())

[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('B', (3, 7)), ('D', (None, 5)), ('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6))]


### PySpark partitionBy function(del tutorial [pyspark.RDD.partitionBy](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.partitionBy))

In [47]:
# partitionBy
# Redistributes a (key, value) RDD across partitions using a partitioning
# function.
x = sc.parallelize([(0, 1), (1, 2), (2, 3)], 2)
print(x.glom().collect())
# Only the key is passed to partitionFunc; here the key itself is returned.
y = x.partitionBy(2, lambda key: key)
print(y.glom().collect())

[[(0, 1)], [(1, 2), (2, 3)]]
[[(0, 1), (2, 3)], [(1, 2)]]


### PySpark foldByKey function(del tutorial [pyspark.RDD.foldByKey](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.foldByKey))

In [48]:
# foldByKey
# Merges the values of each key with a two-argument function, starting from
# a neutral "zero value".
x = sc.parallelize([('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)])
print(x.collect())
zeroValue = 1  # 1 is the "zero value" (neutral element) for multiplication
# Cumulative product of the values within each key.
y = x.foldByKey(zeroValue, lambda product, value: product * value)
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('B', 2), ('A', 60)]


### PySpark groupByKey function(del tutorial [pyspark.RDD.groupByKey](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.groupByKey))

In [49]:
# groupByKey
# Groups the values of each key into a single sequence.
x = sc.parallelize([('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)])
print(x.collect())
y = x.groupByKey()
# Each group is a nested iterable; turn it into a list so it prints readably.
print([(key, list(members)) for key, members in y.collect()])

[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]
[('B', [5, 4]), ('A', [3, 2, 1])]


### PySpark flatMapValues function(del tutorial [pyspark.RDD.flatMapValues](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.flatMapValues))

In [50]:
# flatMapValues
# The function is applied to the entire value of each pair and the result is
# then flattened, giving one (key, item) pair per produced item.
x = sc.parallelize([('A', (1, 2, 3)), ('B', (4, 5))])
print(x.collect())
y = x.flatMapValues(lambda values: [v * v for v in values])
print(y.collect())

[('A', (1, 2, 3)), ('B', (4, 5))]
[('A', 1), ('A', 4), ('A', 9), ('B', 16), ('B', 25)]


### PySpark mapValues function(del tutorial [pyspark.RDD.mapValues](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mapValues))

In [51]:
# mapValues
# The function is applied to the entire value of each pair; keys are kept
# and the result is not flattened.
x = sc.parallelize([('A', (1, 2, 3)), ('B', (4, 5))])
print(x.collect())
y = x.mapValues(lambda values: [v * v for v in values])
print(y.collect())

[('A', (1, 2, 3)), ('B', (4, 5))]
[('A', [1, 4, 9]), ('B', [16, 25])]


### PySpark keyBy function(del tutorial [pyspark.RDD.keyBy](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.keyBy))

In [54]:
# keyBy
# Builds (key, element) pairs, where the key is computed from each element.
x = sc.parallelize([1, 2, 3])
print(x.collect())
y = x.keyBy(lambda n: n * n)  # the key is the square of the element
print(y.collect())

[1, 2, 3]
[(1, 1), (4, 2), (9, 3)]


### PySpark repartition function(del tutorial [pyspark.RDD.repartition](http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.repartition))

In [59]:
# repartition
# Returns a new RDD with the requested number of partitions (still 3 here);
# as the output shows, elements can end up distributed differently.
x = sc.parallelize([1, 2, 3, 4, 5], 3)
print(x.glom().collect())
y = x.repartition(3)
print(y.glom().collect())

[[1], [2, 3], [4, 5]]
[[], [1, 2, 3], [4, 5]]
