# Spark - RDD - Basic Transformations

In [None]:
#!pip install findspark

In [2]:
import pyspark
sc = pyspark.SparkContext()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at /tmp/ipykernel_21269/3076606176.py:2 

# Creando un RDD con 3 lineas de texto!

In [1]:
rdd_lines = sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [2]:
rdd_numbers = sc.parallelize([1,2,3,3] )

.collect() returns the whole rdd()

In [3]:
rdd_lines.collect()

['linea 1 Python', 'linea 2 Python', 'linea 3 Spark']

In [4]:
rdd_numbers.collect() # Action that returns the whole RDD

[1, 2, 3, 3]

## map vs. flatMap 
El flatMap reduce la dimensión de la lista y si es una lista de una dimensión, rompe la string.

In [5]:
rdd_lines.map(lambda x: x.split(' ')).collect()

[['linea', '1', 'Python'], ['linea', '2', 'Python'], ['linea', '3', 'Spark']]

In [6]:
rdd_lines.flatMap(lambda x: x.split(' ')).collect()

['linea', '1', 'Python', 'linea', '2', 'Python', 'linea', '3', 'Spark']

In [7]:
rdd_lines.map(lambda x:x).collect()

['linea 1 Python', 'linea 2 Python', 'linea 3 Spark']

In [8]:
rdd_lines.flatMap(lambda x:x).collect()

['l',
 'i',
 'n',
 'e',
 'a',
 ' ',
 '1',
 ' ',
 'P',
 'y',
 't',
 'h',
 'o',
 'n',
 'l',
 'i',
 'n',
 'e',
 'a',
 ' ',
 '2',
 ' ',
 'P',
 'y',
 't',
 'h',
 'o',
 'n',
 'l',
 'i',
 'n',
 'e',
 'a',
 ' ',
 '3',
 ' ',
 'S',
 'p',
 'a',
 'r',
 'k']

## filter

In [9]:
for item in ["linea 1 Python","linea 2 Python","linea 3 Spark"]:
    print("Python" in item)

True
True
False


In [10]:
rdd_lines.filter(lambda x: "Python" in x).collect()

['linea 1 Python', 'linea 2 Python']

# Juntando flatMap y filter

In [11]:
rdd_lines.flatMap(lambda x: x.split(' ')).filter(lambda x: "Python" in x).collect()

['Python', 'Python']

In [12]:
rdd_lines.filter(lambda x: "Python" in x).flatMap(lambda x: x.split(' ')).collect()

['linea', '1', 'Python', 'linea', '2', 'Python']

In [16]:
rdd_lines.cache()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:194

## distinct

In [13]:
rdd_numbers = sc.parallelize([3,3,2,1] )
rdd_numbers.collect()

[3, 3, 2, 1]

In [14]:
rdd_numbers.distinct().collect()

[2, 3, 1]

## sample without replacement

In [17]:
rdd_numbers.sample(False,0.5).collect()

[3, 1]

## sample with replacement

In [27]:
rdd_numbers.sample(True,0.5).collect()

[3, 2]

---

# Transformation SET OPERATIONS

In [31]:
rdd_more_numbers = sc.parallelize([3,4,2,5])

## union

In [32]:
rdd_numbers.union(rdd_more_numbers).collect()

[3, 3, 2, 1, 3, 4, 2, 5]

## intersection

In [33]:
rdd_numbers.intersection(rdd_more_numbers).collect()

[2, 3]

## subtraction

In [34]:
rdd_numbers.subtract(rdd_more_numbers).collect()

[1]

## cartesian product

In [35]:
rdd_numbers.cartesian(rdd_more_numbers).collect()

[(3, 3),
 (3, 4),
 (3, 3),
 (3, 4),
 (3, 2),
 (3, 5),
 (3, 2),
 (3, 5),
 (2, 3),
 (2, 4),
 (1, 3),
 (1, 4),
 (2, 2),
 (2, 5),
 (1, 2),
 (1, 5)]

## Ejercicio: sumar (x,y) x+y

In [9]:
rdd1 = sc.parallelize([1,2,3,3] )

In [10]:
rdd2 = sc.parallelize([3,4,2,5])

In [12]:
rdd1.cartesian(rdd2).map(lambda x: "({}+{})={}".format(x[0],x[1],x[0]+x[1])).collect()

['(1+3)=4',
 '(1+4)=5',
 '(2+3)=5',
 '(2+4)=6',
 '(1+2)=3',
 '(1+5)=6',
 '(2+2)=4',
 '(2+5)=7',
 '(3+3)=6',
 '(3+4)=7',
 '(3+3)=6',
 '(3+4)=7',
 '(3+2)=5',
 '(3+5)=8',
 '(3+2)=5',
 '(3+5)=8']

---

# Spark - RDD - Basic Actions

## collect

In [44]:
rdd_numbers.collect()

[3, 3, 2, 1]

## count

In [45]:
rdd_numbers.count()

4

## countByValue - same as value_counts() in DataFrame in Pandas

In [46]:
rdd_many_numbers = rdd_numbers.union(rdd_more_numbers)

In [47]:
rdd_many_numbers.collect()

[3, 3, 2, 1, 3, 4, 2, 5]

In [48]:
rdd_many_numbers.countByValue()

defaultdict(int, {3: 3, 2: 2, 1: 1, 4: 1, 5: 1})

## Ejercicio: calcular el numero de ocurrencia de las palabras en rdd_lines

In [13]:
rdd3=sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [14]:
rdd3.flatMap(lambda x: x.split(' ')).filter(lambda x: not x.isnumeric()).countByValue()

defaultdict(int, {'linea': 3, 'Python': 2, 'Spark': 1})

## take - same as head() in DataFrame in Pandas

In [56]:
rdd_many_numbers.take(2)

[3, 3]

## top - return the highest values

In [59]:
rdd_more_numbers = sc.parallelize([3,4,5,2,5])

In [61]:
rdd_more_numbers.top(3)

[5, 5, 4]

In [62]:
rdd_more_numbers.take(3)

[3, 4, 5]

### Ejercicio: coger los 3 valores unicos máximos.

In [69]:
rdd_more_numbers.distinct().top(3)

[5, 4, 3]

## takeOrdered

In [75]:
rdd_more_numbers.collect()

[3, 4, 5, 2, 5]

In [76]:
rdd_more_numbers.take(3)

[3, 4, 5]

In [77]:
rdd_more_numbers.takeOrdered(3,lambda x: -x) # Descending

[5, 5, 4]

In [81]:
rdd_more_numbers.takeOrdered(3,lambda x: x) # Ascending

[2, 3, 4]

In [82]:
rdd_more_numbers.takeOrdered(3) # Ascending

[2, 3, 4]

---

In [89]:
rdd_more_numbers.persist

<bound method RDD.persist of ParallelCollectionRDD[86] at parallelize at PythonRDD.scala:194>

In [88]:
rdd_more_numbers.takeOrdered(rdd_more_numbers.count()) # Ascending

[2, 3, 4, 5, 5]

In [90]:
rdd_more_numbers.unpersist

<bound method RDD.unpersist of ParallelCollectionRDD[86] at parallelize at PythonRDD.scala:194>

---

In [79]:
rdd_more_numbers.takeOrdered(2,lambda x: -x**4 if x % 2 == 0 else -x) # Ascending

[4, 2]

## takeSample

In [93]:
rdd_many_numbers.takeSample(False,3,seed=321) #Without replacement

[5, 3, 4]

In [96]:
rdd_many_numbers.takeSample(True,10,seed=321) #With replacement

[2, 2, 2, 3, 3, 5, 2, 3, 1, 2]

In [101]:
for semilla in range(20):
    print(rdd_many_numbers.takeSample(True,10,seed=semilla)) #With replacement

[3, 3, 4, 2, 3, 3, 3, 4, 3, 5]
[3, 2, 2, 3, 1, 3, 2, 2, 2, 3]
[3, 3, 3, 2, 3, 3, 5, 1, 2, 1]
[3, 3, 3, 2, 3, 2, 1, 2, 3, 3]
[4, 4, 3, 1, 5, 3, 3, 4, 3, 3]
[4, 4, 3, 3, 5, 3, 5, 2, 4, 2]
[3, 4, 3, 3, 1, 1, 5, 4, 5, 5]
[3, 5, 2, 1, 1, 2, 2, 3, 2, 2]
[4, 2, 3, 1, 2, 3, 3, 4, 2, 3]
[4, 1, 5, 3, 5, 4, 4, 3, 4, 3]
[2, 3, 3, 4, 3, 3, 5, 4, 4, 3]
[3, 3, 2, 3, 3, 4, 2, 1, 3, 2]
[1, 3, 3, 1, 1, 2, 3, 4, 4, 3]
[5, 2, 4, 2, 3, 3, 4, 5, 2, 4]
[4, 5, 4, 3, 3, 3, 3, 3, 5, 4]
[5, 2, 4, 4, 2, 2, 2, 3, 4, 1]
[5, 2, 3, 3, 2, 2, 3, 3, 2, 3]
[3, 4, 5, 3, 3, 3, 4, 5, 3, 3]
[3, 5, 2, 3, 2, 3, 3, 2, 3, 5]
[2, 1, 2, 3, 1, 3, 3, 3, 2, 3]


---

# Spark - RDD - Reduce Actions - Reducing the whole list to a single value

## reduce

In [103]:
rdd = sc.parallelize([1, 2, 3, 4]) 
rdd.reduce(lambda a, b: a * b)

24

In [107]:
(1*2)*(3*4)

24

In [108]:
rdd_many_numbers.collect()

[3, 3, 2, 1, 3, 4, 2, 5]

In [109]:
rdd_many_numbers.reduce(lambda a, b: a * b)

2160

## fold - the same as reduce, but you can provide a starting value

In [110]:
sc.parallelize([1,25,8,4,2]).fold(0,lambda a,b:a+b)

40

In [111]:
1+25+8+4+2

40

In [120]:
sc.parallelize([1,25,8,4,2]).fold(1,lambda a,b:a+b)

43

## aggregate

In [115]:
sc.parallelize([1,2,3,4,5]).aggregate(
  (0, 0),
  (lambda acc, value: (acc[0] + value, acc[1] + 1)),
  (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])))

(15, 5)

## reduce by Key

In [121]:
rdd = sc.parallelize([(1,2), (3,4), (3,6)])
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

In [122]:
rdd.reduceByKey(lambda a, b: a + b).collect()

[(1, 2), (3, 10)]

## Persistent (Catching)

In [None]:
rdd.persist

In [None]:
rdd.count()

## Cache

In [None]:
rdd_cached_lines = rdd_lines.cache()

In [None]:
rdd_cached_lines.collect()

In [None]:
rdd_cached_lines.count()

---

# Example 1

In [124]:
lines = sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [125]:
lines.map(lambda x: x.split(' ')).filter(lambda x : "2" in x).map(lambda word : (word, 1)).collect()

[(['linea', '2', 'Python'], 1)]

In [126]:
lines.flatMap(lambda x: x.split(' ')).collect()

['linea', '1', 'Python', 'linea', '2', 'Python', 'linea', '3', 'Spark']

In [127]:
lines.map(lambda x: x.split(' ')).collect()

[['linea', '1', 'Python'], ['linea', '2', 'Python'], ['linea', '3', 'Spark']]

In [128]:
lines.flatMap(lambda x: x.split(' ')).filter(lambda x : "Python" in x).collect()

['Python', 'Python']

In [129]:
lines.flatMap(lambda x: x.split(' ')).filter(lambda x : "Python" in x).map(lambda word : (word, 1)).collect()

[('Python', 1), ('Python', 1)]

In [131]:
from operator import add

In [132]:
lines.flatMap(lambda x: x.split(' ')).filter(lambda x : "Python" in x).map(lambda word : (word, 1)).reduceByKey(add).collect()

[('Python', 2)]

In [133]:
from operator import add

In [134]:
lines.flatMap(lambda x: x.split(' ')).filter(lambda x : "Python" in x or "Spark" in x).map(lambda word : (word, 1)) \
.reduceByKey(add).collect()

[('Python', 2), ('Spark', 1)]

In [135]:
sc.parallelize(range(1,1000001)).filter(lambda x: x%3==0).sum()

166666833333