# Spark - RDD - Basic Transformations

In [None]:
#!pip install findspark

In [1]:
import pyspark
# Reuse the active SparkContext if one already exists; calling the bare
# constructor a second time (e.g. when this cell is re-run) raises an error
# because only one SparkContext may be active at a time.
sc = pyspark.SparkContext.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/02 10:12:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/02 10:12:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Creando un RDD con 3 líneas de texto

In [2]:
# Distribute three short text lines as an RDD of strings.
rdd_lines = sc.parallelize(
    [
        "linea 1 Python",
        "linea 2 Python",
        "linea 3 Spark",
    ]
)

In [3]:
# Distribute a small list of integers (note the duplicated 3) as an RDD.
rdd_numbers = sc.parallelize([1,2,3,3] )

`.collect()` is an action that returns every element of the RDD to the driver as a Python list.

In [None]:
# Bring all the text lines back to the driver.
rdd_lines.collect()

In [None]:
rdd_numbers.collect() # Action: returns every element of the RDD to the driver

## map vs. flatMap 
`flatMap` aplana un nivel el resultado de la función: si esta devuelve una lista, sus elementos pasan a ser elementos del RDD; si devuelve una cadena (string), la rompe en caracteres.

In [None]:
# map keeps one output element per input line: each line becomes a list of
# its space-separated tokens.
tokens_per_line = rdd_lines.map(lambda line: line.split(' '))
tokens_per_line.collect()

In [None]:
# flatMap flattens the per-line token lists into a single RDD of tokens.
all_tokens = rdd_lines.flatMap(lambda line: line.split(' '))
all_tokens.collect()

In [None]:
# Identity map: every line is returned unchanged.
rdd_lines.map(lambda x:x).collect()

In [None]:
# Identity flatMap: each returned string is itself iterated, so the result
# is the individual characters of every line.
rdd_lines.flatMap(lambda x:x).collect()

## filter

In [None]:
# Plain-Python preview of the predicate used by filter below: print, for
# each line, whether it contains the substring "Python".
for item in ("linea 1 Python", "linea 2 Python", "linea 3 Spark"):
    contains_python = "Python" in item
    print(contains_python)

In [None]:
# Keep only the lines that contain the substring "Python".
python_lines = rdd_lines.filter(lambda line: "Python" in line)
python_lines.collect()

# Juntando flatMap y filter

In [None]:
# Split into words first, then keep only the words containing "Python".
words = rdd_lines.flatMap(lambda line: line.split(' '))
words.filter(lambda word: "Python" in word).collect()

In [None]:
# Keep the lines containing "Python" first, then split those lines into
# words (so every word of a matching line is returned).
matching_lines = rdd_lines.filter(lambda line: "Python" in line)
matching_lines.flatMap(lambda line: line.split(' ')).collect()

In [None]:
# Mark rdd_lines to be cached so later actions can reuse it.
rdd_lines.cache()

## distinct

In [None]:
# Rebind rdd_numbers to a new list that contains a duplicate (3 twice).
rdd_numbers = sc.parallelize([3,3,2,1] )
rdd_numbers.collect()

In [None]:
# Remove duplicate values; the order of the result is not guaranteed.
rdd_numbers.distinct().collect()

## sample without replacement

In [None]:
# Without replacement, `fraction` is the probability of keeping each
# element, so the sample size varies between runs (no seed is given).
rdd_numbers.sample(withReplacement=False, fraction=0.5).collect()

## sample with replacement

In [None]:
# With replacement, `fraction` is the expected number of times each element
# is drawn, so an element may appear more than once in the sample.
rdd_numbers.sample(withReplacement=True, fraction=0.5).collect()

---

# Transformation SET OPERATIONS

In [None]:
# Second operand for the set operations below.
rdd_more_numbers = sc.parallelize([3,4,2,5])

## union

In [None]:
# union concatenates both RDDs; duplicates are kept.
rdd_numbers.union(rdd_more_numbers).collect()

## intersection

In [None]:
# Values present in both RDDs; the result contains no duplicates.
rdd_numbers.intersection(rdd_more_numbers).collect()

## subtraction

In [None]:
# Values of rdd_numbers that do not appear in rdd_more_numbers.
rdd_numbers.subtract(rdd_more_numbers).collect()

## cartesian product

In [None]:
# Every (a, b) pair with a from rdd_numbers and b from rdd_more_numbers.
rdd_numbers.cartesian(rdd_more_numbers).collect()

## Ejercicio: sumar (x,y) x+y

In [None]:
# Left operand for the exercise.
rdd1 = sc.parallelize([1,2,3,3] )

In [None]:
# Right operand for the exercise.
rdd2 = sc.parallelize([3,4,2,5])

In [None]:
# Pair every element of rdd1 with every element of rdd2 and render each
# pair together with its sum as "(x+y)=z".
pairs = rdd1.cartesian(rdd2)
pairs.map(lambda p: "({}+{})={}".format(p[0], p[1], p[0] + p[1])).collect()

---

# Spark - RDD - Basic Actions

## collect

In [None]:
# collect: return all elements to the driver as a list.
rdd_numbers.collect()

## count

In [None]:
# count: number of elements in the RDD (duplicates included).
rdd_numbers.count()

## countByValue - same as value_counts() in DataFrame in Pandas

In [None]:
# Concatenate both RDDs to obtain a collection with repeated values.
rdd_many_numbers = rdd_numbers.union(rdd_more_numbers)

In [None]:
# Inspect the combined elements.
rdd_many_numbers.collect()

In [None]:
# countByValue: mapping of each distinct value to its number of occurrences,
# returned to the driver.
rdd_many_numbers.countByValue()

## Ejercicio: calcular el número de ocurrencias de cada palabra en rdd_lines

In [None]:
# Same three text lines as rdd_lines, rebuilt for the exercise.
rdd3=sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [None]:
# Split the lines into tokens, drop the purely numeric ones and count how
# many times each remaining word occurs.
tokens = rdd3.flatMap(lambda line: line.split(' '))
tokens.filter(lambda token: not token.isnumeric()).countByValue()

## take - same as head() in DataFrame in Pandas

In [None]:
# take(2): the first two elements of the RDD.
rdd_many_numbers.take(2)

## top - return the highest values

In [None]:
# Rebind rdd_more_numbers to a list with a repeated maximum (5 twice).
rdd_more_numbers = sc.parallelize([3,4,5,2,5])

In [None]:
# top(3): the three largest elements, in descending order (duplicates kept).
rdd_more_numbers.top(3)

In [None]:
# take(3): the first three elements, with no sorting, for contrast with top.
rdd_more_numbers.take(3)

### Ejercicio: coger los 3 valores unicos máximos.

In [None]:
# Deduplicate first so the repeated 5 is counted once, then take the three
# largest values.
unique_numbers = rdd_more_numbers.distinct()
unique_numbers.top(3)

## takeOrdered

In [None]:
# Show the elements that the following cells order.
rdd_more_numbers.collect()

In [None]:
# First three elements without any ordering, as a baseline.
rdd_more_numbers.take(3)

In [None]:
# Negating the key reverses the order: the three largest values.
rdd_more_numbers.takeOrdered(3, key=lambda value: -value)  # Descending

In [None]:
# An identity key gives the natural order: the three smallest values.
rdd_more_numbers.takeOrdered(3, key=lambda value: value)  # Ascending

In [None]:
# With no key, takeOrdered uses the natural ascending order.
rdd_more_numbers.takeOrdered(3) # Ascending

---

In [None]:
# Actually call persist(): without the parentheses the method is only
# referenced and the RDD is never marked for persistence.
rdd_more_numbers.persist()

In [None]:
# Ask for as many elements as the RDD holds, i.e. the whole RDD sorted.
total = rdd_more_numbers.count()
rdd_more_numbers.takeOrdered(total)  # Ascending

In [None]:
# Actually call unpersist(): without the parentheses the method is only
# referenced and the persisted RDD is never released.
rdd_more_numbers.unpersist()

---

In [None]:
# Custom ordering key: even numbers are ranked by -(x**4) and odd numbers by
# -x, and the two elements with the smallest (most negative) key are returned.
rdd_more_numbers.takeOrdered(2,lambda x: -x**4 if x % 2 == 0 else -x) # Descending by weighted key (evens weighted by x**4)

## takeSample

In [None]:
# Fixed-size sample of 3 elements; the seed makes the draw repeatable.
rdd_many_numbers.takeSample(withReplacement=False, num=3, seed=321)  # Without replacement

In [None]:
# Fixed-size sample of 10 elements drawn with replacement, so it may be
# larger than the RDD; the seed makes the draw repeatable.
rdd_many_numbers.takeSample(withReplacement=True, num=10, seed=321)  # With replacement

In [None]:
# Draw one 10-element sample per seed (`semilla`) from 0 to 19 to show how
# the seed changes the sample.
for semilla in range(20):
    print(rdd_many_numbers.takeSample(True,10,seed=semilla)) #With replacement

---

# Spark - RDD - Reduce Actions - Reducing the whole list to a single value

## reduce

In [None]:
# reduce combines the elements pairwise with the given function; here it
# multiplies all of them together.
rdd = sc.parallelize([1, 2, 3, 4]) 
rdd.reduce(lambda a, b: a * b)

In [None]:
# Same product computed by hand, grouped as partial results that are then
# combined.
(1*2)*(3*4)

In [None]:
# Show the elements multiplied in the next cell.
rdd_many_numbers.collect()

In [None]:
# Product of all the elements of rdd_many_numbers.
rdd_many_numbers.reduce(lambda a, b: a * b)

## fold - the same as reduce, but you can provide a starting value

In [None]:
# fold with 0, the neutral element of addition: the plain sum of the values.
sc.parallelize([1,25,8,4,2]).fold(0,lambda a,b:a+b)

In [None]:
# The same sum computed by hand for comparison.
1+25+8+4+2

In [None]:
# NOTE: 1 is not neutral for addition. Per the PySpark fold documentation the
# zero value is used for each partition and again when merging the partition
# results, so the result is larger than sum + 1 and depends on the number of
# partitions.
sc.parallelize([1,25,8,4,2]).fold(1,lambda a,b:a+b)

## aggregate

In [None]:
# Compute (sum, count) of the elements in one pass with aggregate().
def _add_value(acc, value):
    """Fold one element into a (sum, count) accumulator."""
    return (acc[0] + value, acc[1] + 1)


def _merge_accumulators(left, right):
    """Combine two (sum, count) accumulators into one."""
    return (left[0] + right[0], left[1] + right[1])


sc.parallelize([1, 2, 3, 4, 5]).aggregate((0, 0), _add_value, _merge_accumulators)

## reduce by Key

In [None]:
# Rebind rdd to an RDD of (key, value) pairs; key 3 appears twice.
rdd = sc.parallelize([(1,2), (3,4), (3,6)])
rdd.collect()

In [None]:
# Sum the values that share the same key.
rdd.reduceByKey(lambda a, b: a + b).collect()

## Persist (Caching)

In [None]:
# Actually call persist(): without the parentheses the method is only
# referenced and the RDD is never marked for persistence.
rdd.persist()

In [None]:
# Run an action on the pair RDD.
rdd.count()

## Cache

In [None]:
# cache() returns the RDD it was called on, so the result can be bound to a
# new name.
rdd_cached_lines = rdd_lines.cache()

In [None]:
# First action on the cached RDD.
rdd_cached_lines.collect()

In [None]:
# Second action on the same cached RDD.
rdd_cached_lines.count()

---

# Example 1

In [None]:
# Input text for the example pipeline built step by step below.
lines = sc.parallelize(["linea 1 Python","linea 2 Python","linea 3 Spark"] )

In [None]:
# With map (not flatMap) each element is a whole token list, so the filter
# tests list membership of "2" and the pair's key is the list itself.
token_lists = lines.map(lambda line: line.split(' '))
lists_with_two = token_lists.filter(lambda tokens: "2" in tokens)
lists_with_two.map(lambda tokens: (tokens, 1)).collect()

In [None]:
# Step 1: one flat RDD with every word of every line.
lines.flatMap(lambda x: x.split(' ')).collect()

In [None]:
# For contrast: map keeps one token list per line instead of flattening.
lines.map(lambda x: x.split(' ')).collect()

In [None]:
# Step 2: keep only the words that contain "Python".
all_words = lines.flatMap(lambda line: line.split(' '))
all_words.filter(lambda word: "Python" in word).collect()

In [None]:
# Step 3: turn every matching word into a (word, 1) pair, ready to be summed
# per key.
python_words = (
    lines.flatMap(lambda line: line.split(' '))
    .filter(lambda word: "Python" in word)
)
python_words.map(lambda word: (word, 1)).collect()

In [None]:
from operator import add

In [None]:
# Step 4: add up the 1s of each word to obtain its number of occurrences.
python_pairs = (
    lines.flatMap(lambda line: line.split(' '))
    .filter(lambda word: "Python" in word)
    .map(lambda word: (word, 1))
)
python_pairs.reduceByKey(add).collect()

In [None]:
from operator import add

In [None]:
# Same word count, now keeping the words that contain "Python" or "Spark".
keyword_pairs = (
    lines.flatMap(lambda line: line.split(' '))
    .filter(lambda word: "Python" in word or "Spark" in word)
    .map(lambda word: (word, 1))
)
keyword_pairs.reduceByKey(add).collect()

In [None]:
# Sum of all multiples of 3 between 1 and 1,000,000 (inclusive).
one_to_million = sc.parallelize(range(1, 1000001))
one_to_million.filter(lambda number: number % 3 == 0).sum()