# RDD - Transformations

## Prepare the Spark context

In [None]:
# Import findspark
import findspark

# Configure the environment; this has to run before pyspark is imported
findspark.init()

# Import the Spark components required for the context creation
from pyspark import SparkConf, SparkContext

# Build the configuration in one chained expression (each setter returns
# the SparkConf itself), then obtain the context: an already running one
# is reused, otherwise a new one is created.
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

## map

In [None]:
# map: apply a function to every element, producing exactly one output
# element per input element. Here each letter becomes a (letter, 1) pair.
rdd1 = sc.parallelize(
    ['a', 'b', 'c', 'a', 'c']
)
rdd2 = rdd1.map(lambda letter: (letter, 1))

# Original letters first, then the derived pairs
print(rdd1.collect())
print(rdd2.collect())

## flatMap

In [None]:
def splitLine(line):
    """Split a line of text into its whitespace-separated words.

    :param line: the string to split
    :return: list with the words of ``line``; empty if there are none
    """
    words = line.split()
    return words

# With map, every line is replaced by the list splitLine returns, so the
# result holds one list of words per input line (a nested structure).
rdd1 = sc.parallelize([
    'en un lugar',
    'de la mancha',
    'de cuyo nombre',
    'no quiero acordarme',
])
rdd2 = rdd1.map(splitLine)

print(rdd1.collect())
print(rdd2.collect())

In [None]:
# With flatMap, the lists returned by splitLine are flattened, so the
# result is a single sequence of words instead of one list per line.
rdd1 = sc.parallelize([
    'en un lugar',
    'de la mancha',
    'de cuyo nombre',
    'no quiero acordarme',
])
rdd2 = rdd1.flatMap(splitLine)

print(rdd1.collect())
print(rdd2.collect())

In [None]:
def splitLineAndRemove(line):
    """Split a line into words, discarding the line 'de la mancha'.

    The function always returns a list, so callers get a consistent type:
    an empty list for the discarded line and the list of words otherwise.
    Because the empty list contributes no elements when flattened, using
    this function with ``flatMap`` removes the discarded line entirely.

    :param line: the string to split
    :return: ``[]`` when ``line`` is exactly ``'de la mancha'``; otherwise
        the whitespace-separated words of ``line``
    """
    # Guard clause: the line to drop yields an empty (but still list) result
    if line == 'de la mancha':
        return []
    return line.split()

In [None]:
# map cannot drop elements: the discarded line still yields one (empty)
# entry in the result, so the output has as many elements as the input.
rdd1 = sc.parallelize([
    'en un lugar',
    'de la mancha',
    'de cuyo nombre',
    'no quiero acordarme',
])
rdd2 = rdd1.map(splitLineAndRemove)

print(rdd1.collect())
print(rdd2.collect())

In [None]:
# flatMap flattens whatever the function returns; the empty result for the
# discarded line contributes nothing, so its words vanish from the output.
rdd1 = sc.parallelize([
    'en un lugar',
    'de la mancha',
    'de cuyo nombre',
    'no quiero acordarme',
])
rdd2 = rdd1.flatMap(splitLineAndRemove)

print(rdd1.collect())
print(rdd2.collect())

## filter

In [None]:
# filter: keep only the elements for which the predicate is true.
# parallelize accepts a range directly, so there is no need to materialize
# it as a list on the driver first.
rdd1 = sc.parallelize(range(10))

# Keep the even numbers
rdd2 = rdd1.filter(lambda number: number % 2 == 0)

print(rdd1.collect())
print(rdd2.collect())

## groupBy

In [None]:
# groupBy: group the elements by the key computed for each of them.
# The key here is the year, i.e. the text before the first '-'.
rdd1 = sc.parallelize([
    '2014-12-31',
    '2015-01-25',
    '2016-05-17',
    '2016-11-08',
    '2017-01-05',
    '2014-08-06',
])
rdd2 = rdd1.groupBy(lambda date: date.split('-')[0])

print(rdd1.collect())
# The grouped values are iterables, so their raw representation is not
# very readable ...
print(rdd2.collect())
# ... hence turn each group into a list before printing it
print([(year, list(dates)) for year, dates in rdd2.collect()])

## sortBy

In [None]:
# sortBy: order the elements by the key computed for each of them.
rdd1 = sc.parallelize([
    ('a', 1),
    ('c', 10),
    ('b', 2),
])

# Ascending by the first component (the letter)
rdd2 = rdd1.sortBy(lambda pair: pair[0])
# Descending by the second component (the number)
rdd3 = rdd1.sortBy(lambda pair: pair[1], ascending=False)

print(rdd1.collect())
print(rdd2.collect())
print(rdd3.collect())

## union

In [None]:
# union: concatenate two RDDs. Duplicates are kept, so 'juan', which is
# present in both inputs, appears twice in the result.
rdd1 = sc.parallelize([
    'daniel', 'jose', 'miguel', 'juan',
])
rdd2 = sc.parallelize([
    'maria', 'rocio', 'cristina', 'lucia', 'juan',
])
rdd3 = rdd1.union(rdd2)

print(rdd3.collect())

## intersection

In [None]:
# intersection: keep only the elements present in both RDDs
# ('maria' and 'lucia' for these inputs).
rdd1 = sc.parallelize([
    'daniel', 'jose', 'miguel', 'maria', 'lucia',
])
rdd2 = sc.parallelize([
    'maria', 'rocio', 'cristina', 'lucia',
])
rdd3 = rdd1.intersection(rdd2)

print(rdd3.collect())

## cartesian

In [None]:
# cartesian: build every (left, right) pair from the two RDDs;
# 5 x 4 elements give 20 pairs here.
rdd1 = sc.parallelize([
    'daniel', 'jose', 'miguel', 'maria', 'lucia',
])
rdd2 = sc.parallelize([
    'maria', 'rocio', 'cristina', 'lucia',
])
rdd3 = rdd1.cartesian(rdd2)

print(rdd3.collect())

## distinct 

In [None]:
# distinct: remove repeated elements ('madrid' and 'valencia' occur twice
# in the input and once in the result).
rdd1 = sc.parallelize([
    'madrid',
    'barcelona',
    'madrid',
    'valencia',
    'sevilla',
    'valencia',
])
rdd2 = rdd1.distinct()

print(rdd1.collect())
print(rdd2.collect())

## sample

In [None]:
# sample: draw a random subset of the RDD.
# parallelize accepts a range directly, so there is no need to materialize
# it as a list on the driver first.
rdd1 = sc.parallelize(range(100))

# Sample without replacement; fraction is the expected share of elements
# kept (about 10%), not an exact count. No seed is given, so the output
# changes from run to run.
rdd2 = rdd1.sample(withReplacement=False, fraction=0.1)

print(rdd2.collect())

## Stop the Spark context

In [None]:
# Stop the Spark context once the notebook no longer needs it
sc.stop()