# Pair RDD - Transformations

## Prepare the Spark context

In [None]:
# Import findspark
import findspark

# Configure the environment. This call is deliberately placed before the
# pyspark import below; presumably it is what makes the local Spark
# installation importable (verify against the findspark documentation).
findspark.init()

# Import the Spark components required for the context creation
from pyspark import SparkConf, SparkContext

# Build the configuration in one fluent chain (each setter hands back the
# configuration object), then obtain the context from it
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

## sortByKey

In [None]:
# Sample 'YYYY-MM-DD' strings, deliberately not in chronological order
rdd1 = sc.parallelize([
    '2014-12-31',
    '2015-01-25',
    '2016-05-17',
    '2016-11-08',
    '2017-01-05',
    '2014-08-06',
])

# Group the dates under the text that precedes their first '-' (the year field)
rdd2 = rdd1.groupBy(lambda date: date.partition('-')[0])
print([(year, list(dates)) for year, dates in rdd2.collect()])

# Same groups, now ordered by their key
rdd3 = rdd2.sortByKey()
print([(year, list(dates)) for year, dates in rdd3.collect()])

## reduceByKey

In [None]:
# Sample 'YYYY-MM-DD' strings used as input for the reduceByKey example
rdd1 = sc.parallelize([
    '2014-12-31',
    '2015-01-25',
    '2016-05-17',
    '2016-11-08',
    '2017-01-05',
    '2014-08-06',
])

def parseDate(date):
    """Turn a 'YYYY-MM-DD' string into a (year, 'MM-DD') pair.

    The input must consist of exactly three '-'-separated fields; any other
    count makes the tuple unpacking raise ValueError.
    """
    fields = date.split('-')
    year, month, day = fields
    return year, '-'.join((month, day))

# Convert every date string into the pair returned by parseDate, so the
# result can be used with the by-key operations below
rdd2 = rdd1.map(parseDate)

def maxDate(date1, date2):
    """Return whichever argument compares greater; on a tie, return date2.

    The comparison is a plain `>`, so for the 'MM-DD' strings used in this
    notebook the order is lexicographic (chronological only while the fields
    stay zero-padded).
    """
    return date1 if date1 > date2 else date2
    
# Merge the values of each key with maxDate, then order the result by key
rdd3 = (
    rdd2
    .reduceByKey(maxDate)
    .sortByKey()
)

# Show the (key, value) pairs before and after the reduction
print(rdd2.collect())
print(rdd3.collect())

## join, leftOuterJoin, rightOuterJoin, fullOuterJoin

In [None]:
# Two pair RDDs sharing the keys 2014 and 2016;
# key 2015 exists only in rdd1 and key 2017 only in rdd2
rdd1 = sc.parallelize([
    (2014, 10000),
    (2015, 20000),
    (2016, 30000),
])
rdd2 = sc.parallelize([
    (2014, -500),
    (2016, -6000),
    (2017, -9000),
])

In [None]:
# Inner join: only keys present in both rdd1 and rdd2 are kept,
# each paired with (value from rdd1, value from rdd2)
rdd3 = (
    rdd1
    .join(rdd2)
    .sortByKey()
)
print(rdd3.collect())

In [None]:
# Left outer join: every key of rdd1 is kept; where rdd2 has no such key
# the right-hand value is None
rdd3 = (
    rdd1
    .leftOuterJoin(rdd2)
    .sortByKey()
)
print(rdd3.collect())

In [None]:
# Right outer join: every key of rdd2 is kept; where rdd1 has no such key
# the left-hand value is None
rdd3 = (
    rdd1
    .rightOuterJoin(rdd2)
    .sortByKey()
)
print(rdd3.collect())

In [None]:
# Full outer join: keys from either RDD are kept; the side that lacks a key
# contributes None
rdd3 = (
    rdd1
    .fullOuterJoin(rdd2)
    .sortByKey()
)
print(rdd3.collect())

## Close the Spark context

In [None]:
# Stop the SparkContext held in `sc` now that all examples have run
sc.stop()