<div style="font-size: 200%; font-weight: bold; padding-bottom: 1em;">Preparation</div>
The following instructions are needed to create a Spark context. If the notebook kernel already provides a Spark context, these commands should not be included.


In [None]:
# %load pyspark_init_mac.py
#
# This configuration works for Spark on macOS using homebrew
#
import os, sys
# Point SPARK_HOME at the Homebrew installation of Spark 2.2.0. This overwrites
# any SPARK_HOME already present in the environment; adjust the path when a
# different Spark version or install location is used.
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# Put Spark's bundled "python" directory first on sys.path so that the
# `import pyspark` below resolves to this installation.
# NOTE(review): pyspark also depends on py4j; presumably it is installed
# separately (or must be added from $SPARK_HOME/python/lib) -- confirm.
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit
def stop_my_spark():
    """Stop the module-level SparkContext ``sc`` and remove the name.

    Intended to be used as an ``atexit`` handler. The function is a no-op
    when no global ``sc`` exists, so it can safely be called more than once
    (e.g. when the registering cell was executed repeatedly).
    """
    # Look the context up in the module globals explicitly. The previous
    # implementation used ``del(sc)`` inside the function, which made ``sc``
    # a local name and caused ``sc.stop()`` to fail with UnboundLocalError.
    context = globals().pop('sc', None)
    if context is not None:
        context.stop()

# Register exit handler so the Spark context is shut down with the interpreter.
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once: re-running this cell must not
# try to create a second context while `sc` is still defined.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    # Pass the configuration explicitly; without `conf=conf` the application
    # name and master set above would be silently ignored.
    sc = SparkContext(conf=conf)
    print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))



# Loading Data into RDD 

On ARC, the data files are stored in HDFS at:
<pre>
    $ hdfs dfs -ls /user/pmolnar/data/AdventureWorks
    Found 4 items
    -rw-r--r--   3 pmolnar hadoop        466 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/Customer.csv.gz
    -rw-r--r--   3 pmolnar hadoop      18125 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/Employees.csv.gz
    -rw-r--r--   3 pmolnar hadoop        603 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/ItemsOrdered.csv.gz
    -rw-r--r--   3 pmolnar hadoop        404 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/SalesTerritory.csv.gz
</pre>

Spark will by default search the HDFS on the cluster. When running Spark on a personal computer it most likely uses the local file system.

In [None]:
# Create an RDD with one element per line of the customer file.
# The path has no scheme, so it is resolved against Spark's default file
# system (HDFS on the cluster, usually the local disk on a personal computer).
# NOTE(review): the .gz file is presumably decompressed transparently by
# textFile() -- confirm for the Spark version in use.
customer_rdd = sc.textFile('/user/pmolnar/data/AdventureWorks/Customer.csv.gz')

In [None]:
# Printing the RDD object shows its description, not the file contents.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(customer_rdd)

In [None]:
# take(10) returns the first 10 lines as a Python list, which is printed as-is.
# Use print('\n'.join(customer_rdd.take(10))) to show one record per line.
print(customer_rdd.take(10))

In [None]:
# Keep the first line of the file; it is used below to filter out the header.
hdr = customer_rdd.first()

In [None]:
# Show the header line (print() call works on Python 2 and Python 3).
print(hdr)

In [None]:
# Redistribute the lines over 4 partitions, then drop every line that is
# identical to the header line.
customer2_rdd = (
    customer_rdd
    .repartition(4)
    .filter(lambda line: line != hdr)
)

In [None]:
# Show the first 10 data lines, one per output line
# (print() call works on Python 2 and Python 3).
print('\n'.join(customer2_rdd.take(10)))

In [None]:
# Step 1: split every CSV line at the commas.
# Step 2: build a 6-tuple per record, converting the first two fields to int
#         and keeping the remaining four fields as strings.
customer3_rdd = (
    customer2_rdd
    .map(lambda line: line.split(','))
    .map(lambda fields: (
        int(fields[0]),
        int(fields[1]),
        fields[2],
        fields[3],
        fields[4],
        fields[5],
    ))
)

In [None]:
# Print the first 10 parsed records, one tuple per line
# (print() call works on Python 2 and Python 3).
for t in customer3_rdd.take(10):
    print(t)

# RDD Transformations

## map()

In [None]:
# map() applies the function to every element; here each letter becomes a
# (letter, 1) pair.
x = sc.parallelize(["b", "a", "c"])
y = x.map(lambda z: (z, 1))
print(x.collect())
print(y.collect())

## filter()

In [None]:
# filter() keeps only the elements for which the predicate is true.
x = sc.parallelize([1, 2, 3])
y = x.filter(lambda n: n % 2 == 1)  # keep odd values
# Show the input next to the result; this print had been swallowed by the
# trailing comment on the previous line.
print(x.collect())
print(y.collect())

## flatMap()

In [None]:
# flatMap() maps every element to a sequence and concatenates the sequences:
# each number n contributes the three values n, n*100 and 42.
x = sc.parallelize([1, 2, 3])
y = x.flatMap(lambda n: (n, n * 100, 42))
print(x.collect())
print(y.collect())

## groupBy()

In [None]:
# groupBy() groups elements by the value of the key function (first letter).
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
y = x.groupBy(lambda w: w[0])
# Each group is an iterable; convert it to a list so its members are printed.
print([(k, list(v)) for (k, v) in y.collect()])

## groupByKey()

In [None]:
# groupByKey() collects all values that share a key in a (key, value) RDD.
x = sc.parallelize([('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)])
y = x.groupByKey()
print(x.collect())
# The grouped values are an iterable; convert to a list for readable output.
print([(key, list(values)) for key, values in y.collect()])

## mapPartitions()

In [None]:
x = sc.parallelize([1, 2, 3], 2)


def f(iterator):
    """Yield the sum of one partition's elements, followed by the constant 42."""
    yield sum(iterator)
    yield 42


# mapPartitions() calls f once per partition with an iterator over its elements.
y = x.mapPartitions(f)
# glom() turns each partition into a list of its elements, so collect() shows
# the partition boundaries as nested lists.
print(x.glom().collect())
print(y.glom().collect())

## mapPartitionsWithIndex()

In [None]:
x = sc.parallelize([1, 2, 3], 2)


def f(partitionIndex, iterator):
    """Yield a single (partition index, sum of the partition's elements) pair."""
    yield (partitionIndex, sum(iterator))


# mapPartitionsWithIndex() calls f once per partition, passing the partition's
# index together with an iterator over its elements.
y = x.mapPartitionsWithIndex(f)
# glom() turns each partition into a list of its elements, so collect() shows
# the partition boundaries as nested lists.
print(x.glom().collect())
print(y.glom().collect())

## sample()

In [None]:
# sample() draws a random subset: here without replacement, with an expected
# fraction of 0.4 (the result size is not fixed) and a fixed seed of 42.
x = sc.parallelize([1, 2, 3, 4, 5])
print(x.collect())
y = x.sample(withReplacement=False, fraction=0.4, seed=42)
print(y.collect())

## union()

In [None]:
# union() concatenates two RDDs; duplicates are kept (3 occurs in both inputs).
x = sc.parallelize([1, 2, 3], numSlices=2)
y = sc.parallelize([3, 4], numSlices=1)
z = x.union(y)
# glom() shows the result partition by partition.
print(z.glom().collect())

## join()

In [None]:
# join() pairs up the values of matching keys from two (key, value) RDDs and
# yields (key, (value_from_x, value_from_y)) for every combination.
x = sc.parallelize([
    ("a", 1),
    ("b", 2),
])
y = sc.parallelize([
    ("a", 3),
    ("a", 4),
    ("b", 5),
])
z = x.join(y)
print(z.collect())

## distinct()

In [None]:
# distinct() removes duplicate elements (the second 3 is dropped); the order
# of the result is not guaranteed.
x = sc.parallelize([1, 2, 3, 3, 4])
y = x.distinct()
print(
    y.collect()
)

## coalesce()

In [None]:
# coalesce() reduces the number of partitions, here from 3 to 2.
x = sc.parallelize([1, 2, 3, 4, 5], numSlices=3)
print(x.glom().collect())
y = x.coalesce(numPartitions=2)
print(y.glom().collect())

## keyBy()

In [None]:
# keyBy() turns every element e into the pair (f(e), e); here the key is the
# first letter of the name.
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
y = x.keyBy(lambda w: w[0])
print(y.collect())

## partitionBy()

In [None]:
# partitionBy() redistributes a (key, value) RDD into the given number of
# partitions; the partition function receives the key and returns the target
# partition (keys before 'H' go to partition 0, all others to partition 1).
x = sc.parallelize([('J', 'James'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'John')], 3)
y = x.partitionBy(2, lambda w: 0 if w[0] < 'H' else 1)
print(x.glom().collect())
print(y.glom().collect())

## zip()

In [None]:
# zip() pairs the i-th element of one RDD with the i-th element of another;
# here every number is paired with its square.
x = sc.parallelize([1, 2, 3])
y = x.map(lambda value: value * value)
z = x.zip(y)
print(
    z.collect()
)

# RDD Actions

## getNumPartitions()

In [None]:
# getNumPartitions() reports how many partitions the RDD has.
x = sc.parallelize([1, 2, 3], numSlices=2)
# Show the per-partition contents first, then the partition count.
print(x.glom().collect())
y = x.getNumPartitions()
print(y)

## collect()

In [None]:
# collect() returns all elements of the RDD to the driver as one flat list.
x = sc.parallelize([1, 2, 3], numSlices=2)
# Per-partition view for comparison with the flat result below.
print(x.glom().collect())
y = x.collect()
print(y)

## reduce()

In [None]:
# reduce() combines all elements with a binary function; here it adds them up.
x = sc.parallelize([1, 2, 3, 4])
print(x.collect())
y = x.reduce(lambda left, right: left + right)
print(y)

## aggregate()

In [None]:
def seqOp(data, item):
    """Fold one element into an accumulator of (list of items, running sum)."""
    return (data[0] + [item], data[1] + item)


def combOp(d1, d2):
    """Merge two (list of items, running sum) accumulators."""
    return (d1[0] + d2[0], d1[1] + d2[1])


x = sc.parallelize([1, 2, 3, 4])

# aggregate() starts from the zero value ([], 0), applies seqOp to the elements
# and uses combOp to merge the intermediate accumulators.
y = x.aggregate(([], 0), seqOp, combOp)

print(y)

## max()

In [None]:
# max() returns the largest element of the RDD.
x = sc.parallelize([2, 4, 1])
print(x.collect())
y = x.max()
print(y)

## sum()

In [None]:
# sum() adds up all elements of the RDD.
x = sc.parallelize([2, 4, 1])
print(x.collect())
y = x.sum()
print(y)

## mean()

In [None]:
# mean() returns the arithmetic mean of the RDD's elements.
x = sc.parallelize([2, 4, 1])
print(x.collect())
y = x.mean()
print(y)

## stdev()

In [None]:
# stdev() returns the standard deviation of the RDD's elements.
x = sc.parallelize([2, 4, 1])
print(x.collect())
y = x.stdev()
print(y)

## countByKey()

In [None]:
# countByKey() counts how many elements exist per key of a (key, value) RDD
# and returns the counts to the driver as a dictionary-like object.
x = sc.parallelize([
    ('J', 'James'),
    ('F', 'Fred'),
    ('A', 'Anna'),
    ('J', 'John'),
])
y = x.countByKey()
print(y)
# Saving RDD

## saveAsTextFile()
Make sure the output path (here `demo2`) does not already exist; otherwise `saveAsTextFile()` fails.

In [None]:
# Write the RDD as text under the path "demo2", then read it back.
x = sc.parallelize([2,4,1])
# NOTE(review): "demo2" is resolved against Spark's default file system and
# must not exist yet -- re-running this cell presumably fails until it is removed.
x.saveAsTextFile("demo2")
# textFile() returns lines, so the values read back are strings, not numbers.
y = sc.textFile("demo2")
print(y.collect())