<div style="font-size: 200%; font-weight: bold; padding-bottom: 1em;">Preparation</div>
The following instructions are need to create a spark context. If the notebook kernel already provides a spark context these commands should not be included.


In [84]:
"""
Load packages and create context objects...
"""
import os
import platform
import sys
if not 'sc' in vars():
    sys.path.append('/usr/hdp/2.4.2.0-258/spark/python')
    os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.11:1.2.0 pyspark-shell'
    import py4j
    import pyspark
    from pyspark.context import SparkContext, SparkConf
    from pyspark.sql import SQLContext, HiveContext
    from pyspark.storagelevel import StorageLevel
    sc = SparkContext()
    import atexit
    atexit.register(lambda: sc.stop())
    print("""Welcome to
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)
else:
    print("""Already running
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)

Already running
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version 1.6.1
          /_/
    


# Loading Data into RDD 

Data files are stored at
<pre>
    $ hdfs dfs -ls /user/pmolnar/data/AdventureWorks
    Found 4 items
    -rw-r--r--   3 pmolnar hadoop        466 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/Customer.csv.gz
    -rw-r--r--   3 pmolnar hadoop      18125 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/Employees.csv.gz
    -rw-r--r--   3 pmolnar hadoop        603 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/ItemsOrdered.csv.gz
    -rw-r--r--   3 pmolnar hadoop        404 2017-02-07 22:46 /user/pmolnar/data/AdventureWorks/SalesTerritory.csv.gz
</pre>

In [41]:
customer_rdd = sc.textFile('/user/pmolnar/data/AdventureWorks/Customer.csv.gz')

In [42]:
print '\n'.join(customer_rdd.take(10))

CustomerID,SalesTerritoryID,FirstName,LastName,City,StateName
10101,1,John,Gray,Lynden,Washington
10298,4,Leroy,Brown,Pinetop,Arizona
10299,1,Elroy,Keller,Snoqualmie,Washington
10315,3,Lisa,Jones,Oshkosh,Wisconsin
10325,1,Ginger,Schultz,Pocatello,Idaho
10329,5,Kelly,Mendoza,Kailua,Hawaii
10330,1,Shawn,Dalton,Cannon Beach,Oregon
10338,1,Michael,Howell,Tillamook,Oregon
10339,4,Anthony,Sanchez,Winslow,Arizona


In [55]:
hdr = customer_rdd.first()

In [56]:
customer2_rdd = customer_rdd.repartition(4).filter(lambda x: x!=hdr)

In [57]:
print '\n'.join(customer2_rdd.take(10))

10101,1,John,Gray,Lynden,Washington
10325,1,Ginger,Schultz,Pocatello,Idaho
10339,4,Anthony,Sanchez,Winslow,Arizona
10419,4,Linda,Sakahara,Nogales,Arizona
10449,4,Isabela,Moore,Yuma,Arizona
10298,4,Leroy,Brown,Pinetop,Arizona
10329,5,Kelly,Mendoza,Kailua,Hawaii
10408,4,Elroy,Cleaver,Globe,Arizona
10429,5,Sarah,Graham,Greensboro,North Carolina
10299,1,Elroy,Keller,Snoqualmie,Washington


In [78]:
#customer3_rdd = customer2_rdd.map(lambda x: x.split(','))
customer3_rdd = customer2_rdd.map(lambda x: x.split(',')).map(lambda x: (int(x[0]), int(x[1]), x[2], x[3], x[4], x[5]))

In [79]:
for t in customer3_rdd.take(10):
    print t

(10101, 1, u'John', u'Gray', u'Lynden', u'Washington')
(10325, 1, u'Ginger', u'Schultz', u'Pocatello', u'Idaho')
(10339, 4, u'Anthony', u'Sanchez', u'Winslow', u'Arizona')
(10419, 4, u'Linda', u'Sakahara', u'Nogales', u'Arizona')
(10449, 4, u'Isabela', u'Moore', u'Yuma', u'Arizona')
(10298, 4, u'Leroy', u'Brown', u'Pinetop', u'Arizona')
(10329, 5, u'Kelly', u'Mendoza', u'Kailua', u'Hawaii')
(10408, 4, u'Elroy', u'Cleaver', u'Globe', u'Arizona')
(10429, 5, u'Sarah', u'Graham', u'Greensboro', u'North Carolina')
(10299, 1, u'Elroy', u'Keller', u'Snoqualmie', u'Washington')


# RDD Transformations

## map()

In [3]:
x = sc.parallelize(["b", "a", "c"])
y = x.map(lambda z: (z, 1))
print x.collect()
print y.collect()

['b', 'a', 'c']
[('b', 1), ('a', 1), ('c', 1)]


## filter()

In [4]:
x = sc.parallelize([1,2,3])
y = x.filter(lambda x: x%2 == 1) #keep odd values print(x.collect())
print y.collect() 

[1, 3]


## flatMap()

In [6]:
x = sc.parallelize([1,2,3])
y = x.flatMap(lambda x: (x, x*100, 42))
print x.collect() 
print y.collect()

[1, 2, 3]
[1, 100, 42, 2, 200, 42, 3, 300, 42]


## groupBy()

In [8]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
y = x.groupBy(lambda w: w[0])
print [(k, list(v)) for (k, v) in y.collect()]

[('A', ['Anna']), ('F', ['Fred']), ('J', ['John', 'James'])]


## groupByKey()

In [10]:
x = sc.parallelize([('B',5),('B',4),('A',3),('A',2),('A',1)])
y = x.groupByKey()
print x.collect() 
print list((j[0], list(j[1])) for j in y.collect())

[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]
[('A', [3, 2, 1]), ('B', [5, 4])]


## mapPartitions()

In [11]:
x = sc.parallelize([1,2,3], 2)
def f(iterator): yield sum(iterator); yield 42
y = x.mapPartitions(f)
# glom() flattens elements on the same partition
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1, 42], [5, 42]]


## mapPartitionWithIndex

In [12]:
x = sc.parallelize([1,2,3], 2)
def f(partitionIndex, iterator):
    yield (partitionIndex, sum(iterator))
    
y = x.mapPartitionsWithIndex(f)
# glom() flattens elements on the same partition
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]


## sample()

In [13]:
x = sc.parallelize([1, 2, 3, 4, 5])
y = x.sample(False, 0.4, 42)
print(x.collect())
print(y.collect())

[1, 2, 3, 4, 5]
[2, 5]


## union()

In [14]:
x = sc.parallelize([1,2,3], 2)
y = sc.parallelize([3,4], 1)
z = x.union(y)
print(z.glom().collect())

[[1], [2, 3], [3, 4]]


## join()

In [15]:
x = sc.parallelize([("a", 1), ("b", 2)])
y = sc.parallelize([("a", 3), ("a", 4), ("b", 5)])
z = x.join(y)
print(z.collect())

[('a', (1, 3)), ('a', (1, 4)), ('b', (2, 5))]


## distinct()

In [16]:
x = sc.parallelize([1,2,3,3,4])
y = x.distinct()
print(y.collect())

[1, 2, 3, 4]


## coalesce()

In [17]:
x = sc.parallelize([1, 2, 3, 4, 5], 3)
y = x.coalesce(2)
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3], [4, 5]]
[[1], [2, 3, 4, 5]]


## keyBy()

In [18]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
y = x.keyBy(lambda w: w[0])
print y.collect()

[('J', 'John'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'James')]


## partitionBy()

In [19]:
x = sc.parallelize([('J','James'),('F','Fred'), ('A','Anna'),('J','John')], 3)
y = x.partitionBy(2, lambda w: 0 if w[0] < 'H' else 1)
print x.glom().collect()
print y.glom().collect()

[[('J', 'James')], [('F', 'Fred')], [('A', 'Anna'), ('J', 'John')]]
[[('F', 'Fred'), ('A', 'Anna')], [('J', 'James'), ('J', 'John')]]


## zip()

In [20]:
x = sc.parallelize([1, 2, 3])
y = x.map(lambda n:n*n)
z = x.zip(y)
print(z.collect())

[(1, 1), (2, 4), (3, 9)]


# RDD Actions

## getNumPartitions()

In [21]:
x = sc.parallelize([1,2,3], 2)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
2


## collect()

In [22]:
x = sc.parallelize([1,2,3], 2)
y = x.collect()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
[1, 2, 3]


## reduce()

In [23]:
x = sc.parallelize([1,2,3,4])
y = x.reduce(lambda a,b: a+b)
print(x.collect())
print(y)

[1, 2, 3, 4]
10


## aggregate()

In [24]:
seqOp = lambda data, item: (data[0] + [item], data[1] + item)
combOp = lambda d1, d2: (d1[0] + d2[0], d1[1] + d2[1])

x = sc.parallelize([1,2,3,4])

y = x.aggregate(([], 0), seqOp, combOp)

print(y)

([1, 2, 3, 4], 10)


## max()

In [26]:
x = sc.parallelize([2,4,1])
y = x.max()
print(x.collect())
print(y)

[2, 4, 1]
4


## sum()

In [27]:
x = sc.parallelize([2,4,1])
y = x.sum()
print(x.collect())
print(y)

[2, 4, 1]
7


## mean()

In [28]:
x = sc.parallelize([2,4,1])
y = x.mean()
print(x.collect())
print(y)

[2, 4, 1]
2.33333333333


## stdev()

In [29]:
x = sc.parallelize([2,4,1])
y = x.stdev()
print(x.collect())
print(y)

[2, 4, 1]
1.24721912892


## countByKey()

In [30]:
x = sc.parallelize([('J', 'James'), ('F','Fred'), ('A','Anna'), ('J','John')])
y = x.countByKey()
print(y)

defaultdict(<type 'int'>, {'A': 1, 'J': 2, 'F': 1})


# Saving RDD

## saveAsTextFile()
Make sure output file does not exist

In [33]:
x = sc.parallelize([2,4,1])
x.saveAsTextFile("/user/pmolnar/demo2")
y = sc.textFile("/user/pmolnar/demo2")
print(y.collect())

[u'2', u'4', u'1']
