In [8]:
from functools import reduce

import findspark
# findspark.init() has to run before pyspark is imported: it makes the Spark
# installation importable from this kernel.
findspark.init()
from pyspark import SparkContext

## Map, Filter and Reduce in Python

In [35]:
# Sample data: the integers 1..9.
# list() materializes the range so the cell echoes the actual elements on
# Python 3 too (there a bare range() is lazy and is shown as `range(1, 10)`).
a = list(range(1, 10))
a

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [50]:
# Sample list of 2-tuples used by the tuple-based map/filter/reduce examples.
t = [
    (1, 5),
    (3, 9),
    (5, 8),
]
t

[(1, 5), (3, 9), (5, 8)]

### map

In [38]:
# Square every element of `a`.
# list() forces the lazy map object that Python 3 returns, so the squares are
# displayed on both Python 2 and Python 3.
list(map(lambda x: x ** 2, a))

[1, 4, 9, 16, 25, 36, 49, 64, 81]

In [65]:
# Take the first item of every tuple in `t`.
# The lambda argument is called `pair` so it does not shadow the global list
# `t`; list() materializes the result on Python 3.
list(map(lambda pair: pair[0], t))

[1, 3, 5]

In [61]:
# Multiply the two items of every tuple in `t`.
# `pair` avoids shadowing the global list `t`; list() materializes the lazy
# map object on Python 3.
list(map(lambda pair: pair[0] * pair[1], t))

[5, 27, 40]

In [53]:
# Expand every (start, stop) tuple in `t` into the list of integers start..stop-1.
# Both the outer map and the inner range are lazy on Python 3, so each is
# wrapped in list() to get a real list of lists on either Python version.
# `pair` avoids shadowing the global list `t`.
list(map(lambda pair: list(range(pair[0], pair[1])), t))

[[1, 2, 3, 4], [3, 4, 5, 6, 7, 8], [5, 6, 7]]

### filter

In [39]:
# Keep only the elements of `a` that are divisible by 3.
# list() materializes the lazy filter object that Python 3 returns.
list(filter(lambda x: x % 3 == 0, a))

[3, 6, 9]

In [67]:
# Keep only the tuples of `t` whose first item is greater than 2.
# `pair` avoids shadowing the global list `t`; list() materializes the lazy
# filter object on Python 3.
list(filter(lambda pair: pair[0] > 2, t))

[(3, 9), (5, 8)]

### reduce

In [41]:
# Sum the elements of `a` by folding them pairwise, left to right.
# `reduce` comes from functools (imported in the first cell): Python 3 no
# longer provides it as a builtin, and the functools import also works on Python 2.
reduce(lambda total, x: total + x, a)

45

In [62]:
# Sum of the per-tuple products: map computes first*second for every tuple in
# `t`, reduce then adds the products together.
# `reduce` comes from functools (imported in the first cell) because Python 3
# dropped the builtin; `pair` avoids shadowing the global list `t`.
reduce(lambda total, x: total + x, map(lambda pair: pair[0] * pair[1], t))

72

### flattening

In [82]:
from itertools import chain

# Build one list of integers per (start, stop) tuple.
# Everything is materialized with list(): on Python 3 a bare map object would be
# consumed by chain.from_iterable below and then print as `<map object ...>`.
b = list(map(lambda bounds: list(range(bounds[0], bounds[1])), [(1, 5), (10, 15)]))

# Flatten the list of lists into a single list, keeping the original order.
flat = list(chain.from_iterable(b))

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(b)
print(flat)

[[1, 2, 3, 4], [10, 11, 12, 13, 14]]
[1, 2, 3, 4, 10, 11, 12, 13, 14]


# start Spark
***

## SparkContext
***

In [None]:
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
# A bare SparkContext() raises ValueError when this cell is executed a second
# time in the same kernel, because only one context may be active at a time.
sc = SparkContext.getOrCreate()

## parallelize
***

In [87]:
# Distribute the integers 1..9 as an RDD; `sc` is the SparkContext created above.
list_rdd = sc.parallelize(range(1,10))

In [17]:
# Two small (name, value) datasets that share the same names, each turned
# into an RDD of pairs.
age = [("Jimmy", 18), ("Bob", 22), ("Rod", 29)]
age_rdd = sc.parallelize(age)

salary = [("Jimmy", 20000), ("Bob", 23000), ("Rod", 25000)]
salary_rdd = sc.parallelize(salary)

### take

In [88]:
# Fetch only the first five elements of the RDD as a Python list.
list_rdd.take(5)

[1, 2, 3, 4, 5]

### collect

In [89]:
# Bring every element of the RDD back as a Python list.
# Fine for this tiny example; avoid collect() on large RDDs.
list_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [104]:
# Show the full contents of the (name, age) RDD.
age_rdd.collect()

[('Jimmy', 18), ('Bob', 22), ('Rod', 29)]

In [105]:
# Show the full contents of the (name, salary) RDD.
salary_rdd.collect()

[('Jimmy', 20000), ('Bob', 23000), ('Rod', 25000)]

## map, filter and reduce in Spark
***

### map
***

In [92]:
# RDD of the squares of list_rdd's elements. map() is a transformation: it
# only describes the computation, which runs once an action such as
# collect() is called on the result.
mapList = list_rdd.map(lambda n: n ** 2)

In [93]:
# Run the map and return the squared values as a Python list.
mapList.collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81]

In [106]:
# Project every (name, age) record onto its name and collect the names.
age_rdd.map(lambda record: record[0]).collect()

['Jimmy', 'Bob', 'Rod']

### filter
***

In [94]:
# RDD holding only the even elements of list_rdd.
filterList = list_rdd.filter(lambda n: n % 2 == 0)

In [95]:
# Run the filter and return the even values as a Python list.
filterList.collect()

[2, 4, 6, 8]

In [111]:
# Records whose salary (second item of the pair) is above 21000.
salary_rdd.filter(lambda record: record[1] > 21000).collect()

[('Bob', 23000), ('Rod', 25000)]

In [112]:
# Names of everyone earning more than 21000: filter on the salary, then keep
# only the name of each remaining record.
(
    salary_rdd
    .filter(lambda record: record[1] > 21000)
    .map(lambda record: record[0])
    .collect()
)

['Bob', 'Rod']

### reduce
***

In [98]:
# Fold the RDD down to a single number by adding its elements pairwise.
# reduce() is an action, so the result is a plain Python value, not an RDD.
reduceList = list_rdd.reduce(lambda left, right: left + right)
reduceList

45

In [114]:
# Total age of everyone older than 20: keep the matching records, extract the
# age from each, then add the ages together.
(
    age_rdd
    .filter(lambda person: person[1] > 20)
    .map(lambda person: person[1])
    .reduce(lambda left, right: left + right)
)

51

### other reducers
***

In [100]:
# Built-in shortcut for the reduce-with-addition pattern: sum of all squares.
mapList.sum()

285

In [101]:
# Largest element of the RDD.
mapList.max()

81

In [102]:
# Smallest element of the RDD.
mapList.min()

1

In [103]:
# Arithmetic mean of the RDD's elements (returned as a float).
mapList.mean()

31.666666666666668

## join
***

In [115]:
# Join the two pair RDDs on their key (the name). Each result element has the
# form (name, (salary, age)): the value from the RDD that join() is called on
# comes first, followed by the value from the RDD passed as argument.
total_rdd = salary_rdd.join(age_rdd)
total_rdd.collect()

[('Jimmy', (20000, 18)), ('Bob', (23000, 22)), ('Rod', (25000, 29))]