### Finding Prime Numbers

In [None]:
n = 500000


def multiples(x):
    """Return the proper multiples of x (2x, 3x, ...) that are smaller than n."""
    return xrange(2 * x, n, x)


# Candidates 2 .. n-1, spread over 8 partitions.
allnumbers = sc.parallelize(xrange(2, n), 8)
# Every proper multiple of every candidate (duplicates are not removed here).
composite = allnumbers.flatMap(multiples)
# Candidates that never show up as a multiple of another candidate.
prime = allnumbers.subtract(composite)
prime.take(10)

In [23]:
# Find the number of elements in each partition
def partitionsize(it):
    """Yield exactly one value: how many elements the iterator `it` produces.

    Written as a generator so it can be passed directly to mapPartitions,
    giving one count per partition.
    """
    count = 0
    # enumerate(..., 1) leaves `count` at the 1-based position of the last
    # element; for an empty iterator the loop never runs and count stays 0.
    for count, _ in enumerate(it, 1):
        pass
    yield count

# Per-partition element counts for each stage of the prime computation.
for staged_rdd in (allnumbers, composite, prime):
    print(staged_rdd.mapPartitions(partitionsize).collect())
# First four elements of the second partition of `prime`.
print(prime.glom().collect()[1][:4])

[62499, 62500, 62500, 62500, 62499, 62500, 62500, 62500]
[5216986, 254759, 104166, 62499, 0, 0, 0, 0]
[0, 5169, 1, 5219, 0, 5206, 0, 5189, 0, 5165, 0, 5199, 0, 5191, 0, 5199]
[17, 401537, 462641, 122209]

### Data Partitioning

In [1]:
data = [8, 96, 240, 400, 401, 800]
# Key every value by itself, distribute over 4 partitions, then sum per key.
rdd = sc.parallelize(zip(data, data), 4).reduceByKey(lambda a, b: a + b)
# rdd = rdd.sortByKey()
# Show the contents of each partition and the partitioning function in use.
print(rdd.glom().collect())
print(rdd.partitioner.partitionFunc)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1488877435214_0004,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
[[(8, 8), (800, 800), (400, 400), (240, 240), (96, 96)], [(401, 401)], [], []]
<function portable_hash at 0x7f153b90d410>

In [17]:
def summed_pairs(size, slices):
    """Return an RDD of (k, k) pairs for k in range(size), reduced by key with +.

    The pairs are first distributed over `slices` partitions.
    """
    pairs = zip(range(size), range(size))
    return sc.parallelize(pairs, slices).reduceByKey(lambda u, v: u + v)


# Two identically built pair RDDs, joined on their keys.
a = summed_pairs(10000, 8)
b = summed_pairs(10000, 8)
c = a.join(b)
# Inspect the join result: partition count, partition function, and the
# first four elements of the first partition.
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

8
<function portable_hash at 0x7f153b90d410>
[(0, (0, 0)), (2048, (2048, 2048)), (1432, (1432, 1432)), (5592, (5592, 5592))]

In [None]:
def partitionsize(it):
    """Yield exactly one value: the number of elements produced by `it`.

    Intended for mapPartitions, where `it` iterates over one partition.
    The elements are counted as they stream past instead of being copied
    into a list first, so memory use does not grow with the partition size.
    """
    yield sum(1 for _ in it)
    
n = 40000000

def f(x):
    """Custom partition function: bucket key x by range into n/8-wide slots.

    Keys 0 .. n/8 - 1 map to 0, the next n/8 keys map to 1, and so on.
    Floor division (//) is used explicitly so the result stays an integer
    even when true division is in effect (Python 3 or
    `from __future__ import division`).
    """
    return x // (n // 8)

# Keys: the multiples of 16 below n, listed twice, so every key occurs twice.
data1 = range(0, n, 16) * 2
# Values: the multiples of 8 below n.
data2 = range(0, n, 8)
rdd1 = sc.parallelize(zip(data1, data2), 8)
# Alternatives to experiment with: partition by the custom function f instead
# of the default partitioner.
# rdd1 = rdd1.partitionBy(8, f)
# rdd2 = rdd1.reduceByKey(lambda x,y: x+y, partitionFunc=f)
rdd2 = rdd1.reduceByKey(lambda a, b: a + b)
# Number of elements that ended up in each partition after the reduce.
rdd2.mapPartitions(partitionsize).collect()

### Threading

In [None]:
import threading
import random

# Number of partitions used by every job.
partitions = 5
# Random points drawn in each partition.
samples_per_partition = 5000000
# Total number of random points drawn by one job.
n = samples_per_partition * partitions

# Use a different seed for every (worker thread, partition) pair.
# mapPartitionsWithIndex only passes (index, iterator) to its function, so the
# worker number is bound through a closure instead of an extra parameter.
_BASE_SEED = 987231
# Distance between the seed ranges of consecutive workers. It must be larger
# than the number of partitions; otherwise worker w on partition p and worker
# w + 1 on partition p - 1 would share a seed and draw identical points.
_SEED_STRIDE = 1000003


def _make_sampler(worker):
    """Build the mapPartitionsWithIndex function for worker number `worker`.

    The returned function takes (index, it) and yields, for every element of
    `it`, 1 if a random point drawn uniformly from [-1, 1) x [-1, 1) lies
    inside the unit circle and 0 otherwise. The elements of `it` themselves
    are ignored; only their count matters.
    """
    def sampler(index, it):
        # A private generator per partition: deterministic for a given
        # (worker, index) pair and independent of the global random state.
        rng = random.Random(_BASE_SEED + worker * _SEED_STRIDE + index)
        for _ in it:
            x = rng.random() * 2 - 1
            y = rng.random() * 2 - 1
            yield 1 if x ** 2 + y ** 2 < 1 else 0
    return sampler


# One sampler per worker thread; the individual names are kept for callers
# that refer to them directly.
f1, f2, f3, f4, f5 = (_make_sampler(worker) for worker in range(5))

f = [f1, f2, f3, f4, f5]
    
# the function executed in each thread/job
def dojob(i):
    """Run one Monte Carlo estimate of Pi as a Spark job and report the result.

    i selects the per-partition sampling function f[i] and is also used as the
    worker label in the report. The job creates n elements in `partitions`
    partitions, maps each element to 0 or 1 through f[i], and sums the results.
    """
    import sys

    count = sc.parallelize(xrange(1, n + 1), partitions) \
              .mapPartitionsWithIndex(f[i]).reduce(lambda a, b: a + b)
    # Build the full line first and emit it with a single write. A multi-item
    # print statement writes its items one by one, so reports from threads
    # running concurrently could otherwise end up interleaved mid-line.
    sys.stdout.write("Worker %s reports: Pi is roughly %s\n" % (i, 4.0 * count / n))

# Create one thread per job, then launch them all.
threads = [threading.Thread(target=dojob, args=(i,)) for i in range(5)]
for t in threads:
    t.start()

print("All started!")

# Block until every job thread has finished.
for t in threads:
    t.join()

print("All done!")