In [1]:
# findspark (github.com/minrk/findspark) locates the local Spark installation
# so that `import pyspark` works from a plain Python/Jupyter kernel.
import findspark
findspark.init()

import pyspark
# Only one SparkContext may be active at a time, so constructing one directly
# fails when this cell is re-run in a live kernel. getOrCreate() returns the
# already-running context if there is one and otherwise starts a new context
# named "Spark1", which keeps the cell safe to re-execute.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Spark1"))

In [2]:
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
def mandelbrot(x, y, max_iteration=511):
    '''Return the escape-time iteration count of the point c = x + y*i.

    Starting from z = c, repeatedly applies z -> z*z + c and counts the
    steps taken while |z| < 2.

    x, y          -- real and imaginary parts of c.
    max_iteration -- cutoff on the number of steps (default 511, an
                     arbitrary choice); a point that has not escaped by
                     then returns exactly this value.

    Returns the number of iterations performed: 0 if |c| >= 2 already,
    otherwise a value between 1 and max_iteration.
    '''
    z = c = complex(x, y)
    iteration = 0
    while abs(z) < 2 and iteration < max_iteration:
        z = z * z + c
        iteration += 1
    return iteration

def sum_values_for_partitions(rdd):
    '''Given a (K, V) RDD, return an RDD holding one number per partition:
    the sum of the V values stored in that partition.'''

    def partition_total(pairs):
        # mapPartitions expects an iterable back for every partition, so the
        # single total is wrapped in a one-element list.
        total = sum(value for _key, value in pairs)
        return [total]

    return rdd.mapPartitions(partition_total)

def draw_image(rdd):
    '''Render a (K, V) RDD as a grayscale image.

    Each key K is an (I, J) pixel index and each value V is the count for
    that pixel. The whole RDD is collected to the driver before plotting.'''

    records = rdd.collect()
    coords = [rec[0] for rec in records]
    row_idx = np.array([pt[0] for pt in coords])
    col_idx = np.array([pt[1] for pt in coords])
    counts = np.array([rec[1] for rec in records])

    # The canvas is just large enough to hold the highest index on each axis;
    # pixels that never appear in the RDD stay at zero.
    shape = (row_idx.max() + 1, col_idx.max() + 1)
    canvas = np.zeros(shape)

    # Plot log(count + 1) rather than the raw count so that the different
    # levels are easier to tell apart.
    canvas[row_idx, col_idx] = np.log(counts + 1)

    plt.imshow(canvas, cmap=cm.gray)
    plt.show()



In [3]:
# Clean version: default partitioning of the 2000 x 2000 pixel grid.
arr = range(2000)
rdd_single = sc.parallelize(arr, 10)
# cartesian() pairs every row index i with every column index j; crossing two
# 10-partition RDDs gives 10 * 10 = 100 partitions.
rdd = rdd_single.cartesian(rdd_single)

# Map each (i, j) pixel to ((i, j), iteration count). Dividing by 500 and
# subtracting 2 maps the index range 0..1999 onto [-2, 2) on each axis.
# The single-argument lambda replaces the Python-2-only `lambda (i, j):` form.
iters = rdd.map(
    lambda ij: ((ij[0], ij[1]),
                mandelbrot(ij[1] / 500.0 - 2, ij[0] / 500.0 - 2)))
num_per_part = sum_values_for_partitions(iters)

# Nothing here is cached, so every Spark action recomputes the whole grid.
# Collect the per-partition totals once and reuse the list for both the
# printout and the histogram.
work_per_partition = num_per_part.collect()
print(work_per_partition[:100])

# draw_image(iters)  # uncomment to render the fractal (collects every pixel)

# Draw on an explicit figure so the saved file contains only this histogram.
fig, ax = plt.subplots()
ax.hist(work_per_partition)
ax.set_xlabel("Total Mandelbrot iterations in partition")
ax.set_ylabel("Number of partitions")
ax.set_title("Work per partition: default partitioning")
fig.savefig("P2a_hist.png")

[0, 0, 12530, 30272, 38539, 38559, 30334, 12646, 0, 0, 0, 21703, 44580, 51524, 49220, 40890, 40000, 40000, 21902, 0, 13876, 65497, 81625, 108174, 1164982, 123094, 51270, 40000, 40000, 12729, 53393, 95278, 140141, 4318918, 15667522, 8034422, 108278, 44877, 40000, 30472, 165543, 1853143, 12188031, 16849344, 20440000, 18596168, 224934, 57818, 40000, 38739, 266888, 1954388, 12289026, 16893985, 20440000, 18576975, 224845, 57852, 40000, 38759, 53570, 95833, 140875, 4375631, 15720793, 8117946, 108684, 44943, 40000, 30534, 14042, 65698, 81721, 108507, 1213364, 123809, 51443, 40000, 40000, 12846, 0, 21902, 44751, 51724, 49420, 40961, 40000, 40000, 22102, 0, 0, 0, 12729, 30472, 38739, 38759, 30534, 12846, 0, 0]


In [3]:
# Clean version, repartitioning: shuffle the pixel grid into 100 partitions
# with repartition() before doing the expensive work.
arr = range(2000)
rdd_single = sc.parallelize(arr, 10)
rdd = rdd_single.cartesian(rdd_single)
rdd = rdd.repartition(100)

# Map each (i, j) pixel to ((i, j), iteration count). Dividing by 500 and
# subtracting 2 maps the index range 0..1999 onto [-2, 2) on each axis.
# The single-argument lambda replaces the Python-2-only `lambda (i, j):` form.
iters = rdd.map(
    lambda ij: ((ij[0], ij[1]),
                mandelbrot(ij[1] / 500.0 - 2, ij[0] / 500.0 - 2)))
num_per_part = sum_values_for_partitions(iters)

# draw_image(iters)  # uncomment to render the fractal (collects every pixel)

# Draw on an explicit figure so the saved file contains only this histogram.
work_per_partition = num_per_part.collect()
fig, ax = plt.subplots()
ax.hist(work_per_partition)
ax.set_xlabel("Total Mandelbrot iterations in partition")
ax.set_ylabel("Number of partitions")
ax.set_title("Work per partition: after repartition(100)")
fig.savefig("P2c_hist.png")

In [3]:
# Clean version, improved partitioning: redistribute the pixel grid with
# partitionBy() before doing the expensive work.
arr = range(2000)
rdd_single = sc.parallelize(arr, 10)
rdd = rdd_single.cartesian(rdd_single)
# partitionBy() treats each (i, j) element as a (key, value) pair and assigns
# it to one of 100 partitions from a hash of the key, i.e. the row index i.
# NOTE(review): with the default hash, rows should be dealt out by i modulo
# 100, so each partition gets rows from all over the image -- confirm.
rdd = rdd.partitionBy(100)

# Map each (i, j) pixel to ((i, j), iteration count). Dividing by 500 and
# subtracting 2 maps the index range 0..1999 onto [-2, 2) on each axis.
# The single-argument lambda replaces the Python-2-only `lambda (i, j):` form.
iters = rdd.map(
    lambda ij: ((ij[0], ij[1]),
                mandelbrot(ij[1] / 500.0 - 2, ij[0] / 500.0 - 2)))
num_per_part = sum_values_for_partitions(iters)

# draw_image(iters)  # uncomment to render the fractal (collects every pixel)

# Draw on an explicit figure so the saved file contains only this histogram.
work_per_partition = num_per_part.collect()
fig, ax = plt.subplots()
ax.hist(work_per_partition)
ax.set_xlabel("Total Mandelbrot iterations in partition")
ax.set_ylabel("Number of partitions")
ax.set_title("Work per partition: after partitionBy(100)")
fig.savefig("P2b_hist.png")