In [1]:
import pyspark
from pyspark import SparkContext
import imageio
import os
import numpy as np

In [2]:
def readImg(path):
    """Read the image file at `path` and return its pixels as a uint8 NumPy array."""
    pixels = imageio.imread(path)
    return np.array(pixels, dtype='uint8')

def writeImg(path, buf):
    """Write the pixel array `buf` to the image file `path` using imageio."""
    imageio.imwrite(path, buf)

def part_median_filter(local_data):
    """Apply a 3x3 median filter to one horizontal band of an image.

    Parameters
    ----------
    local_data : sequence
        ``(part_id, first, end, buf)``. ``buf`` is the image array indexed as
        ``buf[row, column, ...]`` (any trailing channel axes are filtered
        independently). The band that gets filtered is rows ``first + 1`` to
        ``end - 1`` of ``buf``; rows ``first`` and ``end`` are only read as
        neighbours (ghost rows), so both must be valid row indices of ``buf``.

    Returns
    -------
    tuple
        ``(part_id, new_buf)`` where ``new_buf`` is a uint8 array of shape
        ``(end - first - 1, ny - 2, ...)``. Entry ``[r, c]`` is the median of
        the 3x3 neighbourhood centred on ``buf[first + 1 + r, c + 1]``. The
        first and last columns of ``buf`` are never centres, so they are
        absent from the result. An empty band yields an empty array.

    Raises
    ------
    IndexError
        If a non-empty band would need rows outside ``buf``.
    """
    part_id = local_data[0]
    first = local_data[1]
    end = local_data[2]
    buf = np.asarray(local_data[3])
    ny = buf.shape[1]

    n_rows = max(end - first - 1, 0)
    n_cols = max(ny - 2, 0)
    if n_rows == 0 or n_cols == 0:
        # Nothing to filter: return an empty band with the right trailing axes.
        return part_id, np.zeros((n_rows, n_cols) + buf.shape[2:], dtype='uint8')

    if first < 0 or end >= buf.shape[0]:
        # Slicing would silently truncate, so fail loudly instead.
        raise IndexError(
            "band rows %d..%d (ghost rows included) fall outside a buffer of %d rows"
            % (first, end, buf.shape[0])
        )

    # Build the nine shifted views of the band (one per position in the 3x3
    # window) and take the median across them in a single vectorised call.
    neighbours = np.stack([
        buf[first + 1 + di:end + di, 1 + dj:ny - 1 + dj]
        for di in (-1, 0, 1)
        for dj in (-1, 0, 1)
    ])
    # With nine samples the median is always one of the input values, so the
    # cast back to uint8 is exact for uint8 input.
    new_buf = np.median(neighbours, axis=0).astype('uint8')

    # RETURN LOCAL IMAGE PART
    return part_id, new_buf


In [3]:

    # CREATE SPARKCONTEXT
    # getOrCreate() reuses a context that is already running, so this cell can
    # be executed again without "Cannot run multiple SparkContexts at once".
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")

    data_dir = '.'
    file = os.path.join(data_dir, 'lena_noisy.jpg')
    img_buf = readImg(file)
    print('SHAPE', img_buf.shape)

    nx = img_buf.shape[0]
    ny = img_buf.shape[1]

    # SPLIT IMAGE INTO NB_PARTITIONS HORIZONTAL BANDS
    # Each band is sent together with one GHOST ROW above and one below it
    # (the boundary rows of the neighbouring bands), which is all the 3x3
    # filter needs. Shipping only that slice avoids serialising the whole
    # image once per partition.
    nb_partitions = 8
    print("NB PARTITIONS : ", nb_partitions)
    data = []
    begin = 0
    rest = nx % nb_partitions
    for ip in range(nb_partitions):
        # The first `rest` bands take one extra row so all nx rows are covered.
        block_size = nx // nb_partitions + (1 if ip < rest else 0)
        end = min(begin + block_size + 1, nx - 1)
        # Rows begin..end of the image become rows 0..(end - begin) of the
        # slice, so the band limits are re-based to start at 0.
        data.append((ip, 0, end - begin, img_buf[begin:end + 1]))
        begin = end - 1

    # PARALLEL MEDIAN FILTER COMPUTATION
    data_rdd = sc.parallelize(data, nb_partitions)
    result_rdd = data_rdd.map(part_median_filter)
    result_data = result_rdd.collect()

    print('CREATING NEW PICTURE FILE')
    # Start from a copy of the input: the one-pixel border cannot be the
    # centre of a 3x3 window, so it keeps its original values.
    new_img_buf = img_buf.copy()

    # COMPUTE NEW IMAGE RESULTS FROM RESULT RDD
    result_data.sort(key=lambda x: x[0])  # order the bands by partition id
    parts = tuple(part for _, part in result_data)
    new_img_buf[1:-1, 1:-1] = np.concatenate(parts)

    filter_file = os.path.join(data_dir, 'lena_filter.jpg')
    writeImg(filter_file, new_img_buf)



SHAPE (128, 128, 3)
NB PARTITIONS :  8
CREATING NEW PICTURE FILE


In [38]:
# Sanity check on the collected results: shape of the first filtered band,
# then shape of all bands stacked along the row axis.
print("shape", np.shape(result_data[0][1]))
print("shape2", np.concatenate(parts, axis=0).shape)



shape (16, 126, 3)
shape2 (126, 126, 3)
