In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
import io
from tifffile import TiffFile 
import numpy as np
import zipfile
from PIL import Image

In [2]:
conf = SparkConf().setAppName("SatelliteProject").setMaster("local")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [3]:
#file_path = "/Users/Manu/Desktop/Classes/Big Data/BDA assignment2/a2_small_sample/"

In [3]:
rdd = sc.binaryFiles('/Users/jatingarg/Desktop/Satellite_BigData/a2_small_sample/')

In [4]:
def getNameFromPath(path):
    return path.split('/')[-1]

In [5]:
def getOrthoTif(zfBytes):
#given a zipfile as bytes (i.e. from reading from a binary file),
# return a np array of rgbx values for each pixel
    bytesio = io.BytesIO(zfBytes)
    zfiles = zipfile.ZipFile(bytesio, "r")
 #find tif:
    for fn in zfiles.namelist():
        if fn[-4:] == '.tif':#found it, turn into array:
            tif = TiffFile(io.BytesIO(zfiles.open(fn).read()))
    return tif.asarray()

In [6]:
# def getChunks(input_tuple):
#     name,image = input_tuple
#     res = list()
#     count = 0
#     imageArray = np.array(image)
#     for i in range(0,len(imageArray),len(imageArray)//5):
#         for j in range(0,len(imageArray[0]),len(imageArray[0])//5):
#             tempArr = [[0 for i in range(500)] for j in range(500)]
#             for p in range(i,i+500,1):
#                 for q in range(j,j+500,1):
#                     tempArr[p%500][q%500] = imageArray[p][q]
#             res.append((name+"-"+str(count),tempArr))
#             count += 1
#     return res

In [7]:
def getChunks(input_tuple):
    name,image = input_tuple
    res = list()
    count = 0
    image_arr = np.array(image)
    for r in range(0,2500, 500):
        for c in range(0,2500, 500):
            window = image_arr[r:r+500,c:c+500]
            res.append((name+"-"+str(count),window))
            count += 1
    return res

In [8]:
# converting RDD into (filename, tiffarray)
rdd_array = rdd.map(lambda x : (x[0], getOrthoTif(x[1]))).map(lambda x :(getNameFromPath(x[0]), x[1])) 

In [9]:
# convert into smaller chunk and make a flat map
rdd_small_image = rdd_array.map(lambda x: getChunks(x)).flatMap(lambda x : x)

In [10]:
lis= rdd_small_image.collect()
test_id = ['3677454_2025195.zip-0', '3677454_2025195.zip-1', '3677454_2025195.zip-18', '3677454_2025195.zip-19']
for tup in lis:
    if tup[0] in test_id:
        print (tup[0],"---->",tup[1][0][0])
        
# [114, 111, 109, 114]
#[ 54,  53,  57, 117]
#[ 79,  70,  66, 123]
#[ 61,  57,  63,  84]
# 3677454_2025195.zip-0 [114 111 109 114]
# 3677454_2025195.zip-1 [ 54  53  57 117]
# 3677454_2025195.zip-18 [ 79  70  66 123]
# 3677454_2025195.zip-19 [61 57 63 84]

3677454_2025195.zip-0 ----> [114 111 109 114]
3677454_2025195.zip-1 ----> [ 54  53  57 117]
3677454_2025195.zip-18 ----> [ 79  70  66 123]
3677454_2025195.zip-19 ----> [61 57 63 84]


## question 2

In [11]:
def makeSingleValue(arr):
    res = list()
    for row in arr:
        temp = list()
        for element in row:
            r = element[0]
            g = element[1]
            b = element[2]
            I = element[3]
            rgbMean = (r+g+b)/3
            temp.append (int(rgbMean * (I/100) ))
        res.append(temp)
    return res

In [12]:
def makeFactor10(image, factor):
    image = np.array(image)
    res = [[0 for i in range(len(image[0])//factor)] for j in range(len(image)//factor)]
    for r in range(0,500, factor):
        for c in range(0,500, factor):
            temp = image[r:r+factor , c:c+factor]
            res[r//factor][c//factor] = np.mean(temp)
    return res

In [13]:
def rowDiff(image):
    res = [[0 for j in range(len(image[0])-1)] for i in range(len(image))]
    for i in range(len(image)):
        for j in range(len(image[0]) - 1):
            val = image[i][j+1] - image[i][j]
            if val < -1:
                val = -1
            elif val > 1:
                val = 1
            else:
                val = 0
            res[i][j] = val
    return res

In [14]:
def colDiff(image):
    res = [ [0 for j in range(len(image[0]))] for i in range(len(image)-1)]
    for i in range(len(image)-1):
        for j in range(len(image[0])):
            val = image[i+1][j] - image[i][j]
            if val < -1:
                val = -1
            elif val > 1:
                val = 1
            else:
                val = 0
            res[i][j] = val
    return res

In [15]:
def makeFeature(img1, img2):
    res = list()
    for i in range(len(img1)):
        for j in range(len(img1[0])):
            res.append(img1[i][j])
        
    for i in range(len(img2)):
        for j in range(len(img2[0])):
            res.append(img2[i][j])
    return res

In [16]:
rdd3 = rdd_small_image.map(lambda x : (x[0], makeSingleValue(x[1]) ))
rdd4 = rdd3.map(lambda x : (x[0], makeFactor10(x[1],10) ))
rdd5 = rdd4.map(lambda x : (x[0],rowDiff(x[1]),colDiff(x[1])))
rdd6 = rdd5.map(lambda x : ( x[0], makeFeature( x[1] , x[2] ) ) )

In [17]:
second_list = rdd6.collect()

In [18]:
testlis = ['3677454_2025195.zip-1', '3677454_2025195.zip-18']
for tup in second_list:
    if tup[0] in testlis:
        print (tup[0], tup[1])

3677454_2025195.zip-1 [0, 1, -1, 1, 1, -1, 0, -1, 0, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, 0, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, 0, 1, 1, 0, 0, -1, 1, 1, 1, -1, 0, -1, 1, -1, 1, -1, 1, 0, 0, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 0, -1, 1, 1, 1, 1, 0, -1, 1, -1, -1, 1, 1, -1, 1, 0, 1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 0, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 0, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1,