In [1]:
import findspark
findspark.init()

import pyspark

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, Row
from pyspark.storagelevel import StorageLevel
from pyspark.streaming import StreamingContext

def create_sc(app_name):
    """Create a fresh local SparkContext configured for this notebook.

    Any SparkContext that is still active in the current Python process is
    stopped first, so the cell can be re-run without hitting the
    "multiple SparkContexts" error.

    Parameters
    ----------
    app_name : str
        Value passed to ``SparkConf.setAppName``.

    Returns
    -------
    SparkContext
        A newly created context built from the configuration printed below.
    """
    sc_conf = SparkConf()
    sc_conf.setAppName(app_name)
    sc_conf.setMaster('local[*]')
    settings = (
        ('spark.executor.memory', '4g'),
        ('spark.executor.cores', '4'),
        ('spark.driver.memory', '32G'),
        ('spark.cores.max', '32'),
        ('spark.driver.maxResultSize', '10G'),
        ('spark.logConf', True),
    )
    for key, value in settings:
        sc_conf.set(key, value)
    print(sc_conf.getAll())

    # The previous version called stop() on a variable that was always None and
    # relied on a bare ``except`` to recover, so an already-running context was
    # never stopped. Look the active context up explicitly instead.
    active_sc = getattr(SparkContext, '_active_spark_context', None)
    if active_sc is not None:
        active_sc.stop()

    return SparkContext(conf=sc_conf)

# Build the notebook-wide Spark context, then print its version together with
# the driver memory setting it was configured with.
sc = create_sc("TestCube")
driver_memory = sc._conf.get('spark.driver.memory')
print('CONTEXT INFO : ', sc.version, driver_memory)

dict_items([('spark.app.name', 'TestCube'), ('spark.master', 'local[*]'), ('spark.executor.memory', '4g'), ('spark.executor.cores', '4'), ('spark.driver.memory', '32G'), ('spark.cores.max', '32'), ('spark.driver.maxResultSize', '10G'), ('spark.logConf', 'True')])
CONTEXT INFO :  2.2.0.2.6.3.0-235 32G


In [None]:
import os
import glob
import numpy as np
import time
from struct import unpack_from

# Path of the binary file handed to Spark (presumably resolved against the
# cluster's default filesystem, i.e. HDFS per the message printed below).
dataFolder = '/user/gratienj/Data/TIM'
filename = os.path.join(dataFolder,'binary_file4')

# Use a wall-clock timer: time.process_time() only counts CPU time of this
# Python process and ignores time spent waiting, so it cannot measure the
# duration of a Spark job.
t0 = time.perf_counter()
record_length = 1
binary_rdd = sc.binaryRecords(filename, record_length)
# Decode every 1-byte record as a signed char; unpack_from yields a 1-tuple,
# so raw_data ends up as a list of 1-tuples.
unpacked_rdd = binary_rdd.map(lambda record: unpack_from('b', record))
raw_data = unpacked_rdd.collect()
t1 = time.perf_counter() - t0
print("HDFS READING TIME : ",t1)


In [None]:
# Same file name as the Spark benchmark, read from a local directory for comparison.
localDataFolder='/home/gratienj/Data/TIM'
local_filename=os.path.join(localDataFolder,'binary_file4')

# Wall-clock timer: time.process_time() would leave out the time spent
# blocked on disk I/O, which is exactly what this cell is meant to measure.
t0 = time.perf_counter()
with open(local_filename, mode='rb') as file:
    bytes_data = file.read()
    # Interpret the whole buffer as signed 8-bit values (one element per byte).
    array_data = np.frombuffer(bytes_data, dtype='b', count=len(bytes_data))
t1 = time.perf_counter() - t0
print("Local FS READING TIME : ",t1)
print('NB PIXELS :',len(array_data))

In [None]:
# Repeat the Spark read benchmark for several record lengths. This replaces
# four copy-pasted sections that differed only in the record length and the
# matching struct format.
for record_length in (1, 2, 4, 8):
    # '<n>b' decodes one record of n bytes into n signed chars.
    unpack_format = '%db' % record_length
    # Wall-clock timer: time.process_time() only counts CPU time of this
    # Python process and therefore misses the time spent waiting on Spark.
    t0 = time.perf_counter()
    binary_rdd = sc.binaryRecords(filename, record_length)
    # Unpack each record into a tuple, then flatten the tuples into single values.
    # The format is bound as a default argument so the lambda does not depend
    # on the loop variable.
    unpacked_rdd = binary_rdd.map(
        lambda record, fmt=unpack_format: unpack_from(fmt, record)
    ).flatMap(lambda x: x)
    raw_data = unpacked_rdd.collect()
    t1 = time.perf_counter() - t0
    print("HDFS READING TIME : ",t1)
    print('NB PIXELS :',len(raw_data))

## TEST MONGODB

In [None]:
import getpass
import os
from urllib.parse import quote_plus

import pymongo
import gridfs
from pymongo import MongoClient

# Credentials are no longer stored in the notebook. They are read from the
# MONGO_USER / MONGO_PASSWORD environment variables; when no password is set,
# it is asked for interactively so that it never ends up in the saved file.
MONGO_HOST = 'islin-hdpnod1.ifp.fr'
mongo_user = os.environ.get('MONGO_USER', 'gratienj')
mongo_password = os.environ.get('MONGO_PASSWORD') or getpass.getpass(
    'MongoDB password for %s: ' % mongo_user
)

# User name and password must be percent-escaped before being embedded in a
# MongoDB URI, otherwise reserved characters break the URI parsing.
client = MongoClient(
    'mongodb://%s:%s@%s' % (quote_plus(mongo_user), quote_plus(mongo_password), MONGO_HOST)
)

#
# GETTING a Database
db = client['tim-cube']
# GridFS bucket named 'bdata' inside that database; used for the raw cube bytes.
grid_fs = gridfs.GridFS(db,'bdata')

# GETTING a collection
cube_collection = db['cube-collection1']

In [None]:
def create_cube(nx,ny,nz):
    """Return a random ``(nx, ny, nz)`` array of dtype ``uint8``.

    The values are drawn directly as uint8. The earlier version generated a
    default-integer array first and then copied it to uint8, which needs a
    much larger temporary array for big cubes.

    Parameters
    ----------
    nx, ny, nz : int
        Extent of the cube along each of its three axes.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nx, ny, nz)`` and dtype ``uint8``.
    """
    # NOTE(review): the upper bound of randint is exclusive, so the value 255
    # is never produced — confirm whether 256 was intended.
    return np.random.randint(0, 255, size=(nx, ny, nz), dtype=np.uint8)

def writeCubeToDataBase(idx,cube,nx,ny,nz,collection,gfs):
    """Store a cube's raw bytes through ``gfs`` and its metadata in ``collection``.

    Parameters
    ----------
    idx
        Value stored in the ``id`` field of the metadata document.
    cube
        Object exposing ``tobytes()`` (a NumPy array in this notebook); its
        bytes are what gets stored.
    nx, ny, nz
        Cube dimensions; used to build the document name and its ``shape`` entry.
    collection
        Object exposing ``insert_one(document)``.
    gfs
        Object exposing ``put(data)`` and returning the id of the stored data.
    """
    # Serialise the cube that was passed in. The earlier version read the
    # notebook-level variable ``cube_data`` here and ignored its own argument.
    bytes_data = cube.tobytes()
    img_data_id = gfs.put(bytes_data)
    name="cube-"+str(nx)+"x"+str(ny)+"x"+str(nz)
    # The metadata document points at the stored bytes through 'data_id'.
    img_doc = {"name":name,'id':idx,'data_id':img_data_id,'shape':{'nx':nx,'ny':ny,'nz':nz}}
    img_id = collection.insert_one(img_doc).inserted_id
    print('add to DB IMG DOC : ',name," id=",img_id)
        

In [None]:
# Write one random cube per edge length and time each write.
sizes=[10,20,50,100,200,400,800]
for idx, nx in enumerate(sizes):
    print("CUBE : ",nx)
    # Cube creation is deliberately kept outside the timed section.
    cube_data = create_cube(nx,nx,nx)
    # Wall-clock timer: time.process_time() only counts CPU time of this
    # process and would leave out the time spent waiting on the database.
    t0 = time.perf_counter()
    writeCubeToDataBase(idx,cube_data,nx,nx,nx,cube_collection,grid_fs)
    t1 = time.perf_counter()-t0
    print("MONGODB WRITTING TIME : ",t1)
    

In [None]:
def getCubeFromDataBase(name,collection,gfs):
    """Load the cube stored under ``name`` and return it as a 3-D ``uint8`` array.

    Parameters
    ----------
    name
        Value of the ``name`` field of the metadata document to look up.
    collection
        Object exposing ``find_one(filter)``. The matched document must hold
        the keys ``id``, ``data_id`` and ``shape`` (with ``nx``, ``ny``, ``nz``).
    gfs
        Object exposing ``find_one(filter)`` that returns an object with ``read()``.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``uint8`` built over the stored bytes and reshaped to
        ``(nx, ny, nz)``.

    Raises
    ------
    LookupError
        If no metadata document matches ``name``, or if the bytes it refers
        to cannot be found through ``gfs``.
    ValueError
        Raised by ``np.reshape`` when the number of stored bytes does not
        match the recorded shape.
    """
    cube = collection.find_one({"name":name})
    if cube is None:
        # find_one returns None when nothing matches; fail with a clear message
        # instead of an opaque TypeError on the subscript below.
        raise LookupError("no cube document named %r" % (name,))
    print('ID:',cube['id'])
    shape=cube['shape']

    data_file = gfs.find_one({"_id":cube['data_id']})
    if data_file is None:
        raise LookupError("no stored data with _id %r for cube %r" % (cube['data_id'], name))
    bytes_data = data_file.read()

    itemsize = np.dtype('uint8').itemsize
    print('NB BYTES',len(bytes_data),itemsize)
    # Integer division keeps the element count an exact int.
    array_data = np.frombuffer(bytes_data,dtype='uint8',count=len(bytes_data)//itemsize)
    return np.reshape(array_data,(shape['nx'],shape['ny'],shape['nz']))

def deleteCubeFrmDataBase(name,collection,gfs):
    """Delete every cube named ``name``: its stored bytes and its metadata document.

    Parameters
    ----------
    name
        Value of the ``name`` field of the documents to remove.
    collection
        Object exposing ``find(filter)`` and ``delete_one(filter)``.
    gfs
        Object exposing ``delete(file_id)`` (the GridFS bucket in this notebook).
    """
    for cube in collection.find({"name":name}):
        print('ID:',cube['id'])
        # GridFS removes a stored file by id through delete(); it has no
        # delete_many method, so the earlier call failed with AttributeError.
        gfs.delete(cube['data_id'])
        # Drop the metadata document only after its payload has been removed.
        collection.delete_one({"_id":cube['_id']})

In [None]:
# Read every benchmark cube back by name, then report how many documents the
# collection currently holds.
sizes = [10, 20, 50, 100, 200, 400, 800]
for nx in sizes:
    print("CUBE : ", nx)
    name = "cube-" + "x".join([str(nx)] * 3)
    cube_data = getCubeFromDataBase(name, cube_collection, grid_fs)
# NOTE(review): Collection.count() is deprecated in recent PyMongo releases in
# favour of count_documents({}) — switch once the installed version is confirmed.
collection_size = cube_collection.count()
print("COLLECTION SIZE", collection_size)

In [None]:
# Cleanup cell: removes every metadata document from cube_collection.
# NOTE(review): the cube bytes written through grid_fs are not touched by this
# call, so they stay in the database. The disabled line below would not work
# as written: the PyMongo GridFS API offers delete(file_id), not delete_many.
# NOTE(review): a later cell reads the cubes back by name; running this cell
# first leaves nothing for it to find — confirm the intended cell order.
#grid_fs.delete_many({})
cube_collection.delete_many({})

In [None]:
# Time how long it takes to read each cube back from the database.
sizes=[10,20,50,100,200,400,800]
for nx in sizes:
    print("CUBE : ",nx)
    name="cube-"+str(nx)+"x"+str(nx)+"x"+str(nx)
    # Wall-clock timer: time.process_time() only counts CPU time of this
    # process and would leave out the time spent waiting on the database.
    t0 = time.perf_counter()
    cube_data=getCubeFromDataBase(name,cube_collection,grid_fs)
    t1 = time.perf_counter()-t0
    print("MONGODB READ TIME : ",t1)
    print('SHAPE :',cube_data.shape)