In [0]:
import iarray_spark as iaspark
import iarray as ia
import numpy as np
from time import time

In [0]:
# Sizes, in elements, of the 1-D benchmark arrays and of the chunk/block
# partitions handed to iarray's defaults below.
shape = (2 ** 28,)
chunks = (2 ** 22,)
blocks = (2 ** 16,)
dtype = np.float64

# Operands and result, as paths relative to the working directory.
urlpath_a = "iaspark_simple_a.iarr"
urlpath_b = "iaspark_simple_b.iarr"
urlpath_out = "iaspark_simple_out.iarr"

# Locations of the operand copies under /dbfs.
urlpath_dbfs_a = "/dbfs/iaspark_simple_a.iarr"
urlpath_dbfs_b = "/dbfs/iaspark_simple_b.iarr"

# Clear the three working-directory paths before anything is (re)created.
for _stale_urlpath in (urlpath_a, urlpath_b, urlpath_out):
    ia.remove_urlpath(_stale_urlpath)

# Every array created from here on picks up these settings by default.
ia.set_config_defaults(
    dtype=dtype,
    chunks=chunks,
    blocks=blocks,
    contiguous=True,
)

In [0]:
import shutil

# Create both operands as on-disk arrays at the working-directory paths.
for _operand_urlpath in (urlpath_a, urlpath_b):
    ia.arange(shape, urlpath=_operand_urlpath)

# Duplicate each operand file at its /dbfs location.
shutil.copyfile(src=urlpath_a, dst=urlpath_dbfs_a)
shutil.copyfile(src=urlpath_b, dst=urlpath_dbfs_b)

In [0]:
# Expression evaluated by the iarray benchmarks below; the operands are bound
# to the names "a" and "b" when it is evaluated.
#
# Alternative expression that used to be assigned here and was immediately
# overwritten (so it never took effect): "tan(a + 2 * sin(b))"
expr = "2 * a + b"

In [0]:
# The evaluation currently goes through views (the iterchunk engine).
# To use the iterblosc engine instead, the steps would be:
#
# 1. Create an array from a chunk (chunk-array)
# 2. Eval using iterblosc
# 3. Serialize the chunk-array and send it to the server
# 4. Create an empty array and update the chunks
#
# or
#
# 1. Create an array from a chunk (chunk-array)
# 2. Eval using iterblosc

# Number of repetitions averaged for each benchmark.
n = 3

# Operand name -> array path, one mapping per benchmark.
operands_local = {"a": urlpath_a, "b": urlpath_b}
operands_dbfs = {"a": urlpath_dbfs_a, "b": urlpath_dbfs_b}

# --- Benchmark 1: eval_split on the working-directory operands -------------
# NOTE(review): urlpath_out is only removed after the loop, so repetitions
# 2..n run while the previous output still exists -- confirm this is intended.
t0 = time()
for i in range(n):
    print(i)
    res = iaspark.eval_split(
        expr,
        operands_local,
        spark_context=sc,
        urlpath=urlpath_out,
    )
t1 = time()

print(f"iarray optimized split array: {(t1 - t0)/n:.4f} s")

ia.remove_urlpath(urlpath_out)

# --- Benchmark 2: eval_opt on the /dbfs copies of the operands -------------
t0 = time()
for i in range(n):
    print(i)
    res = iaspark.eval_opt(
        expr,
        operands_dbfs,
        spark_context=sc,
        urlpath=urlpath_out,
    )
t1 = time()

print(f"iarray optimized shared filesystem: {(t1 - t0)/n:.4f} s")
ia.remove_urlpath(urlpath_out)



In [0]:
# Reference run without Spark: open the /dbfs operands with iarray and
# evaluate the same expression directly. Opening the arrays is part of the
# measured interval.
t0 = time()

a, b = ia.open(urlpath_dbfs_a), ia.open(urlpath_dbfs_b)
expression = ia.expr_from_string(expr, {"a": a, "b": b})
res = expression.eval()

t1 = time()

# Elapsed wall-clock seconds for open + evaluation.
print(t1 - t0)

In [0]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import os
import shutil
from time import time


# Size of the parquet benchmark: number of int64 values in the single "data"
# column, and number of rows per parquet row group.
nelem = 16 * 16 * 1024 * 1024
chunks = 2 * 2 * 1024 * 1024
dtype = np.int64

# Where the input file is written, where it is copied to, and where the
# benchmark output goes.
urlpath_parquet = "/bench.parquet"
urlpath_dbfs_parquet = "/dbfs/bench.parquet"
urlpath_dbfs_out = "/bench_out.parquet"

# Best-effort cleanup of leftovers from a previous run. Each removal is
# attempted on its own: previously all three shared one try block, so a
# missing local file made os.remove raise and silently skipped both
# dbutils removals.
try:
    os.remove(urlpath_parquet)
except FileNotFoundError:
    pass  # nothing left over locally

# NOTE(review): dbutils.fs paths are presumably resolved against the DBFS
# root, so "/dbfs/bench.parquet" may not address the file copied below
# through the /dbfs mount -- confirm.
for _stale_path in (urlpath_dbfs_parquet, urlpath_dbfs_out):
    try:
        dbutils.fs.rm(_stale_path, True)
    except Exception as exc:  # still best-effort, but no longer silent
        print(f"cleanup skipped for {_stale_path}: {exc!r}")

# Build the input: 0..nelem-1 as one int64 column, written with `chunks`
# rows per row group, then copied to the /dbfs location.
np_arr = np.arange(nelem, dtype=dtype)
pa_table = pa.table({"data": np_arr})
pq.write_table(pa_table, row_group_size=chunks, where=urlpath_parquet)
shutil.copyfile(urlpath_parquet, urlpath_dbfs_parquet)


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Number of repetitions averaged for the parquet benchmark.
n = 3

# Build (or fetch) the session once, before the timer starts. getOrCreate()
# hands back the already-running session when there is one, so calling it
# inside the timed loop only added session start-up to the first repetition.
spark = SparkSession.builder \
    .master('local') \
    .appName('myAppName') \
    .getOrCreate()

t0 = time()
for i in range(n):
    print(i)

    # Read the input, double the "data" column, write the result out.
    parquetFile = spark.read.parquet(urlpath_parquet)
    parquetFile = parquetFile.withColumn("data", col("data") * lit(2))

    # Write to this cell's own output path. The code used to write to
    # `urlpath_out`, which is the iarray output path from an earlier cell,
    # while `urlpath_dbfs_out` was defined and pre-cleaned but never used.
    parquetFile.write.parquet(urlpath_dbfs_out)

    # Remove the output so the next repetition can write to the same path.
    try:
        dbutils.fs.rm(urlpath_dbfs_out, True)
    except Exception as exc:  # best-effort, but report instead of hiding it
        print(f"could not remove {urlpath_dbfs_out}: {exc!r}")
t1 = time()

print(f"time parquet: {(t1 - t0)/n:.4f}")

# Final best-effort cleanup. Each removal is attempted independently so that
# one failure (e.g. a missing local file) does not skip the others.
try:
    os.remove(urlpath_parquet)
except FileNotFoundError:
    pass  # already gone

for _leftover_path in (urlpath_dbfs_parquet, urlpath_dbfs_out):
    try:
        dbutils.fs.rm(_leftover_path, True)
    except Exception as exc:
        print(f"cleanup skipped for {_leftover_path}: {exc!r}")
