In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pyspark.sql import SparkSession
import numpy as np
import os

# Use the same Python interpreter for the Spark driver and the workers.
os.environ.update({
    "PYSPARK_PYTHON": '/usr/bin/python3.6',
    "PYSPARK_DRIVER_PYTHON": '/usr/bin/python3.6',
})

from pyspark import SparkConf

# Configuration for the standalone cluster: master URL, CPUs per task and
# executor memory.
conf = (
    SparkConf()
    .setAppName('cluster')
    .setMaster('spark://10.10.1.1:7077')
    .set('spark.task.cpus', '16')
    .set('spark.executor.memory', '124g')
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Register the zipped cerebro package with the Spark context.
spark.sparkContext.addPyFile("cerebro.zip")

# Cerebro backend (6 workers) and a local store rooted under work_dir.
work_dir = '/var/nfs/'
backend = SparkBackend(spark_context=spark.sparkContext, num_workers=6)
store = LocalStore(prefix_path=work_dir + 'test/')

CEREBRO => Time: 2021-12-07 16:51:29, Running 6 Workers


In [2]:
from tensorflow.keras import datasets
from petastorm.codecs import CompressedImageCodec, \
        NdarrayCodec, ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import Unischema,\
        UnischemaField, dict_to_spark_row
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
def cifar_to_peta(output_url='file:///var/nfs/cifar10/petastorm',
                  rowgroup_size_mb=1024):
    """Write the CIFAR-10 training split to disk as a Petastorm dataset.

    Loads CIFAR-10 through ``datasets.cifar10``, one-hot encodes the labels
    into 10 columns, and writes one row per image as Parquet using the
    notebook-level ``spark`` session.

    Args:
        output_url: Destination URL of the dataset. The write uses
            ``mode('overwrite')``, so existing data there is replaced.
        rowgroup_size_mb: Row-group size (in MB) passed through to
            ``materialize_dataset``.
    """
    schema = Unischema('MySchema', [
        UnischemaField('image', np.uint8,
                       (32, 32, 3), NdarrayCodec(), False),
        UnischemaField('label', np.float32,
                       (10,), NdarrayCodec(), False),
    ])

    # Only the first (training) tuple is used; the second is discarded.
    (data, labels), _ = datasets.cifar10.load_data()
    labels = keras.utils.to_categorical(labels, 10)
    sc = spark.sparkContext
    num_samples = len(labels)

    def row_generator(i):
        """Return the schema-shaped dict for sample ``i``."""
        return {
            'image': data[i],
            'label': labels[i],
        }

    # materialize_dataset wraps the write: it sets up the Spark environment
    # for the write and saves the Petastorm-specific metadata on exit.
    with materialize_dataset(spark, output_url, schema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(num_samples)) \
            .map(row_generator) \
            .map(lambda x: dict_to_spark_row(schema, x))
        spark.createDataFrame(rows_rdd, schema.as_spark_schema()) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)

In [3]:
# Run the conversion: writes the CIFAR-10 training split out as a Petastorm dataset.
cifar_to_peta()

In [4]:
# Load the CIFAR-10 training arrays at notebook scope. Index the train split
# directly instead of unpacking into `_`: assigning the global `_` in an
# IPython kernel shadows the built-in "last output" shortcut.
data, labels = datasets.cifar10.load_data()[0]

In [5]:
# Inspect the raw label array before one-hot encoding.
labels.shape

(50000, 1)

In [6]:
# One-hot encode the class ids into 10 columns. The shape guard keeps this
# cell idempotent: without it, re-running the cell would one-hot encode the
# already-encoded matrix again and add a third dimension.
if labels.shape[-1] != 10:
    labels = keras.utils.to_categorical(labels, 10)


In [7]:
# Persist the first 100 images and their labels as two consecutive .npy
# records in a single file; they must be read back in the same order.
prep_path = '/var/nfs/cifar10/prep_np/prep.npy'
# Create the target directory first so the cell does not fail when it is missing.
os.makedirs(os.path.dirname(prep_path), exist_ok=True)
with open(prep_path, 'wb') as f:
    np.save(f, data[:100])
    np.save(f, labels[:100])

In [8]:
# Read the two arrays back in the order they were written:
# images first, then labels.
with open('/var/nfs/cifar10/prep_np/prep.npy', 'rb') as f:
    x, y = np.load(f), np.load(f)

In [9]:
# Check that the image array read back from disk has the shape of the saved 100-image slice.
x.shape

(100, 32, 32, 3)

In [10]:
# Check that the label array read back from disk has the shape of the saved 100-label slice.
y.shape

(100, 10)