In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pyspark.sql import SparkSession
import numpy as np
import os

# Point PySpark's worker and driver interpreter settings at the same Python binary.
PYSPARK_INTERPRETER = '/usr/bin/python3.6'
for env_key in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[env_key] = PYSPARK_INTERPRETER

from pyspark import SparkConf

# Cluster configuration: standalone master, 16 CPUs reserved per task,
# 124 GB of memory per executor.
conf = (
    SparkConf()
    .setAppName('cluster')
    .setMaster('spark://10.10.1.1:7077')
    .set('spark.task.cpus', '16')
    .set('spark.executor.memory', '124g')
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark_context = spark.sparkContext
# Register the packaged cerebro sources with the Spark context.
spark_context.addPyFile("cerebro.zip")

# Root directory for this notebook's data; the store lives under `<work_dir>test/`.
work_dir = '/var/nfs/'
backend = SparkBackend(spark_context=spark_context, num_workers=6)
store = LocalStore(prefix_path=work_dir + 'test/')

CEREBRO => Time: 2021-12-06 05:56:59, Running 6 Workers


In [2]:
from keras_tuner.engine import hyperparameters
import autokeras as ak
from cerebro.nas.hphpmodel import HyperHyperModel

# Define the search space: structured-data input -> dense block -> classification head.
input_node = ak.StructuredDataInput()
# Intermediate node (previously misspelled `otuput_node`, which was one
# transposition away from the real `output_node` and easy to confuse with it).
dense_node = ak.DenseBlock()(input_node)
output_node = ak.ClassificationHead()(dense_node)

# Wrap the graph in Cerebro's hyper-model; the seed is fixed at 2500.
am = HyperHyperModel(input_node, output_node, seed=2500)

# Attach the execution resources (Spark backend + store) and name the
# DataFrame columns that hold the features and the labels.
am.resource_bind(
    backend=backend,
    store=store,
    feature_columns=["features"],
    label_columns=['labels'],
    evaluation_metric='accuracy',
)

# Configure the tuner: "greedy" strategy, at most 20 trials, optimising
# "val_accuracy".
# NOTE(review): the exact semantics of `overwrite` and `exploration` are defined
# by HyperHyperModel.tuner_bind and are not visible in this notebook — confirm.
am.tuner_bind(
    tuner="greedy",
    hyperparameters=None,
    objective="val_accuracy",
    max_trials=20,
    overwrite=True,
    exploration=0.3,
)

In [4]:
# Read the training Parquet data under `work_dir` and preview its first rows.
train_path = work_dir + "limit/criteo/train.parquet"
train_df = spark.read.parquet(train_path)
train_df.show()

+--------------------+------+
|            features|labels|
+--------------------+------+
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[0, 1]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 1...|[1, 0]|
|[0.0, 0.0, 1.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[0, 1]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 1...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 1.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
|[0.0, 0.0, 1.0, 0...|[1, 0]|
|[0.0, 0.0, 0.0, 0...|[1, 0]|
+--------------------+------+
only showing top 20 rows



In [12]:
# Bring a small sample of feature and label rows to the driver and check
# the shapes of the resulting NumPy arrays.
ms = am.model_selection
sample_rows = 10
feature_sample = train_df.select(ms.feature_cols).head(sample_rows)
label_sample = train_df.select(ms.label_cols).head(sample_rows)
train_x = np.array(feature_sample)
train_y = np.array(label_sample)
for sample in (train_x, train_y):
    print(sample.shape)

(10, 1, 7306)
(10, 1, 2)


In [3]:
# Model-selection object held by the hyper-model; it exposes the backend,
# the store and the feature/label column names used below.
ms = am.model_selection

# The call returns a 4-tuple; only its third element is kept (as `metadata`).
# NOTE(review): label columns are passed before feature columns here — confirm
# this matches the signature of get_metadata_from_parquet.
_, _, metadata, _ = ms.backend.get_metadata_from_parquet(ms.store, ms.label_cols, ms.feature_cols)
# NOTE(review): presumably workers must be initialised before the data loaders
# and the loaders before the readers are requested — verify before reordering.
ms.backend.initialize_workers()
ms.backend.initialize_data_loaders(ms.store, None, ms.feature_cols + ms.label_cols)
# NOTE(review): the meaning of the argument 40 is not visible in this notebook — confirm.
train_reader, val_reader = ms.backend.data_readers_fn(40)



In [4]:
from petastorm.tf_utils import make_petastorm_dataset
# Wrap the training reader in a dataset. A trailing `.unbatch()` step was tried
# here and is currently disabled.
dataset = make_petastorm_dataset(train_reader)

In [10]:
# Repeat the dataset 10 times. The name is rebound to its own transformation,
# so running this cell again repeats the already-repeated dataset.
dataset = dataset.repeat(count=10)

In [3]:
# Read a single part file of the intermediate training data written under the store.
intermediate_part_path = '/var/nfs/test/intermediate_train_data/part-00041-bcd97a35-79dc-4b42-bbdb-7165a94e9039-c000.snappy.parquet'
df = spark.read.parquet(intermediate_part_path)

In [5]:
# Cap the frame at 100 rows before previewing it.
preview_rows = 100
df = df.limit(preview_rows)
df.show()

+--------------------+--------------------+
|            features|              labels|
+--------------------+--------------------+
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 4D 50 5...|
|[93 4E 55 4D 50 5...|[93 4E 55 

In [19]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame and
# display its first row.
pddf = df.toPandas()
pddf.head(n=1)

Unnamed: 0,features,labels
0,"[147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 3...","[147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 3..."


In [20]:
# Copy the pandas frame's values into a NumPy array and display its
# (rows, columns) shape.
npdf = np.array(pddf.values)
npdf.shape

(100, 2)