In [1]:
import os

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files

import autokeras as ak

In [2]:
# Download the IMDB review archive and extract it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

# Path of the extracted "aclImdb" folder.
# NOTE(review): assumes get_file returned the archive path, so that the
# extracted folder is a sibling of it — confirm for the installed Keras version.
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")

# Load only the "pos" and "neg" categories; the train split is shuffled,
# the test split is not.
classes = ["pos", "neg"]
train_data = load_files(
    os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
)
test_data = load_files(
    os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
)

# Convert the loaded documents and their integer targets to NumPy arrays.
x_train = np.array(train_data.data)
y_train = np.array(train_data.target)
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000,)
print(x_train[0][:50])  # first 50 bytes of the first review (a bytes value, not str)

(25000,)
(25000,)
b'Zero Day leads you to think, even re-think why two'


In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pyspark.sql import SparkSession
import numpy as np
import os

# os.environ["PYSPARK_PYTHON"] = '/usr/bin/python3.6'
# os.environ["PYSPARK_DRIVER_PYTHON"] = '/usr/bin/python3.6'

from pyspark import SparkConf

# Get the active Spark session, or create one named "Cerebro Example".
spark = (
    SparkSession.builder
    .appName("Cerebro Example")
    .getOrCreate()
)

# NOTE(review): the bare Ellipsis below is a no-op expression kept from the
# original cell; presumably it marks configuration that was elided — confirm.
...

# Root directory under which the store keeps its files. The default is the
# original author's local checkout; set the CEREBRO_WORK_DIR environment
# variable to run the notebook on another machine without editing this cell.
work_dir = os.environ.get(
    "CEREBRO_WORK_DIR",
    '/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/',
)

# Single-worker Cerebro backend on top of the Spark context.
backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1)

# os.path.join (instead of string concatenation) keeps the prefix valid even
# when the configured work_dir has no trailing slash.
store = LocalStore(prefix_path=os.path.join(work_dir, 'test/'))

21/12/07 03:31:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


CEREBRO => Time: 2021-12-07 03:31:44, Running 1 Workers


In [4]:
# Pair every review with its label as a two-column array, one array per split.
# Note: this rebinds train_data / test_data, which previously held the
# load_files results.
train_data = np.column_stack([x_train, y_train])
test_data = np.column_stack([x_test, y_test])

# Append the test rows after the train rows and show the combined shape.
all_data = np.vstack([train_data, test_data])
all_data.shape

(50000, 2)

In [5]:
# Look at the first review in the combined array; it is still a raw bytes value.
all_data[0,0]

b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [6]:
# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder. Alias the new
# class to the old name so the rest of the cell runs on either version.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Each all_data row is (review bytes, label): decode the text to str and cast
# the label to int before handing the rows to Spark.
dff = map(lambda x: (x[0].decode('UTF-8'), int(x[1])), all_data)
mydf = spark.createDataFrame(dff, schema=["features", "label"])

# One-hot encode the integer label into a new "label_OHE" vector column.
# dropLast=False keeps one slot per class instead of dropping the last one.
encoder = OneHotEncoderEstimator(dropLast=False)
encoder.setInputCols(["label"])
encoder.setOutputCols(["label_OHE"])

encoder_model = encoder.fit(mydf)
encoded = encoder_model.transform(mydf)

# Column names used as model inputs and targets.
feature_columns = ['features']
label_columns = ['label_OHE']

encoded.show(5)

21/12/07 01:36:05 WARN TaskSetManager: Stage 0 contains a task of very large size (5408 KB). The maximum recommended task size is 100 KB.
                                                                                

+--------------------+-----+-------------+
|            features|label|    label_OHE|
+--------------------+-----+-------------+
|Zero Day leads yo...|    1|(2,[1],[1.0])|
|Words can't descr...|    0|(2,[0],[1.0])|
|Everyone plays th...|    1|(2,[1],[1.0])|
|There are a lot o...|    0|(2,[0],[1.0])|
|I've just had the...|    0|(2,[0],[1.0])|
+--------------------+-----+-------------+
only showing top 5 rows



21/12/07 01:36:06 WARN TaskSetManager: Stage 2 contains a task of very large size (5408 KB). The maximum recommended task size is 100 KB.


In [7]:
# Persist the encoded DataFrame so the later load cell can read it back.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises once the target path already exists.
encoded.write.mode("overwrite").save("/Users/zijian/Desktop/ucsd/cse234/project/imdb/imdb.parquet")

21/12/07 01:36:10 WARN TaskSetManager: Stage 3 contains a task of very large size (5408 KB). The maximum recommended task size is 100 KB.
[Stage 3:>                                                        (0 + 12) / 12]21/12/07 01:36:11 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/12/07 01:36:11 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/12/07 01:36:11 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
21/12/07 01:36:11 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
21/12/07 01:36:11 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 61.43% for 11 writers
21/12/07 01:36:

In [2]:
# Columns kept for training: the review text and the one-hot label vector.
feature_columns = ['features']
label_columns = ['label_OHE']

# Read the saved dataset back and keep only those columns.
df = (
    spark.read
    .load("/Users/zijian/Desktop/ucsd/cse234/project/imdb/imdb.parquet")
    .select(feature_columns + label_columns)
)
df.show(5)

# 80/20 random split with a fixed seed.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=100)

+--------------------+-------------+
|            features|    label_OHE|
+--------------------+-------------+
|Lauren Himmel's d...|(2,[1],[1.0])|
|The Pickle was th...|(2,[1],[1.0])|
|This is a great o...|(2,[1],[1.0])|
|I liked this show...|(2,[1],[1.0])|
|I have watched Fa...|(2,[1],[1.0])|
+--------------------+-------------+
only showing top 5 rows



In [3]:
from keras_tuner.engine import hyperparameters
import autokeras as ak
from cerebro.nas.hphpmodel import HyperHyperModel

# Search graph: text input -> n-gram TextBlock -> ClassificationHead.
input_node = ak.TextInput()
text_features = ak.TextBlock(block_type="ngram")(input_node)
# NOTE(review): multi_label=True is presumably what makes the head match the
# two-element one-hot label_OHE column — confirm before changing it.
output_node = ak.ClassificationHead(num_classes=2, multi_label=True)(text_features)
am = HyperHyperModel(input_node, output_node, seed=2000)

# Attach the Cerebro backend and store, and name the columns to train on.
am.resource_bind(
    backend=backend,
    store=store,
    feature_columns=feature_columns,
    label_columns=label_columns,
    evaluation_metric='accuracy',
)

# Tuner configuration. An alternative tried by the author: tuner="randomsearch".
tuner_settings = dict(
    tuner="greedy",
    hyperparameters=None,
    objective="val_accuracy",
    max_trials=2,
    overwrite=True,
    exploration=0.3,
)
am.tuner_bind(**tuner_settings)

In [4]:
# Launch the search on train_df with epochs=2 and keep the returned result.
# NOTE(review): the output recorded under this cell ends in a NotImplementedError
# about saving TextVectorization weights in h5 format — confirm the cell
# actually completes before relying on `rel`.
rel = am.fit(train_df,epochs=2)

CEREBRO => Time: 2021-12-07 03:31:47, Preparing Data
CEREBRO => Time: 2021-12-07 03:31:47, Num Partitions: 12
CEREBRO => Time: 2021-12-07 03:31:47, Writing DataFrames
CEREBRO => Time: 2021-12-07 03:31:47, Train Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/test/intermediate_train_data
CEREBRO => Time: 2021-12-07 03:31:47, Val Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/test/intermediate_val_data


                                                                                

CEREBRO => Time: 2021-12-07 03:31:50, Train Partitions: 9


                                                                                

CEREBRO => Time: 2021-12-07 03:32:00, Val Partitions: 2


                                                                                

CEREBRO => Time: 2021-12-07 03:32:12, Train Rows: 32056
CEREBRO => Time: 2021-12-07 03:32:12, Val Rows: 7914
CEREBRO => Time: 2021-12-07 03:32:12, Initializing Workers
CEREBRO => Time: 2021-12-07 03:32:12, Initializing Data Loaders


2021-12-07 03:32:22.426482: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-07 03:32:22.426697: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.



Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
text_block_1/ma...|5000              |?                 
text_block_1/te...|2                 |?                 
text_block_1/de...|True              |?                 
text_block_1/de...|1                 |?                 
text_block_1/de...|256               |?                 
text_block_1/de...|0.25              |?                 
text_block_1/de...|256               |?                 
classification_...|0                 |?                 
optimizer         |adam              |?                 
learning_rate     |0.001             |?                 



2021-12-07 03:32:22.925167: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 9:>                                                          (0 + 1) / 1]

NotImplementedError: Save or restore weights that is not an instance of `tf.Variable` is not supported in h5, use `save_format='tf'` instead. Got a model or layer TextVectorization with weights [<tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x180882a50>, <tf.Variable 'idf:0' shape=(5000,) dtype=float32, numpy=
array([2.3978953, 0.6931472, 0.6466272, ..., 2.3978953, 2.3978953,
       2.3978953], dtype=float32)>]

[Stage 9:>                                                          (0 + 1) / 1]

In [6]:
# Display the object returned by am.fit (a cerebro.tune.ModelSelectionResult
# in the recorded output).
rel

<cerebro.tune.ModelSelectionResult at 0x1812ad810>

In [11]:
from cerebro.backend.spark.util import _get_metadata
from petastorm.unischema import UnischemaField

# NOTE(review): spark_to_petastorm_type, petastorm_unischema_shape and
# petastorm_unischema_codec are not defined or imported in any visible cell;
# they must already be in the kernel namespace for this cell to run.

# Build one petastorm UnischemaField per column described by the metadata.
unischema_fields = []
metadata = _get_metadata(train_df)
for k in metadata.keys():
    # Named field_dtype rather than `type` so the builtin type() is not
    # shadowed for the rest of the kernel session.
    field_dtype = spark_to_petastorm_type(metadata[k]['spark_data_type'])
    shape = petastorm_unischema_shape(metadata[k]['shape'])
    codec = petastorm_unischema_codec(metadata[k]['shape'], metadata[k]['spark_data_type'])
    # The trailing False is the field's `nullable` flag.
    unischema_fields.append(UnischemaField(k, field_dtype, shape, codec, False))

                                                                                

In [12]:
# Show the UnischemaField entries built for each metadata column.
unischema_fields

[UnischemaField(name='features', numpy_dtype=<class 'numpy.uint8'>, shape=(), codec=<petastorm.codecs.ScalarCodec object at 0x17b1d1190>, nullable=False),
 UnischemaField(name='label', numpy_dtype=<class 'numpy.int64'>, shape=(), codec=<petastorm.codecs.ScalarCodec object at 0x17b1d11d0>, nullable=False),
 UnischemaField(name='label_OHE', numpy_dtype=<class 'numpy.float64'>, shape=(2,), codec=<petastorm.codecs.NdarrayCodec object at 0x17b1d10d0>, nullable=False)]

In [13]:
# Show the per-column metadata dict returned by _get_metadata(train_df).
metadata

{'features': {'spark_data_type': pyspark.sql.types.StringType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'label': {'spark_data_type': pyspark.sql.types.LongType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'label_OHE': {'spark_data_type': pyspark.ml.linalg.SparseVector,
  'is_sparse_vector_only': True,
  'shape': 2,
  'intermediate_format': 'custom_sparse_format',
  'max_size': 1}}

In [14]:
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row
# Bundle the previously built fields into a Unischema named 'petastorm_schema'
# and print its text form.
petastorm_schema = Unischema('petastorm_schema', unischema_fields)
print(petastorm_schema)

Unischema(petastorm_schema, [
  UnischemaField('features', uint8, (), <petastorm.codecs.ScalarCodec object at 0x17b1d1190>, False),
  UnischemaField('label', int64, (), <petastorm.codecs.ScalarCodec object at 0x17b1d11d0>, False),
  UnischemaField('label_OHE', float64, (2,), <petastorm.codecs.NdarrayCodec object at 0x17b1d10d0>, False),
])


In [15]:
# Peek at one row of train_df, converted to a plain dict.
print(
    train_df.rdd
    .map(lambda row: row.asDict())
    .take(1)
)

[{'features': '"... the beat is too strong ... we\'re deaf mutants now--like them", Rex Voorhas Ormine<br /><br />I am surprised that this movie has been uniformly bashed. Let me be the first to actually discuss the virtues of "The Beat" and why YOU MUST SEE THIS FILM NOW.<br /><br />Make no mistake, this movie is cheesy and "bad" in the conventional sense: the story is preposterous, the poetry is silly, and the acting is inconsistent.<br /><br />But these are the film\'s CHARMS--all of these ingredients form the recipe for one of the most UNDERAPPRECIATED CHEEZY FILMS of the 80\'s.<br /><br />If the reference to "deaf mutants" didn\'t pique your interest, then perhaps this will: What kind of name is "Rex Voorhas Ormine", anyway? It is such an unusual name (for North American audiences) that I said to myself, "even the names of the characters in this friggin\' movie are firggin\' silly."<br /><br />Well, "The Beat" is so fabulously cheezy that the "meaning" and "symbolism" behind "Rex 

In [16]:
# Same peek, but with every field passed through create_array using the
# petastorm dtype derived from the column's Spark type.
# NOTE(review): create_array and spark_to_petastorm_type are not defined in any
# visible cell; they must already exist in the kernel namespace.
print(
    train_df.rdd
    .map(lambda row: row.asDict())
    .map(
        lambda record: {
            col: create_array(record, col, spark_to_petastorm_type(metadata[col]['spark_data_type']))
            for col in record
        }
    )
    .take(1)
)

[Stage 8:>                                                          (0 + 1) / 1]

[{'features': array([34, 46, 46, ..., 32, 47, 62], dtype=uint8), 'label': array(1), 'label_OHE': array([0., 1.])}]


                                                                                

In [17]:
def map_to_np(x, k, dtype):
    """Return field ``k`` of ``x``, wrapped in a NumPy array unless ``dtype`` is uint8.

    Args:
        x: Mapping (for example a row dict) holding the field.
        k: Key of the field to read from ``x``.
        dtype: Target NumPy dtype for the field.

    Returns:
        ``x[k]`` unchanged when ``dtype`` equals ``np.uint8``; otherwise
        ``np.array(x[k], dtype=dtype)``.

    Note:
        In this notebook's recorded schema the string ``features`` column is the
        one typed as uint8, so the pass-through presumably exists to keep text
        values as plain strings.
    """
    value = x[k]
    if dtype == np.uint8:
        return value
    return np.array(value, dtype=dtype)

In [18]:
# Peek at one row after converting each field with map_to_np, using the
# petastorm dtype derived from the column's Spark type.
print(
    train_df.rdd
    .map(lambda row: row.asDict())
    .map(
        lambda record: {
            col: map_to_np(record, col, spark_to_petastorm_type(metadata[col]['spark_data_type']))
            for col in record
        }
    )
    .take(1)
)

[Stage 9:>                                                          (0 + 1) / 1]

[{'features': '"... the beat is too strong ... we\'re deaf mutants now--like them", Rex Voorhas Ormine<br /><br />I am surprised that this movie has been uniformly bashed. Let me be the first to actually discuss the virtues of "The Beat" and why YOU MUST SEE THIS FILM NOW.<br /><br />Make no mistake, this movie is cheesy and "bad" in the conventional sense: the story is preposterous, the poetry is silly, and the acting is inconsistent.<br /><br />But these are the film\'s CHARMS--all of these ingredients form the recipe for one of the most UNDERAPPRECIATED CHEEZY FILMS of the 80\'s.<br /><br />If the reference to "deaf mutants" didn\'t pique your interest, then perhaps this will: What kind of name is "Rex Voorhas Ormine", anyway? It is such an unusual name (for North American audiences) that I said to myself, "even the names of the characters in this friggin\' movie are firggin\' silly."<br /><br />Well, "The Beat" is so fabulously cheezy that the "meaning" and "symbolism" behind "Rex 

                                                                                

In [20]:
# Full conversion for one row: Row -> dict -> map_to_np on every field ->
# Spark Row built against petastorm_schema.
print(
    train_df.rdd
    .map(lambda row: row.asDict())
    .map(
        lambda record: {
            col: map_to_np(record, col, spark_to_petastorm_type(metadata[col]['spark_data_type']))
            for col in record
        }
    )
    .map(lambda record: dict_to_spark_row(petastorm_schema, record))
    .take(1)
)

[Stage 11:>                                                         (0 + 1) / 1]

[Row(features='"... the beat is too strong ... we\'re deaf mutants now--like them", Rex Voorhas Ormine<br /><br />I am surprised that this movie has been uniformly bashed. Let me be the first to actually discuss the virtues of "The Beat" and why YOU MUST SEE THIS FILM NOW.<br /><br />Make no mistake, this movie is cheesy and "bad" in the conventional sense: the story is preposterous, the poetry is silly, and the acting is inconsistent.<br /><br />But these are the film\'s CHARMS--all of these ingredients form the recipe for one of the most UNDERAPPRECIATED CHEEZY FILMS of the 80\'s.<br /><br />If the reference to "deaf mutants" didn\'t pique your interest, then perhaps this will: What kind of name is "Rex Voorhas Ormine", anyway? It is such an unusual name (for North American audiences) that I said to myself, "even the names of the characters in this friggin\' movie are firggin\' silly."<br /><br />Well, "The Beat" is so fabulously cheezy that the "meaning" and "symbolism" behind "Rex 

                                                                                

In [5]:
from cerebro.backend.spark import util
# Ask get_simple_meta_from_parquet for the metadata of the feature and label
# columns; keep the third and fourth returned values (column metadata and
# average row size) and discard the first two.
# NOTE(review): the meaning of the two trailing None arguments is not visible
# from this notebook — check the helper's signature.
_, _, meta, avg_row_size = util.get_simple_meta_from_parquet(store, feature_columns+label_columns, None, None)

In [6]:
# Show the column metadata returned by get_simple_meta_from_parquet.
meta

{'features': {'spark_data_type': pyspark.sql.types.StringType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'label_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

[Stage 9:>                                                          (0 + 1) / 1]