In [12]:
# some imports

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# get the dataset

fashion_mnist = keras.datasets.fashion_mnist
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()
X_train_full.shape

(60000, 28, 28)

In [3]:
# validation set

X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test / 255.0
X_train = X_train.astype('float32')
X_valid = X_valid.astype('float32')
X_test = X_test.astype('float32')

In [4]:
from tensorflow.core.example.feature_pb2 import Feature, Features, BytesList, Int64List
from tensorflow.core.example.example_pb2 import Example

def get_example_protobuf(picture, label):
    pic_feature = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(picture).numpy()]))
    label_feature = Feature(int64_list=Int64List(value=[label]))
    features = Features(
        feature={
            "picture": pic_feature,
            "label": label_feature
        })
    return Example(features=features)

file_idx = 1
for start_idx in range(0, 55000, 5000):
    with tf.io.TFRecordWriter("13-data/train" + str(file_idx) + ".tfrecord") as f:
        for pic, label in zip(X_train[start_idx:start_idx+5000], y_train[start_idx:start_idx+5000]):
            ex_pf = get_example_protobuf(pic, label)
            f.write(ex_pf.SerializeToString())
    file_idx += 1
    
with tf.io.TFRecordWriter("13-data/valid.tfrecord") as f:
    for pic, label in zip(X_valid, y_valid):
        ex_pf = get_example_protobuf(pic, label)
        f.write(ex_pf.SerializeToString())
        
with tf.io.TFRecordWriter("13-data/test.tfrecord") as f:
    for pic, label in zip(X_test, y_test):
        ex_pf = get_example_protobuf(pic, label)
        f.write(ex_pf.SerializeToString())
        

In [5]:
train_filepaths = ["13-data/train{}.tfrecord".format(idx+1) for idx in range(11)]
valid_filepaths = "13-data/valid.tfrecord"
test_filepaths = "13-data/test.tfrecord"

In [6]:
# define preprocessing

feature_description = {
    "picture": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "label": tf.io.FixedLenFeature([], tf.int64, default_value=0)
}

def preprocess(serialized_example):
    features = tf.io.parse_single_example(serialized_example, features=feature_description)
    picture = tf.io.parse_tensor(features["picture"], 'float32')
    picture = tf.reshape(picture, [28, 28])
    label = features["label"]
    return picture, label

def tfrecord_reader_dataset(filepaths, repeat=1, n_readers=5, 
                            n_read_threads=None, shuffle_buffer_size=10000,
                            n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(lambda filepath: tf.data.TFRecordDataset(filepath), 
                                 cycle_length=n_readers, num_parallel_calls=1)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)

In [7]:
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)
    def call(self, inputs):
        return (inputs - self.means_) / (self.stds_ + K.epsilon())
    
std_layer = Standardization()
std_layer.adapt(X_train)

In [8]:
model = keras.models.Sequential()
model.add(std_layer)
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dense(300, activation="relu"))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [9]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

In [10]:
train_set = tfrecord_reader_dataset(train_filepaths)
valid_set = tfrecord_reader_dataset(valid_filepaths)
test_set = tfrecord_reader_dataset(test_filepaths)

In [11]:
import os
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_logdir()

In [12]:
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
history = model.fit(train_set, epochs=30, validation_data=valid_set, callbacks=[tensorboard_cb])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

KeyboardInterrupt: 

# Exercise 10

In [1]:
# some imports

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os

train_pos = ["13-data/imdb/train/pos/" + file for file in os.listdir("13-data/imdb/train/pos")]
train_neg = ["13-data/imdb/train/neg/" + file for file in os.listdir("13-data/imdb/train/neg")]
valid_pos = ["13-data/imdb/valid/pos/" + file for file in os.listdir("13-data/imdb/valid/pos")]
valid_neg = ["13-data/imdb/valid/neg/" + file for file in os.listdir("13-data/imdb/valid/neg")]
test_pos = ["13-data/imdb/test/pos/" + file for file in os.listdir("13-data/imdb/test/pos")]
test_neg = ["13-data/imdb/test/neg/" + file for file in os.listdir("13-data/imdb/test/neg")]

In [3]:
def pos_labeler(example):
    return example, tf.constant(1)

def neg_labeler(example):
    return example, tf.constant(0)

In [4]:
def txt_dataset(pos_paths, neg_paths,
                n_readers = 5, n_read_threads=None,
                shuffle_buffer_size=10000,
                batch_size=32):
    pos_dataset = tf.data.Dataset.list_files(pos_paths[:200])
    neg_dataset = tf.data.Dataset.list_files(neg_paths[:200])
    pos_dataset = pos_dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath), 
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    neg_dataset = neg_dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath), 
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    pos_dataset = pos_dataset.map(pos_labeler, num_parallel_calls=n_read_threads)
    neg_dataset = neg_dataset.map(neg_labeler, num_parallel_calls=n_read_threads)
    dataset = pos_dataset.concatenate(neg_dataset)
    dataset = dataset.shuffle(shuffle_buffer_size)
    return dataset.batch(batch_size).prefetch(1)

In [5]:
train_dataset = txt_dataset(train_pos, train_neg)
valid_dataset = txt_dataset(valid_pos, valid_neg)
test_dataset = txt_dataset(test_pos, test_neg)

In [8]:
vocab_generation_dataset_pos = tf.data.Dataset.list_files(train_pos[:500])
vocab_generation_dataset_neg = tf.data.Dataset.list_files(train_neg[:500])
vocab_generation_dataset_pos = vocab_generation_dataset_pos.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath))
vocab_generation_dataset_neg = vocab_generation_dataset_neg.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath))
vocab_generation_dataset = vocab_generation_dataset_pos.concatenate(vocab_generation_dataset_neg).shuffle(50)
print('a')

# most code is from https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_features = 5000  # Maximum vocab size.
max_len = 40  # Sequence length to pad the outputs to.
embedding_dims = 50

# Create the layer.
vectorize_layer = TextVectorization(
  max_tokens=max_features,
  output_mode='int',
  output_sequence_length=max_len)

print('b')
# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset in memory.
vectorize_layer.adapt(vocab_generation_dataset)
print('c')

# Create the model that uses the vectorize text layer
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of (1,)
# (because we need to guarantee that there is exactly one string input per
# batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this layer,
# we have a tensor of shape (batch_size, max_len) containing vocab indices.
model.add(vectorize_layer)

# Next, we add a layer to map those vocab indices into a space of
# dimensionality 'embedding_dims'. Note that we're using max_features+1 here,
# since there's an OOV token that gets added to the vocabulary in
# vectorize_layer.
model.add(tf.keras.layers.Embedding(max_features+1, embedding_dims))

# At this point, you have embedded float data representing your tokens, and
# can add whatever other layers you need to create your model.

a
b


IndexError: list index out of range

In [6]:
for item in train_dataset.take(3):
    print(item)

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b"I've been willing to put up with a lot from late-spring/summer action fluff movies, but in general that's been due to the fact that most of them have reasonable payoff (i.e. cool special effects, interesting plot twists, comic value, Steve Buscemi, etc.). This movie, however, had none of this. All that we got was the cheap thrill of several minutes of Eva Longoria's cleavage (an issue of Maxim is cheaper than a movie ticket). There is an embarrassing lack of plot, suspense, back story, character development, continuity, etc. I would get into specifics, but quite frankly I've already-maybe willingly-forgotten most of the movie.<br /><br />The entire time I was in the theater, I was kicking myself for not just spending the afternoon watching a 24 season on DVD. Save your money on this one, folks. Unless you really, really, really like Eva Longoria's cleavage.",
       b'I read in the papers that W.Snipes was broke so no wonder he wo