In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os

## 13.1 데이터 API

In [None]:
# Build a dataset with one element per slice of X, i.e. the scalars 0..9.
X = tf.range(10)
_dataset = tf.data.Dataset.from_tensor_slices(X)
_dataset

<TensorSliceDataset shapes: (), types: tf.int32>

```python
tf.data.Dataset.range(10)
```

In [None]:
# Iterating over the dataset yields one scalar tensor per element.
for item in _dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


### 13.1.1 연쇄 변환

#### repeat & batch

In [None]:
# repeat(3) chains three passes over the ten elements and batch(7) groups them
# in sevens; with drop_remainder=False the final short batch is kept.
# These calls return a new dataset: _dataset itself is left unchanged.
dataset = _dataset.repeat(3).batch(7, drop_remainder=False)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [None]:
# The first batch(4, drop_remainder=True) discards the incomplete trailing
# group; the second batch(2) then stacks those batches in pairs, so the last
# item holds a single batch of four.
dataset = _dataset.repeat(3).batch(4, drop_remainder=True).batch(2)
for item in dataset:
    print(item)

tf.Tensor(
[[0 1 2 3]
 [4 5 6 7]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[8 9 0 1]
 [2 3 4 5]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[6 7 8 9]
 [0 1 2 3]], shape=(2, 4), dtype=int32)
tf.Tensor([[4 5 6 7]], shape=(1, 4), dtype=int32)


#### map

In [None]:
# map applies the function to every element; here each value is squared.
square_dataset = _dataset.map(lambda x : x ** 2)
for sq in square_dataset:
    print(sq)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(25, shape=(), dtype=int32)
tf.Tensor(36, shape=(), dtype=int32)
tf.Tensor(49, shape=(), dtype=int32)
tf.Tensor(64, shape=(), dtype=int32)
tf.Tensor(81, shape=(), dtype=int32)


####filter

In [None]:
# filter keeps only the elements for which the predicate is true (x < 5).
for ft in _dataset.filter(lambda x : x < 5):
    print(ft)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


####take

In [None]:
# take(3) limits the iteration to the first three elements.
for tk in _dataset.take(3):
    print(tk)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


### 13.1.2 데이터 셔플링

####shuffle

In [None]:
# Shuffle the repeated stream through a 5-element buffer with a fixed seed,
# then group the shuffled elements in batches of seven.
dataset = _dataset.repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int32)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int32)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int32)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int32)
tf.Tensor([3 6], shape=(2,), dtype=int32)


#### csv 파일 생성

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out a test set first, then split the remainder into training and
# validation sets. Targets are reshaped into a single column.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only and keep its mean and scale so
# they can be reused for standardization later in the notebook.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


In [None]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are written to ``datasets/housing`` (created if missing) and named
    ``my_<name_prefix>_<NN>.csv``, where ``NN`` is the zero-padded part index.

    Args:
        data: Indexable 2-D collection of rows (e.g. a NumPy array).
        name_prefix: Label inserted in every file name (e.g. ``"train"``).
        header: Optional header line written at the top of every file.
        n_parts: Number of files to split the rows into.

    Returns:
        The list of file paths that were written, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    def _format_cell(value):
        # repr() of a NumPy scalar is e.g. "np.float64(1.5)" on NumPy >= 2,
        # which is not a valid CSV number. str() gives the plain shortest
        # round-trip text ("1.5") on every NumPy version.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    filepaths = []
    m = len(data)
    # array_split tolerates a row count that is not a multiple of n_parts.
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(_format_cell(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

In [None]:
# Append the target as the last column of each split, so every CSV row holds
# the features followed by the label.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

# Write each split to several CSV files and keep the resulting file paths.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

#### 여러 파일에서 한 줄씩 번갈아 읽기

In [None]:
train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

#### list_files

In [None]:
# list_files returns a dataset of the file paths; as the output shows, the
# paths come back in a shuffled order (made repeatable here by the seed).
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)
for path in filepath_dataset:
    print(path)

tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)
tf.Ten

#### interleave

#### TextLineDataset

In [None]:
# Read from n_readers files at a time, cycling through them one line each.
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath),  # header lines are NOT skipped here
    cycle_length=n_readers)
# Because nothing is skipped, the first five lines printed are the five headers.
for line in dataset.take(10):
    print(line.numpy())

b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'
b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'
b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'
b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'
b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'
b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418'
b'2.4792,24.0,3.4547038327526134,1.1341463414634145,2251.0,3.921602787456446,34.18,-118.38,2.0'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67'
b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'


### 13.1.3 데이터 전처리

In [None]:
import pandas as pd

In [None]:
# NOTE(review): absolute path that presumably only exists on Google Colab;
# confirm before running elsewhere.
file_path = '/content/sample_data/california_housing_train.csv'
# NOTE(review): this rebinds `housing`, `X_train` and `y_train`, which earlier
# cells bound to the scikit-learn dataset and its training split. Later cells
# use these pandas objects, so re-running earlier cells afterwards will see
# different values under the same names.
housing = pd.read_csv(file_path)
X_train = housing.iloc[:, :8]  # first eight columns as features
y_train = housing.iloc[:, -1]  # last column as target

In [None]:
# Standardization statistics read by preprocess().
# They must describe the same columns, in the same order, as the CSV shards in
# train_filepaths (MedInc, HouseAge, ...). Those shards were written from the
# scikit-learn split, so the StandardScaler fitted on that split is reused.
# The pandas frame loaded from file_path has a different column order
# (longitude, latitude, ...); its statistics produce nonsensical values such
# as 62.2 or -63.9 when applied to the shard rows.
X_mean, X_std = scaler.mean_, scaler.scale_
n_inputs = 8

####decode_csv

####stack

In [None]:
def preprocess(line):
    """Parse one CSV record into a standardized feature vector and its target.

    Args:
        line: A CSV record holding ``n_inputs`` feature values followed by
            one target value.

    Returns:
        A ``(features, target)`` pair: the features stacked into one tensor
        and standardized with the module-level ``X_mean`` / ``X_std``, and the
        target as a one-element tensor.
    """
    feature_defaults = [0.] * n_inputs
    # An empty float32 default makes the target column mandatory
    # (see the tf.io.decode_csv documentation on record_defaults).
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default])
    features, target = columns[:-1], columns[-1:]
    x = tf.stack(features)
    y = tf.stack(target)
    standardized = (x - X_mean) / X_std
    return standardized, y

In [None]:
defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
print(defs)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, <tf.Tensor: shape=(0,), dtype=float32, numpy=array([], dtype=float32)>]


In [None]:
# Read the raw CSV lines; the context manager closes the file handle
# (the bare open() previously left it open for the life of the kernel).
with open(file_path) as f:
    lines = f.readlines()

In [None]:
n_readers = 5
dataset = filepath_dataset.interleave(
    # skip() applies to each file separately, so skip(1) drops exactly the
    # header line; skip(5) also threw away the first four data rows of every file.
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)
for line in dataset.take(5):
    print(preprocess(line))

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 62.15784   ,  -0.29252487,  -1.8785055 ,  -1.212307  ,
         0.28846815,  -1.2425393 ,  -1.2154919 , -63.864563  ],
      dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.896], dtype=float32)>)
(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 60.95016  ,  -3.0997527,  -1.86399  ,  -1.2121627,   2.5565612,
        -1.2423905,  -1.2125791, -65.15376  ], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.083], dtype=float32)>)
(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 62.767467 , -10.585693 ,  -1.7748046,  -1.2122576,   2.988353 ,
        -1.2427121,  -1.2139834, -64.10039  ], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.183], dtype=float32)>)
(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 67.10776   ,   2.9825742 ,  -1.5910823 ,  -1.2122772 ,
         0.21966618,  -1.2429751 ,  -1.2149458 , -64.131836  ],
      dtype=float32)>, <tf.Tenso

### 13.1.4 데이터 적재와 전처리를 합치기

####prefetch

In [None]:
def csv_reader_dataset(filepath, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a shuffled, preprocessed and batched dataset from CSV files.

    Args:
        filepath: File path(s) or pattern passed to ``Dataset.list_files``.
        repeat: Number of times the list of files is repeated.
        n_readers: Number of files read concurrently (interleave cycle length).
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        shuffle_buffer_size: Buffer size used when shuffling the lines.
        n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map.
        batch_size: Number of records per batch.

    Returns:
        A dataset of ``preprocess`` outputs, batched and prefetched by one batch.
    """
    def read_lines_without_header(path):
        # Drop the first line of each file (the header row).
        return tf.data.TextLineDataset(path).skip(1)

    file_listing = tf.data.Dataset.list_files(filepath)
    file_listing = file_listing.repeat(repeat)
    records = file_listing.interleave(
        read_lines_without_header,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    shuffled = records.shuffle(shuffle_buffer_size)
    parsed = shuffled.map(preprocess, num_parallel_calls=n_parse_threads)
    batched = parsed.batch(batch_size)
    return batched.prefetch(1)

In [None]:
# Smoke test: pull a single batch and check the feature / target shapes.
dataset = csv_reader_dataset(file_path)
for item in dataset.take(1):
    x, y = item
    print(x.shape, y.shape)

(32, 8) (32, 1)


#### concatenate

####zip

### 13.1.6 tf.keras와 데이터셋 사용하기

In [None]:
train_set = csv_reader_dataset(train_filepaths)

## 13.2 TFRecord 포맷

####TFRecordWriter

In [None]:
# Write two raw byte-string records to a TFRecord file; the context manager
# closes the writer on exit.
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

####TFRecordDataset

In [None]:
# Read the records back: each element is one of the byte strings written above.
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


## 13.3 입력 특성 전처리

####Lambda

```python
model = keras.models.Sequential([
                                    keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps)),
                                 [...]
])
```

#### BatchNormalization

In [None]:
class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics learned by ``adapt``."""

    def adapt(self, data_sample):
        """Store the mean and standard deviation of ``data_sample`` along axis 0."""
        sample_means = np.mean(data_sample, axis=0, keepdims=True)
        sample_stds = np.std(data_sample, axis=0, keepdims=True)
        self.means_ = sample_means
        self.stds_ = sample_stds

    def call(self, inputs):
        """Return ``inputs`` centered by the stored means and scaled by the stored stds."""
        centered = inputs - self.means_
        # Adding the backend epsilon keeps the divisor away from zero.
        divisor = self.stds_ + keras.backend.epsilon()
        return centered / divisor

In [None]:
# Learn the statistics from the training features, then display them.
std_layer = Standardization()
std_layer.adapt(np.array(X_train))
std_layer.means_, std_layer.stds_

(array([[-119.56210824,   35.62522471,   28.58935294, 2643.66441176,
          539.41082353, 1429.57394118,  501.22194118,    3.8835781 ]]),
 array([[2.00510743e+00, 2.13727693e+00, 1.25865668e+01, 2.17988295e+03,
         4.21487054e+02, 1.14781920e+03, 3.84509531e+02, 1.90810040e+00]]))

### 13.3.1 원-핫 벡터를 사용해 범주형 특성 인코딩하기

In [None]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250
...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797


In [None]:
# Define the vocabulary.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# Create the index that corresponds to each category.
indices = tf.range(len(vocab), dtype=tf.int64)

####KeyValueTensorInitializer

####StaticVocabularyTable

#### table.lookup

In [None]:
# Create the lookup-table initializer from the vocabulary and its indices.
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# Number of buckets that receive categories missing from the vocabulary.
num_oov_buckets = 2
# Build the lookup table.
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [None]:
# Convert a list of keys into their values with the lookup table.
# 'DESERT' is not in the vocabulary, so it is mapped to an out-of-vocabulary bucket.
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'ISLAND'])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 4])>

In [None]:
# Indexing the table returns the same indices as table.lookup(categories).
table[categories]

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 4])>

#### tf.one_hot

In [None]:
# One-hot encode the index values; the depth covers the vocabulary entries
# plus the out-of-vocabulary buckets.
tf.one_hot(cat_indices, depth=len(vocab)+num_oov_buckets)

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.]], dtype=float32)>

### 13.3.2 임베딩을 사용해 범주형 특성 인코딩하기

In [None]:
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab)+num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.47571564, 0.5559275 ],
       [0.09746027, 0.8046615 ],
       [0.5173979 , 0.10759997],
       [0.63622975, 0.483765  ],
       [0.95979166, 0.32256567],
       [0.39404988, 0.3427267 ],
       [0.19925404, 0.81551564]], dtype=float32)>

####tf.nn.embedding_lookup

In [None]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.63622975, 0.483765  ],
       [0.39404988, 0.3427267 ],
       [0.09746027, 0.8046615 ],
       [0.95979166, 0.32256567]], dtype=float32)>

####tf.keras.layers.Embedding

In [None]:
embedding = keras.layers.Embedding(input_dim=len(vocab)+num_oov_buckets, output_dim=embedding_dim)

In [None]:
embedding(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.01732547,  0.04977554],
       [-0.00997026, -0.0164363 ],
       [-0.03485181, -0.01046437],
       [-0.00026436, -0.01370205]], dtype=float32)>

In [None]:
# Two inputs: eight numeric features and one string category per instance.
regular_input = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[], dtype=tf.string)

# Map each category string to its integer index with the lookup table.
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# input_dim must cover every index the table can produce: the vocabulary
# entries plus the out-of-vocabulary buckets. The previous hard-coded 6 left
# the last bucket (index 6) outside the embedding matrix.
cat_embed = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                   output_dim=embedding_dim)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_input, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)

model = keras.models.Model(inputs=[regular_input, categories],
                           outputs=[outputs])

In [None]:
# Print the layer-by-layer summary (the bare attribute `model.apply` only
# displays a bound method, not the table shown in the output).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None,)              0           input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2)            12          lambda[0][0]                     
______________________________________________________________________________________________

### 13.3.3 케라스의 전처리 층

## 13.6 연습문제

###10

#### a

In [None]:
from pathlib import Path

# Location and file name of the IMDb review archive.
DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"
FILENAME = "aclImdb_v1.tar.gz"
# Takes about 28 seconds.
filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)

PosixPath('/root/.keras/datasets/aclImdb')

In [None]:
path = Path(filepath).parent / "aclImdb"
path

PosixPath('/root/.keras/datasets/aclImdb')

In [None]:
# Print the directory tree under `path`, listing at most two files per directory.
for name, subdirs, files in os.walk(path):
    # name: each directory visited under the root (path)
    # subdirs: sub-directories of name
    # files: files directly inside name
    indent = len(Path(name).parts) - len(path.parts)  # depth relative to the root
    print("    " * (indent-1) + "*-- " + Path(name).parts[-1] + os.sep)
    for index, filename in enumerate(sorted(files)):
        if index == 2:
            # Stop after two files and print an ellipsis for the rest.
            print("    " * (indent) + "*-- " + "...")
            break
        print("    " * (indent) + "*-- " + filename)    

*-- aclImdb/
*-- README
*-- imdb.vocab
*-- ...
*-- test/
    *-- labeledBow.feat
    *-- urls_neg.txt
    *-- ...
    *-- pos/
        *-- 0_10.txt
        *-- 10000_7.txt
        *-- ...
    *-- neg/
        *-- 0_2.txt
        *-- 10000_4.txt
        *-- ...
*-- train/
    *-- labeledBow.feat
    *-- unsupBow.feat
    *-- ...
    *-- unsup/
        *-- 0_0.txt
        *-- 10000_0.txt
        *-- ...
    *-- pos/
        *-- 0_9.txt
        *-- 10000_8.txt
        *-- ...
    *-- neg/
        *-- 0_3.txt
        *-- 10000_4.txt
        *-- ...


In [None]:
def review_paths(dirpath):
    """Return the ``*.txt`` files directly inside ``dirpath`` as sorted path strings.

    ``Path.glob`` yields entries in an arbitrary, filesystem-dependent order.
    Sorting makes the result deterministic, so anything that slices the list
    by position gets the same files on every run and machine.

    Args:
        dirpath: ``pathlib.Path`` of the directory to scan (not recursive).

    Returns:
        A list of file paths as ``str``, in ascending lexicographic order.
    """
    return sorted(str(path) for path in dirpath.glob('*.txt'))

In [None]:
train_pos = review_paths(path/'train'/'pos')
train_neg = review_paths(path/'train'/'neg')
test_valid_pos = review_paths(path/'test'/'pos')
test_valid_neg = review_paths(path/'test'/'neg')
len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)

(12500, 12500, 12500, 12500)

####b

In [None]:
# Split the 12,500 + 12,500 held-out reviews into 5,000 test and 7,500
# validation reviews per class. Both classes are shuffled (previously only the
# positive list was) with a dedicated seeded generator, and the source lists
# are not mutated, so re-running this cell reproduces the same split.
split_rng = np.random.RandomState(42)
pos_order = split_rng.permutation(len(test_valid_pos))
neg_order = split_rng.permutation(len(test_valid_neg))

test_pos = [test_valid_pos[i] for i in pos_order[:5000]]
test_neg = [test_valid_neg[i] for i in neg_order[:5000]]
valid_pos = [test_valid_pos[i] for i in pos_order[5000:]]
valid_neg = [test_valid_neg[i] for i in neg_order[5000:]]

#### c

In [None]:
def imdb_dataset(filepaths_positive, filepaths_negative):
    """Load every review into memory and return a ``(review, label)`` dataset.

    Negative reviews come first (label 0), followed by positive reviews
    (label 1).

    Args:
        filepaths_positive: Paths of the positive review text files.
        filepaths_negative: Paths of the negative review text files.

    Returns:
        A ``tf.data.Dataset`` of ``(review_text, label)`` pairs.
    """
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
        for filepath in filepaths:
            # Decode explicitly as UTF-8: the default encoding of open()
            # depends on the platform and can fail on non-ASCII reviews.
            with open(filepath, encoding="utf-8") as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices(
        (tf.constant(reviews), tf.constant(labels)))

In [None]:
%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(2): pass

1 loop, best of 1: 2.78 s per loop


##### tf.data.TextLineDataset

In [None]:
def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):
    """Stream the reviews from disk and return a ``(review, label)`` dataset.

    Each text line becomes one element. Positive lines (label 1) come first,
    followed by negative lines (label 0).

    Args:
        filepaths_positive: Paths of the positive review text files.
        filepaths_negative: Paths of the negative review text files.
        n_read_threads: Number of files read in parallel per class.

    Returns:
        The positive dataset concatenated with the negative dataset.
    """
    def labeled_lines(filepaths, label):
        # Pair every line read from `filepaths` with the constant `label`.
        lines = tf.data.TextLineDataset(filepaths,
                                        num_parallel_reads=n_read_threads)
        return lines.map(lambda review: (review, label))

    dataset_neg = labeled_lines(filepaths_negative, 0)
    dataset_pos = labeled_lines(filepaths_positive, 1)
    return dataset_pos.concatenate(dataset_neg)

In [None]:
%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(2): pass

1 loop, best of 1: 15.2 s per loop


##### cache

In [None]:
%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).cache().repeat(2): pass

1 loop, best of 1: 20.6 s per loop


In [None]:
batch_size = 32

# Only the training set is shuffled; the buffer of 25000 matches the number of
# training reviews (12500 positive + 12500 negative). Every set is batched and
# prefetched by one batch.
train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)
valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)
test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)

#### d

In [None]:
# Inspect one batch of (reviews, labels) from the training set.
for item in train_set.take(1):
    print(item)

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'I will admit that I have seen maybe five minutes of "Jerry Springer". I don\'t consider myself a snob, but I really think that I am above watching what\'s on his show. You should try to elevate yourself above that too.<br /><br />I saw this movie as part of a social studies event I was conducting. I was told that this movie really had little to do with Springer himself, rather it centered on the lives of those who would appear on "Springer." Handled better, this movie might have actually been a fascinating look at how pathetic these people\'s lives actually are. I will admit, I felt a twinge of empathy for Connie (Molly Hagan). This is all she has in life. How sad that she feels she must go on Jerry\'s show in order to resolve this.<br /><br />I really feel sorry for Molly Hagan appearing in this. Have you noticed that after this movie, she has mainly been relegated to "B" roles on TV? I will say this about Hagan. She is an extrem