### Loading a dataset

In [35]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Load the IMDB reviews corpus. `as_supervised=True` is what lets each element be
# unpacked as a (text, label) pair below; `with_info=True` adds the `info` object.
dataset, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

train_data = dataset["train"]
test_data = dataset["test"]

# Peek at the first three raw training examples.
for txt, label in train_data.take(3):
    review = txt.numpy().decode('utf-8')
    print(f"Texto: {review}")
    print(f"Label: {label.numpy()}\n")

Texto: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label: 0

Texto: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The p

2025-03-10 10:44:06.854843: W tensorflow/core/kernels/data/cache_dataset_ops.cc:914] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Vectorization

In [36]:
VOCAB_SIZE = 10_000
MAX_LEN = 100

# Vectorization layer: integer-encodes text with a vocabulary capped at VOCAB_SIZE
# tokens and a fixed output length of MAX_LEN entries per example.
vectorizer = tf.keras.layers.TextVectorization(
    output_mode="int",
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LEN,
)

# Only the review texts are needed at this stage, so the labels are dropped.
train_text = train_data.map(lambda review, _label: review)

# Fit the layer on the training texts, fed in batches of 64.
vectorizer.adapt(train_text.batch(64))

2025-03-10 10:44:12.364995: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [37]:
def preprocess(txt, label):
    """Vectorize one example's text and pass its label through unchanged.

    Args:
        txt: Text input handed to the module-level ``vectorizer``.
        label: Label associated with ``txt``; returned as-is.

    Returns:
        Tuple ``(vectorizer(txt), label)``.
    """
    return vectorizer(txt), label

### Saving data as tfrecord

In [38]:
import numpy as np

def serialize_example(txt_tensor, label):
    """Serialize one (token ids, label) pair as a ``tf.train.Example`` byte string.

    Args:
        txt_tensor: Object exposing ``.numpy()`` that returns the token ids of
            one example; the values are cast to int64 and flattened.
        label: Object exposing ``.numpy()`` that returns a single integer label.

    Returns:
        The result of ``SerializeToString()`` on a ``tf.train.Example`` holding
        two int64-list features: ``'text'`` (all token ids) and ``'label'``
        (exactly one value).
    """
    # Plain Python ints are used on purpose. The previous code put a 0-d
    # ``np.ndarray`` (the result of ``np.array(scalar)``) inside the label list,
    # and ndarray objects are not accepted as scalar int values by every
    # protobuf runtime. ``int`` is accepted everywhere.
    token_ids = np.asarray(txt_tensor.numpy()).astype(np.int64).ravel().tolist()
    label_id = np.asarray(label.numpy(), dtype=np.int64).item()

    feature = {
        'text': tf.train.Feature(int64_list=tf.train.Int64List(value=token_ids)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label_id]))
    }

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

def save_as_tfrecord(dataset, filename):
    """Write every (text, label) element of ``dataset`` to a TFRecord file.

    Args:
        dataset: Iterable yielding ``(text, label)`` pairs accepted by
            ``serialize_example``.
        filename: Path of the TFRecord file to create.

    Each element is serialized with ``serialize_example`` and written in
    iteration order; the writer is closed when the ``with`` block exits.
    """
    with tf.io.TFRecordWriter(filename) as writer:
        for token_ids, target in dataset:
            writer.write(serialize_example(token_ids, target))

In [None]:
# Each split is vectorized first, so the TFRecord files store data that is
# already pre-processed.
train_data_processed = train_data.map(preprocess)
save_as_tfrecord(train_data_processed, 'train.tfrecord')

test_data_processed = test_data.map(preprocess)
save_as_tfrecord(test_data_processed, 'test.tfrecord')

### Loading data

In [40]:
def parse_tfrecord_fn(serialized_example):
    """Decode one serialized ``tf.train.Example`` into a ``(text, label)`` pair.

    Args:
        serialized_example: A serialized example as read from the TFRecord file.

    Returns:
        Tuple ``(text, label)``: ``text`` is parsed as a fixed-length int64
        feature of ``MAX_LEN`` entries, ``label`` as a scalar int64 feature.
    """
    # The schema must match what was written: MAX_LEN ids per text, one label.
    text_spec = tf.io.FixedLenFeature([MAX_LEN], tf.int64)
    label_spec = tf.io.FixedLenFeature([], tf.int64)
    schema = {'text': text_spec, 'label': label_spec}

    parsed = tf.io.parse_single_example(serialized_example, schema)
    return parsed['text'], parsed['label']

In [41]:
# Read the TFRecord files back and decode every record into a (text, label) pair
# with `parse_tfrecord_fn`.
train_dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_tfrecord_fn)
test_dataset = tf.data.TFRecordDataset('test.tfrecord').map(parse_tfrecord_fn)

In [42]:
def apply_preprocess(train_data, test_data, batch_size=32, shuffle_buffer_size=10_000):
    """Turn the two splits into batched, prefetching input pipelines.

    Args:
        train_data: Training dataset; it is shuffled, batched and prefetched.
        test_data: Test dataset; it is batched and prefetched, never shuffled.
        batch_size: Number of examples per batch for both splits (default 32).
        shuffle_buffer_size: Buffer size passed to ``shuffle`` for the training
            split (default 10_000).

    Returns:
        Tuple ``(train_data, test_data)`` with the transformed datasets.
    """
    # Shuffle comes before batch, so it is applied to individual examples.
    train_data = (
        train_data
        .shuffle(shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    test_data = test_data.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return train_data, test_data

In [43]:
# NOTE(review): this rebinds `train_data` / `test_data`, which until this cell held
# the raw (text, label) splits loaded from TFDS. From here on they are the batched
# pipelines built from the TFRecord files, so re-running the earlier cells that
# call `train_data.map(...)` on raw text will receive these batched datasets
# instead. Consider distinct names — TODO confirm nothing relies on the rebinding.
train_data, test_data = apply_preprocess(train_dataset, test_dataset)

In [44]:
# Display the dataset repr to check the element_spec of the training pipeline.
train_data

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [45]:
# Display the dataset repr to check the element_spec of the test pipeline.
test_data

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [46]:
# Pull a single batch to sanity-check the shapes and look at its first example.
for texts, labels in train_data.take(1):
    first_text, first_label = texts[0], labels[0]
    print(f"Shape dos textos: {texts.shape}")
    print(f"Shape dos labels: {labels.shape}")
    print(f"Exemplo de texto: {first_text}")
    print(f"Exemplo de label: {first_label}")

2025-03-10 10:45:45.660410: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:370] TFRecordDataset `buffer_size` is unspecified, default to 262144


Shape dos textos: (32, 100)
Shape dos labels: (32,)
Exemplo de texto: [  11    7    2  240  151    2    1 3187   44  122    1   10   14    4
  548   51   11  366   46    3   10  124  199    9   14    1   54  149
   10  405    2  198    1   13  213   11   29  135   10  364   51    2
 3571 4655  230 2489    6   25    1   49    1 1188   24   30 2518  106
  439  782   21    2  204   16  571    4  653 8257   19   12  153 1389
   70   38   73   38   12   10  153   54  364   40  351   10  321   43
    2  120 1003   32    8   32   30 3758    1   13    2   61  869  151
   14    2]
Exemplo de label: 0


### Exploring tf.data API...

In [50]:
import pandas as pd

def load_csv(csv_file: str, batch_size: int, label: str, shuffle_size: int):
    """Build a shuffled, batched dataset from a CSV file.

    Args:
        csv_file: Path (or pattern) of the CSV file(s) to read.
        batch_size: Number of rows per batch.
        label: Name of the column used as the label (``label_name``).
        shuffle_size: Size of the shuffle buffer (``shuffle_buffer_size``).

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset`` with
        ``num_epochs=1``, ``shuffle=True`` and ``"?"`` passed as ``na_value``.
    """
    reader_options = {
        "batch_size": batch_size,
        "label_name": label,
        "num_epochs": 1,
        "na_value": "?",
        "shuffle": True,
        "shuffle_buffer_size": shuffle_size,
    }
    return tf.data.experimental.make_csv_dataset(csv_file, **reader_options)

### Loading CSV with tf.data

In [53]:
# Build the CSV pipeline: batches of 32 rows, shuffled with a 10_000-row buffer.
# NOTE(review): `time_id` (a timestamp string column) is used as the label here —
# presumably just to exercise the API; confirm before training on it.
data = load_csv(
    csv_file="../tf_data/data.csv",
    batch_size=32,
    label="time_id",
    shuffle_size=10_000
)

# Bare expression so the notebook displays the dataset's element_spec.
data

<_PrefetchDataset element_spec=(OrderedDict([('ping_ms', TensorSpec(shape=(None,), dtype=tf.float32, name=None)), ('temperature_c', TensorSpec(shape=(None,), dtype=tf.int32, name=None)), ('humidity_p', TensorSpec(shape=(None,), dtype=tf.int32, name=None))]), TensorSpec(shape=(None,), dtype=tf.string, name=None))>

### Reading it:

In [56]:
# Print three batches: every feature column by name, then the batch's labels.
for features, labels in data.take(3):
    print("Features: ")
    for k in features:
        v = features[k]
        print(f"{k}: {v.numpy()}")

    print(f"Labels: {labels.numpy()}")

Features: 
ping_ms: [16.95 25.8  18.49 17.11 17.49 19.89 17.32 16.42 38.26 18.72 17.53 18.29
 17.99 32.72 17.97 17.77 28.81 16.9  24.93 29.02 17.22 51.96 22.52 17.38
 28.58 18.37 16.91 16.61 17.84 17.3  16.98 17.14]
temperature_c: [23 25 24 24 24 24 23 22 25 26 22 24 24 28 24 24 26 25 27 27 23 26 25 24
 23 25 25 24 21 25 23 23]
humidity_p: [41 41 42 48 32 47 41 50 33 48 35 48 42 28 42 39 31 34 28 30 48 33 30 31
 35 41 40 42 36 37 45 40]
Labels: [b'2021-10-02 07:19:02' b'2021-10-04 16:00:02' b'2021-10-01 19:34:01'
 b'2021-10-05 12:07:02' b'2021-10-01 12:09:01' b'2021-10-05 21:15:02'
 b'2021-10-02 02:27:02' b'2021-10-04 01:16:02' b'2021-10-03 12:08:02'
 b'2021-10-07 19:53:02' b'2021-10-07 13:48:02' b'2021-10-05 20:50:01'
 b'2021-10-04 07:05:02' b'2021-10-03 13:48:02' b'2021-09-30 21:54:01'
 b'2021-10-01 01:20:02' b'2021-10-03 18:02:01' b'2021-10-07 18:49:01'
 b'2021-10-03 13:29:01' b'2021-10-02 14:03:02' b'2021-10-05 15:50:02'
 b'2021-10-03 12:37:01' b'2021-10-02 23:00:02' b'2021-10-06 1

### Preparing for model:

In [58]:
def prepare_for_model(features, label):
    """Stack the three feature columns of a batch into one float32 tensor.

    Args:
        features: Mapping with the keys ``'ping_ms'``, ``'temperature_c'`` and
            ``'humidity_p'``, each holding one batched column.
        label: Labels of the batch; returned unchanged.

    Returns:
        Tuple ``(feature_tensor, label)`` where ``feature_tensor`` stacks the
        columns along axis 1 in the order ping, temperature, humidity.
    """
    ping = features['ping_ms']
    # The other two columns are cast to float32 before stacking with ping_ms.
    temperature = tf.cast(features['temperature_c'], tf.float32)
    humidity = tf.cast(features['humidity_p'], tf.float32)

    return tf.stack([ping, temperature, humidity], axis=1), label

# Apply the feature-stacking step to every batch of the CSV dataset.
# NOTE(review): the labels are passed through as the `time_id` strings, so the
# target is still a string tensor — confirm that is intended before training.
model_ready_dataset = data.map(prepare_for_model)

In [59]:
# Display the dataset repr to check the stacked feature shape and the label dtype.
model_ready_dataset

<_MapDataset element_spec=(TensorSpec(shape=(None, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>