# 1 - Embedding

Agora, em vez de substituir cada palavra por um número, iremos identificar as palavras como vetores em um espaço de n-dimensões.

O processo de Embedding é fazer como um conjunto de palavras ou palavras associadas formem vetores em um espaço multidimensional

Nesse semana de curso, será usado o dataset do IMDB, para podermos construir um classificador de críticas em relação a um filme.

In [1]:
#!pip install -q tensorflow-datasets

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
#print(tf.__version__)

In [3]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [4]:
train_data, test_data = imdb['train'], imdb['test']

In [5]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

for l,s in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())


for l,s in test_data:
    testing_sentences.append(str(s.numpy()))
    testing_labels.append(l.numpy())

In [8]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

### Configurações:

In [22]:
vocab_size = 10000
embeding_dim = 16
max_lenght = 1000
trunc_type = 'post'
oov_tok = 'XXXXXX'

### Imports

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
tokenizer = Tokenizer(  num_words=vocab_size,
                        oov_token= oov_tok)

tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences( sequences, 
                        maxlen=max_lenght,
                        truncating = trunc_type)

In [18]:
# Tokens do vocabulário no exame de teste.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences( testing_sequences,
                                maxlen = max_lenght)

### Definindo a Rede Neural

Model 1

In [23]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length = max_lenght), # Aqui é onde a magia realmente acontece
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

In [30]:
model.compile(  loss = 'binary_crossentropy',
                optimizer = 'adam',
                metrics = ['accuracy'])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 16)          160000    
                                                                 
 flatten (Flatten)           (None, 16000)             0         
                                                                 
 dense (Dense)               (None, 6)                 96006     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 256,013
Trainable params: 256,013
Non-trainable params: 0
_________________________________________________________________


Model 2

In [26]:
model_2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length = max_lenght), # Aqui é onde a magia realmente acontece
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

In [40]:
model_2.compile(loss = 'binary_crossentropy',
                optimizer = 'adam',
                metrics = ['accuracy'])

In [41]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1000, 16)          160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 6)                 102       
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


### Treino e teste

Model 1

In [32]:
num_epochs = 10

In [44]:
model.fit(  padded,
            training_labels_final,
            epochs = num_epochs,
            validation_data = (testing_padded, testing_labels_final))

Epoch 1/10


UnimplementedError:  Cast string to float is not supported
	 [[node binary_crossentropy/Cast
 (defined at c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\losses.py:1797)
]] [Op:__inference_train_function_101066]

Errors may have originated from an input operation.
Input Source operations connected to node binary_crossentropy/Cast:
In[0] ExpandDims (defined at c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\compile_utils.py:677)

Operation defined at: (most recent call last)
>>>   File "c:\users\kaisson.ferreira\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "c:\users\kaisson.ferreira\anaconda3\lib\runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
>>>     app.start()
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
>>>     self.io_loop.start()
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "c:\users\kaisson.ferreira\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
>>>     self._run_once()
>>> 
>>>   File "c:\users\kaisson.ferreira\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
>>>     handle._run()
>>> 
>>>   File "c:\users\kaisson.ferreira\anaconda3\lib\asyncio\events.py", line 81, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\kernelbase.py", line 461, in dispatch_queue
>>>     await self.process_one()
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\kernelbase.py", line 450, in process_one
>>>     await dispatch(*args)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\kernelbase.py", line 357, in dispatch_shell
>>>     await result
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\kernelbase.py", line 652, in execute_request
>>>     reply_content = await reply_content
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
>>>     return super().run_cell(*args, **kwargs)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\interactiveshell.py", line 2768, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\interactiveshell.py", line 2814, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\interactiveshell.py", line 3012, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
>>>     if await self.run_code(code, result, async_=asy):
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\IPython\core\interactiveshell.py", line 3251, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "C:\Users\kaisson.ferreira\AppData\Local\Temp\ipykernel_15896\2901008831.py", line 1, in <module>
>>>     model.fit(  padded,
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\training.py", line 809, in train_step
>>>     loss = self.compiled_loss(
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
>>>     loss_value = loss_obj(y_t, y_p, sample_weight=sw)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\losses.py", line 141, in __call__
>>>     losses = call_fn(y_true, y_pred)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\losses.py", line 245, in call
>>>     return ag_fn(y_true, y_pred, **self._fn_kwargs)
>>> 
>>>   File "c:\estudo_python\NLP\curso_nlp\lib\site-packages\keras\losses.py", line 1797, in binary_crossentropy
>>>     y_true = tf.cast(y_true, y_pred.dtype)
>>> 