### Uso de tokenizer

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Small toy corpus used to show how the Tokenizer builds its vocabulary.
sentences = ['I love my dog', 'I love my cat', 'You love my dog']

# num_words caps the vocabulary used when texts are converted to sequences
# (see the Keras Tokenizer documentation for the exact rule).
tokenizer = Tokenizer(num_words=100)

# Scan the corpus and assign an integer index to every distinct word.
tokenizer.fit_on_texts(sentences)

# word_index is the resulting mapping from (lower-cased) word to index.
word_index = tokenizer.word_index
print(word_index)

2024-04-08 12:29:05.753319: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-08 12:29:05.968247: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-08 12:29:05.968292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-08 12:29:06.002182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-08 12:29:06.080954: I tensorflow/core/platform/cpu_feature_guar

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [2]:
# Look up the integer index the fitted tokenizer assigned to the word 'cat'.
word_index['cat']

5

### Text to Sequences

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus; the last sentence introduces new words and a question mark.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# Fit a fresh tokenizer so that every distinct word receives an index.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each sentence as the list of indices of its words.
sequences = tokenizer.texts_to_sequences(sentences)

# Show the vocabulary first, then the encoded sentences.
print(word_index)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [4]:
# Check which container type texts_to_sequences returned.
type(sequences)

list

In [5]:
# Sentences containing words ('really', 'loves', 'manatee') that were not
# part of the corpus the tokenizer was fitted on.
test_data = ['i really love my dog', 'my dog loves my manatee']

# This tokenizer has no OOV token, so unknown words are simply left out of
# the encoded output and the sequences come out shorter than the sentences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


### Out-of-vocabulary tokens: handling words that are not found in the `word_index` dictionary
#### oov : out-of-vocabulary

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus used to fit the tokenizer.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# oov_token reserves a vocabulary entry for words the tokenizer has not seen.
# NOTE(review): the literal below is spelled with zeros ("<00V>"), not with the
# letters "OOV"; it is left unchanged so that it matches the recorded outputs.
tokenizer = Tokenizer(num_words=100, oov_token="<00V>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the training corpus itself.
sequences = tokenizer.texts_to_sequences(sentences)

# Sentences with words that are missing from the fitted vocabulary.
test_data = ['i really love my dog', 'my dog loves my manatee']

# Unknown words are now encoded with the OOV index instead of being dropped,
# so every encoded sentence keeps its original number of tokens.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [7]:
# Display the full word-to-index mapping, including the OOV token entry.
word_index

{'<00V>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

### Padding 
### para uniformizar la longitud de la data

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus; the last sentence is longer than the others on purpose.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# Fit the tokenizer (with an OOV entry) and encode the corpus.
tokenizer = Tokenizer(num_words=100, oov_token="<00V>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# Force every sequence to exactly 5 tokens:
#   padding='post'   -> zeros are appended to the end of short sequences
#   truncating='pre' -> long sequences lose tokens from their beginning
# Calling pad_sequences(sequences) with no options is the simpler alternative.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='pre',
)

# Report the vocabulary, the raw sequences and the padded matrix.
print("\n Word Index =", word_index)
print("\n Sequences =", sequences)
print("\n Padded Sequences :")
print(padded)


 Word Index = {'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

 Sequences = [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

 Padded Sequences :
[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 9  2  4 10 11]]


In [9]:
# Inspect the first row of the padded result.
padded[0]

array([5, 3, 2, 4, 0], dtype=int32)

### Word Embeddings (assigns each word a vector that is learned from the sentiment of the reviews it appears in)
##### IMDB reviews


In [10]:
import tensorflow as tf
# Print the installed TensorFlow version for reference.
print(tf.__version__)

2.15.0


In [11]:
import tensorflow_datasets as tfds
# Load the IMDB reviews dataset.
# with_info=True makes tfds.load return the dataset metadata as a second value;
# as_supervised=True yields each example as a (text, label) pair.
imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

In [12]:
# Report how many examples the train and test splits contain.
# To inspect `info` or `imdb`, put the name alone on the last line of a cell:
# a bare expression anywhere else in a cell is evaluated but never displayed.
print(len(imdb['train']))
print(len(imdb['test']))

25000
25000


In [13]:
# Data is split into 25000 samples for training and 25000 for testing

import numpy as np
# Keep a handle on each split of the loaded dataset.
train_data = imdb['train']
test_data = imdb['test']

In [14]:
# Plain Python containers for the raw review texts and their labels.
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

# Each training example is a (text, label) pair of TensorFlow tensors:
#   .numpy()        converts the tensor to a NumPy value
#   .decode('utf8') turns the resulting bytes into a regular Python string
for sentence, label in train_data:
    training_sentences.append(sentence.numpy().decode('utf8'))
    training_labels.append(label.numpy())


In [15]:
# Preview the first five training reviews together with their labels.
# Pairing the two slices with zip avoids manual indexing and also works when
# fewer than five examples are available.
for sentence, label in zip(training_sentences[:5], training_labels[:5]):
    print(sentence, label)

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it. 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was co

In [16]:
# Collect the test split into plain Python lists.
# The lists are (re)created here so that re-running this cell cannot append a
# second copy of the test set to lists left over from an earlier execution.
testing_sentences = []
testing_labels = []

# Each test example is a (text, label) pair of TensorFlow tensors.
for sentence, label in test_data:
    testing_sentences.append(sentence.numpy().decode('utf8'))
    testing_labels.append(label.numpy())

In [17]:
# Preview the first five test reviews together with their labels.
for idx in range(5):
    print(testing_sentences[idx], testing_labels[idx])

There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come. 1
A blackly comic tale of a down-trodden priest, Nazarin sho

In [18]:
# Convert the label lists to NumPy arrays; these arrays are what is later
# passed to model.fit as training targets and validation targets.
training_labels_final = np.asarray(training_labels)
testing_labels_final = np.asarray(testing_labels)

# Show the container type before and after the conversion.
print(type(testing_labels))
print(type(testing_labels_final))

<class 'list'>
<class 'numpy.ndarray'>


In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 10000     # vocabulary size given to the tokenizer and the embedding layer
embedding_dim = 16     # length of each word vector in the embedding layer
max_length = 120       # every review is padded/truncated to this many tokens
trunc_type = 'post'    # reviews that are too long lose tokens from their end
oov_tok = "<OOV>"      # placeholder token for words outside the vocabulary

# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Mapping between each word and its integer index.
word_index = tokenizer.word_index

# Encode the training reviews as integer sequences of uniform length.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Apply the same two operations to the test reviews. The truncation side is
# passed explicitly so that test reviews are cut exactly like training reviews;
# without it pad_sequences falls back to its default ('pre') and the model
# would be validated on a different part of long reviews than it was trained on.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [21]:
# Check the container type of the tokenizer's word index.
type(word_index)

dict

In [22]:
# Binary sentiment classifier, assembled one layer at a time.
model = tf.keras.Sequential()

# Embedding: maps each integer token to a dense vector. Its output is a 3D
# tensor of shape (batch_size, sequence_length, embedding_dim).
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# Flatten: reshapes that 3D output into a single vector per sample.
# (tf.keras.layers.GlobalAveragePooling1D() could be used here instead.)
model.add(tf.keras.layers.Flatten())

# Hidden fully-connected layer with 6 ReLU neurons.
model.add(tf.keras.layers.Dense(6, activation='relu'))

# Output layer: one sigmoid neuron.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [23]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171533 (670.05 KB)
Trainable params: 171533 (670.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
# Number of full passes over the training data.
num_epochs = 10

# Train on the padded training reviews and evaluate on the padded test reviews
# after every epoch. The call is left as the cell's last expression so the
# returned History object is shown as the cell output.
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x734884530a00>