In [None]:
# Importaciones necesarias
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dropout, Dense, Flatten, LSTM, SimpleRNN, Embedding, Input
from sklearn.model_selection import train_test_split
import os
import gdown
import json

# 1. Datos
def download_dataset():
    """Download ``data_volunteers.json`` from Google Drive unless it already exists.

    The file is written to the current working directory. When a file with
    that name is already present, nothing is downloaded and a notice is printed.
    """
    target = 'data_volunteers.json'
    # Guard clause: a local copy means there is nothing to fetch.
    if os.path.exists(target):
        print("El dataset ya se encuentra descargado")
        return
    source = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    gdown.download(source, target, quiet=False)

def load_dataset(path="data_volunteers.json"):
    """Load the dialog dataset from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``data_volunteers.json``
            in the current working directory, the name used by
            ``download_dataset``.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def clean_text(txt):
    """Normalize a sentence: lowercase it, expand a few contractions, drop punctuation.

    Every run of non-word characters is collapsed into a single space, so the
    result may start or end with a space.

    Args:
        txt: Raw sentence.

    Returns:
        The normalized sentence.
    """
    # (contraction, expansion) pairs, applied one after another in this order.
    expansions = (
        ("\'d", " had"),
        ("\'s", " is"),
        ("\'m", " am"),
        ("don't", "do not"),
    )
    normalized = txt.lower()
    for contraction, expansion in expansions:
        normalized = normalized.replace(contraction, expansion)
    return re.sub(r'\W+', ' ', normalized)

def prepare_data(data, max_len=30):
    """Build (question, answer) training pairs from consecutive dialog turns.

    Each turn is paired with the turn that follows it in the same dialog.
    Both texts are normalized with ``clean_text``; a pair is discarded when
    either normalized text has ``max_len`` or more characters.

    Args:
        data: Iterable of records, each with a ``'dialog'`` list whose items
            carry a ``'text'`` entry.
        max_len: Exclusive upper bound, in characters, for each cleaned text.

    Returns:
        A tuple of three parallel lists: encoder inputs, decoder targets
        (answer followed by ``' <eos>'``) and decoder inputs (answer preceded
        by ``'<sos> '``).
    """
    encoder_texts, decoder_targets, decoder_inputs = [], [], []

    for record in data:
        turns = record['dialog']
        # Walk the dialog as (turn, next turn) pairs.
        for current_turn, next_turn in zip(turns, turns[1:]):
            question = clean_text(current_turn['text'])
            answer = clean_text(next_turn['text'])

            if len(question) < max_len and len(answer) < max_len:
                encoder_texts.append(question)
                decoder_targets.append(answer + ' <eos>')
                decoder_inputs.append('<sos> ' + answer)

    print("Cantidad de rows utilizadas:", len(encoder_texts))
    return encoder_texts, decoder_targets, decoder_inputs

# 2. Preprocesamiento
def preprocess_data(input_sentences, output_sentences, output_sentences_inputs):
    """Tokenize and pad the encoder/decoder sentences.

    Args:
        input_sentences: Encoder sentences.
        output_sentences: Decoder target sentences (ending in ``<eos>``).
        output_sentences_inputs: Decoder input sentences (starting with ``<sos>``).

    Returns:
        A tuple ``(word2idx_inputs, max_input_len, word2idx_outputs, max_out_len,
        num_words_output, encoder_input_sequences, decoder_output_sequences,
        decoder_input_sequences)``. ``num_words_output`` is the output
        vocabulary size plus one, because index 0 is reserved for padding.
    """
    # Encoder vocabulary.
    tokenizer_inputs = Tokenizer()
    tokenizer_inputs.fit_on_texts(input_sentences)
    input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

    # Decoder vocabulary. filters='' keeps the '<' and '>' characters so that
    # '<sos>' and '<eos>' survive as single tokens.
    tokenizer_outputs = Tokenizer(filters='')
    tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
    output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
    output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

    # Encoder sequences keep the default 'pre' padding.
    encoder_input_sequences = pad_sequences(input_sequences)
    # Decoder sequences are padded at the END. At inference time decoding
    # starts directly at '<sos>' from the encoder state; with the default
    # 'pre' padding the training decoder would instead consume a run of
    # padding tokens before ever seeing '<sos>'.
    decoder_output_sequences = pad_sequences(output_sequences, padding='post')
    decoder_input_sequences = pad_sequences(output_sequences_inputs, padding='post')

    # Lookup tables and derived sizes.
    word2idx_inputs = tokenizer_inputs.word_index
    word2idx_outputs = tokenizer_outputs.word_index
    max_input_len = encoder_input_sequences.shape[1]
    max_out_len = decoder_output_sequences.shape[1]
    num_words_output = len(word2idx_outputs) + 1

    return (word2idx_inputs, max_input_len, word2idx_outputs, max_out_len, num_words_output,
            encoder_input_sequences, decoder_output_sequences, decoder_input_sequences)


In [None]:
embedding_dim = 100
embeddings_index = {}
MAX_VOCAB_SIZE = 6000
# Download the pickled embeddings from Google Drive (the fastest option).
# NOTE: there is no guarantee that these links stay available; if they are
# gone, download the embeddings from the official page as explained in the
# next code block.
import os
import gdown
if os.access('gloveembedding.pkl', os.F_OK):
    print("Los embeddings gloveembedding.pkl ya están descargados")
else:
    output = 'gloveembedding.pkl'
    url = 'https://drive.google.com/uc?id=1KY6avD5I1eI2dxQzMkR3WExwKwRq2g94&export=download'
    gdown.download(url, output, quiet=False)

Los embeddings gloveembedding.pkl ya están descargados


In [None]:
import logging
import os
from pathlib import Path
from io import StringIO
import pickle
import numpy as np

class WordsEmbeddings(object):
    """Lookup table of pre-trained word vectors stored in a NumPy structured array.

    Each row of the table has a ``word`` field and an ``embedding`` field. The
    table is loaded from the pickle at ``PKL_PATH`` when that file exists;
    otherwise it is built from the text file at ``WORD_TO_VEC_MODEL_TXT_PATH``
    and the pickle is written for later runs.

    Subclasses must define the class attributes ``WORD_TO_VEC_MODEL_TXT_PATH``,
    ``PKL_PATH``, ``N_FEATURES`` (vector length) and ``WORD_MAX_SIZE`` (maximum
    number of characters stored per word).
    """
    logger = logging.getLogger(__name__)

    def __init__(self):
        """Load the embeddings table and build the word <-> row-index maps.

        Raises:
            AssertionError: If neither the pickle nor the text file exists.
        """
        words_embedding_pkl = Path(self.PKL_PATH)
        if words_embedding_pkl.is_file():
            embeddings = self.load_model_from_pickle()
        else:
            words_embedding_txt = Path(self.WORD_TO_VEC_MODEL_TXT_PATH)
            assert words_embedding_txt.is_file(), 'Words embedding not available'
            embeddings = self.convert_model_to_pickle()
        self.embeddings = embeddings
        # Dictionaries translating between a word and its row index.
        index = np.arange(self.embeddings.shape[0])
        self.word2idx = dict(zip(self.embeddings['word'], index))
        self.idx2word = dict(zip(index, self.embeddings['word']))

    def get_words_embeddings(self, words):
        """Return the embedding vectors for ``words``.

        Args:
            words: A sequence of words, or a single ``str`` (treated as one word).

        Returns:
            Array of shape ``(len(words), vector_length)``. Unknown words map
            to index -1, i.e. the LAST row of the table; in a table built by
            ``convert_model_to_pickle`` that row is the all-zero
            ``null_embedding``.
            NOTE(review): a pre-built pickle is presumably laid out the same
            way -- confirm for downloaded files.
        """
        words_idxs = self.words2idxs(words)
        return self.embeddings[words_idxs]['embedding']

    def words2idxs(self, words):
        """Map words to row indices (-1 for unknown words).

        A bare ``str`` is treated as a single word rather than being iterated
        character by character.
        """
        if isinstance(words, str):
            words = [words]
        # An explicit integer dtype keeps an empty result usable as an index
        # array (np.array([]) would default to float64).
        return np.array([self.word2idx.get(word, -1) for word in words], dtype=np.int64)

    def idxs2words(self, idxs):
        """Map row indices back to words; unknown indices become the string ``'-1'``."""
        return np.array([self.idx2word.get(idx, '-1') for idx in idxs])

    def load_model_from_pickle(self):
        """Read the embeddings table from the pickle at ``PKL_PATH``.

        SECURITY: ``pickle.loads`` can execute arbitrary code. Only load
        pickle files that come from a source you trust.
        """
        self.logger.debug(
            'loading words embeddings from pickle {}'.format(
                self.PKL_PATH
            )
        )
        # Read the file in chunks of at most 2**28 - 1 bytes (~256MB).
        # NOTE(review): presumably a workaround for platforms that fail on
        # very large single reads -- confirm.
        max_bytes = 2**28 - 1
        bytes_in = bytearray(0)
        input_size = os.path.getsize(self.PKL_PATH)
        with open(self.PKL_PATH, 'rb') as f_in:
            for _ in range(0, input_size, max_bytes):
                bytes_in += f_in.read(max_bytes)
        embeddings = pickle.loads(bytes_in)
        self.logger.debug('words embeddings loaded')
        return embeddings

    def convert_model_to_pickle(self):
        """Build the table from the text file and cache it at ``PKL_PATH``.

        The result is a NumPy structured array:

            word     embedding
            U<size>  np.float32[N_FEATURES]
            word_1   a, b, c
            ...
            word_n   g, h, i

        Lines whose number of values differs from ``N_FEATURES`` are skipped,
        and an all-zero ``null_embedding`` row is appended at the end.
        """
        self.logger.debug(
            'converting and loading words embeddings from text file {}'.format(
                self.WORD_TO_VEC_MODEL_TXT_PATH
            )
        )
        structure = [('word', np.dtype('U' + str(self.WORD_MAX_SIZE))),
                     ('embedding', np.float32, (self.N_FEATURES,))]
        structure = np.dtype(structure)

        def iter_rows(lines):
            # Split each line once; keep only rows with the expected width.
            for line in lines:
                parts = line.split()
                if len(parts[1:]) == self.N_FEATURES:
                    yield parts[0], parts[1:]

        # Stream the rows from disk through a generator.
        with open(self.WORD_TO_VEC_MODEL_TXT_PATH, encoding="utf8") as words_embeddings_txt:
            embeddings = np.fromiter(iter_rows(words_embeddings_txt), structure)
        # Append a null embedding as the last row (the target of index -1).
        null_embedding = np.array(
            [('null_embedding', np.zeros((self.N_FEATURES,), dtype=np.float32))],
            dtype=structure
        )
        embeddings = np.concatenate([embeddings, null_embedding])
        # Dump the array to disk in chunks of at most 2**28 - 1 bytes (~256MB).
        max_bytes = 2**28 - 1
        bytes_out = pickle.dumps(embeddings, protocol=pickle.HIGHEST_PROTOCOL)
        with open(self.PKL_PATH, 'wb') as f_out:
            for idx in range(0, len(bytes_out), max_bytes):
                f_out.write(bytes_out[idx:idx+max_bytes])
        self.logger.debug('words embeddings loaded')
        return embeddings

class GloveEmbeddings(WordsEmbeddings):
    """WordsEmbeddings configured for the file 'glove.twitter.27B.50d.txt'."""
    # Raw text embeddings file; read only when the pickle below is missing.
    WORD_TO_VEC_MODEL_TXT_PATH = 'glove.twitter.27B.50d.txt'
    # Pickle of the embeddings table; loaded when present, written otherwise.
    PKL_PATH = 'gloveembedding.pkl'
    # Length of each embedding vector.
    N_FEATURES = 50
    # Maximum number of characters stored per word in the table.
    WORD_MAX_SIZE = 60

class FasttextEmbeddings(WordsEmbeddings):
    """WordsEmbeddings configured for the file 'cc.en.300.vec'."""
    # Raw text embeddings file; read only when the pickle below is missing.
    WORD_TO_VEC_MODEL_TXT_PATH = 'cc.en.300.vec'
    # Pickle of the embeddings table; loaded when present, written otherwise.
    PKL_PATH = 'fasttext.pkl'
    # Length of each embedding vector.
    N_FEATURES = 300
    # Maximum number of characters stored per word in the table.
    WORD_MAX_SIZE = 60

In [None]:

# 3. Preparar los embeddings
def prepare_embeddings(word2idx, embedding_dim=100, model_embeddings=None, max_vocab_size=None):
    """Build the embedding matrix for a tokenizer vocabulary.

    Row ``i`` of the result holds the pre-trained vector of the word whose
    index is ``i``. Row 0 (the padding index) and the rows of words missing
    from the pre-trained table stay all-zero.

    Args:
        word2idx: Mapping word -> integer index (1-based, as produced by the
            tokenizer's ``word_index``).
        embedding_dim: Requested vector size. The matrix width always comes
            from ``model_embeddings.N_FEATURES``; a warning is printed when
            the two differ.
        model_embeddings: Object exposing ``N_FEATURES`` and
            ``get_words_embeddings(words)``. Defaults to ``GloveEmbeddings()``.
        max_vocab_size: Upper bound on the number of rows. Defaults to the
            module-level ``MAX_VOCAB_SIZE``.

    Returns:
        Array of shape ``(min(max_vocab_size, len(word2idx) + 1), N_FEATURES)``.
    """
    if model_embeddings is None:
        model_embeddings = GloveEmbeddings()
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB_SIZE

    print('preparing embedding matrix...')
    embed_dim = model_embeddings.N_FEATURES
    if embedding_dim != embed_dim:
        print('warning: embedding_dim={} was requested but the embeddings have {} '
              'features; using {}'.format(embedding_dim, embed_dim, embed_dim))

    # Indices are 1-based (0 is padding), so len(word2idx) + 1 rows are needed
    # to hold the whole vocabulary.
    nb_words = min(max_vocab_size, len(word2idx) + 1)
    embedding_matrix = np.zeros((nb_words, embed_dim))

    # Keep only the words whose index fits in the matrix, then fetch all their
    # vectors in a single lookup. Words are passed as a list: a bare string
    # could be iterated character by character by the lookup.
    kept = [(word, i) for word, i in word2idx.items() if i < nb_words]
    if kept:
        words, rows = zip(*kept)
        embedding_matrix[list(rows)] = model_embeddings.get_words_embeddings(list(words))

    # Counts every all-zero row, i.e. unknown words plus the padding row 0.
    print('number of null word embeddings:', np.sum(np.sum(embedding_matrix**2, axis=1) == 0))

    return embedding_matrix


In [None]:

# 4. Entrenar el modelo
def create_model(num_encoder_tokens, num_decoder_tokens, embedding_dim, latent_dim):
    """Build and compile the encoder-decoder LSTM training model.

    Args:
        num_encoder_tokens: Size of the encoder embedding vocabulary.
        num_decoder_tokens: Size of the decoder embedding vocabulary and of
            the softmax output.
        embedding_dim: Width of both embedding layers.
        latent_dim: Number of units in both LSTM layers.

    Returns:
        A compiled model taking ``[encoder token ids, decoder token ids]`` and
        producing a softmax over ``num_decoder_tokens`` per decoder step. The
        layers are named ``encoder_embedding``, ``encoder_lstm``,
        ``decoder_embedding``, ``decoder_lstm`` and ``decoder_dense``.
    """
    # --- Encoder: only the final LSTM states are kept ---
    enc_tokens = Input(shape=(None,))
    enc_embed_layer = Embedding(num_encoder_tokens, embedding_dim, name='encoder_embedding')
    enc_vectors = enc_embed_layer(enc_tokens)
    enc_lstm_layer = LSTM(latent_dim, return_state=True, name='encoder_lstm')
    _, enc_h, enc_c = enc_lstm_layer(enc_vectors)

    # --- Decoder: starts from the encoder's final states ---
    dec_tokens = Input(shape=(None,))
    dec_embed_layer = Embedding(num_decoder_tokens, embedding_dim, name='decoder_embedding')
    dec_vectors = dec_embed_layer(dec_tokens)
    dec_lstm_layer = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    dec_hidden_seq, _, _ = dec_lstm_layer(dec_vectors, initial_state=[enc_h, enc_c])
    softmax_layer = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
    token_probs = softmax_layer(dec_hidden_seq)

    seq2seq = Model([enc_tokens, dec_tokens], token_probs)
    seq2seq.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    return seq2seq

def train_model(model, encoder_input_data, decoder_input_data, decoder_target_data, batch_size, epochs):
    """Fit the seq2seq model, holding out 20% of the samples for validation.

    Args:
        model: Model exposing ``fit``.
        encoder_input_data: Encoder inputs.
        decoder_input_data: Decoder inputs.
        decoder_target_data: Decoder targets.
        batch_size: Samples per gradient update.
        epochs: Number of passes over the training data.

    Returns:
        Whatever ``model.fit`` returns (for Keras models, the ``History``
        object with the per-epoch losses), so callers can inspect or plot the
        training curves.
    """
    return model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_split=0.2)

# 5. Inferencia
def create_inference_models(model):
    """Split a trained seq2seq model into stand-alone encoder and decoder models.

    Both models reuse the layers of ``model`` (looked up by name), so they
    share its weights.

    Args:
        model: Training model with layers named ``encoder_embedding``,
            ``encoder_lstm``, ``decoder_embedding``, ``decoder_lstm`` and
            ``decoder_dense``, and with the encoder tokens as first input and
            the decoder tokens as second input.

    Returns:
        ``(encoder_model, decoder_model)``. The encoder maps token ids to the
        LSTM states ``[h, c]``. The decoder maps ``[token ids, h, c]`` to
        ``[token probabilities, new h, new c]``.
    """
    # --- Encoder: token ids -> final LSTM states ---
    enc_tokens = model.input[0]
    enc_vectors = model.get_layer('encoder_embedding')(enc_tokens)
    _, enc_h, enc_c = model.get_layer('encoder_lstm')(enc_vectors)
    encoder_model = Model(enc_tokens, [enc_h, enc_c])

    # --- Decoder: one step driven by externally supplied states ---
    dec_tokens = model.input[1]
    dec_lstm = model.get_layer('decoder_lstm')
    state_in_h = Input(shape=(dec_lstm.units,))
    state_in_c = Input(shape=(dec_lstm.units,))
    state_inputs = [state_in_h, state_in_c]

    dec_vectors = model.get_layer('decoder_embedding')(dec_tokens)
    dec_hidden_seq, state_out_h, state_out_c = dec_lstm(dec_vectors, initial_state=state_inputs)
    token_probs = model.get_layer('decoder_dense')(dec_hidden_seq)
    decoder_model = Model(
        [dec_tokens] + state_inputs,
        [token_probs, state_out_h, state_out_c])

    return encoder_model, decoder_model

# Función para decodificar
def decode_sequence(input_seq, encoder_model, decoder_model, max_decoder_seq_length, num_decoder_tokens,
                    word2idx=None, idx2word=None):
    """Greedily decode a reply for one encoded input sequence.

    Args:
        input_seq: Encoder input of shape ``(1, timesteps)``.
        encoder_model: Model whose ``predict`` returns the states ``[h, c]``.
        decoder_model: Model whose ``predict`` takes ``[token, h, c]`` and
            returns ``(token probabilities, h, c)``.
        max_decoder_seq_length: Maximum number of decoding STEPS (tokens).
        num_decoder_tokens: Unused; kept for backward compatibility.
        word2idx: Output vocabulary (word -> index). Defaults to the
            module-level ``word2idx_outputs``.
        idx2word: Inverse output vocabulary. Defaults to the module-level
            ``idx2word_outputs``.

    Returns:
        The decoded words, each preceded by a space. The ``<eos>`` token is
        included when it was produced.
    """
    # Fall back to the notebook-level vocabularies when none are passed in.
    if word2idx is None:
        word2idx = word2idx_outputs
    if idx2word is None:
        idx2word = idx2word_outputs

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx['<sos>']

    decoded_words = []
    # Bound the loop by the number of generated tokens (not by the number of
    # characters), which also guarantees termination.
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + list(states_value), verbose=0)

        # Greedy choice of the next token.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding index and has no word; skip it instead of
        # failing with a KeyError.
        sampled_word = idx2word.get(sampled_token_index)
        if sampled_word is not None:
            decoded_words.append(sampled_word)
        if sampled_word == '<eos>':
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

# Ejecución principal
# End-to-end run: load data, preprocess, build embeddings, train, decode one sample.
# The names assigned here are module-level and are reused by later cells.
if __name__ == "__main__":
    # 1. Data
    download_dataset()
    data = load_dataset()
    input_sentences, output_sentences, output_sentences_inputs = prepare_data(data)

    # 2. Preprocessing
    (word2idx_inputs, max_input_len, word2idx_outputs, max_out_len, num_words_output,
     encoder_input_sequences, decoder_output_sequences, decoder_input_sequences) = preprocess_data(
        input_sentences, output_sentences, output_sentences_inputs)

    # 3. Prepare the embeddings
    # NOTE(review): embedding_dim is 100 here, while GloveEmbeddings declares
    # N_FEATURES = 50 and prepare_embeddings sizes its matrix from N_FEATURES.
    embedding_dim = 100
    # NOTE(review): neither matrix is passed to create_model below, so the
    # model's Embedding layers do not use the pre-trained vectors.
    embedding_matrix_inputs = prepare_embeddings(word2idx_inputs, embedding_dim)
    embedding_matrix_outputs = prepare_embeddings(word2idx_outputs, embedding_dim)

    # 4. Train the model
    latent_dim = 256
    # +1 on the encoder vocabulary because index 0 is reserved for padding
    # (num_words_output already includes it).
    model = create_model(len(word2idx_inputs) + 1, num_words_output, embedding_dim, latent_dim)

    # Convert the decoder target sequences to one-hot vectors, as required by
    # the categorical_crossentropy loss used in create_model.
    decoder_target_data = tf.keras.utils.to_categorical(decoder_output_sequences, num_classes=num_words_output)

    train_model(model, encoder_input_sequences, decoder_input_sequences, decoder_target_data,
                batch_size=64, epochs=50)

    # 5. Inference
    encoder_model, decoder_model = create_inference_models(model)

    # Inverse dictionary (index -> word) for the outputs. decode_sequence
    # looks up word2idx_outputs and idx2word_outputs as module-level names, so
    # both must exist before it is called.
    idx2word_outputs = {i: word for word, i in word2idx_outputs.items()}

    # Inference example: decode the first training sentence.
    input_seq = encoder_input_sequences[0:1]
    decoded_sentence = decode_sequence(input_seq, encoder_model, decoder_model, max_out_len, num_words_output)
    print('Input sentence:', input_sentences[0])
    print('Decoded sentence:', decoded_sentence)

El dataset ya se encuentra descargado
Cantidad de rows utilizadas: 5985
preparing embedding matrix...
number of null word embeddings: 38
preparing embedding matrix...
number of null word embeddings: 38
Epoch 1/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 365ms/step - loss: 4.1207 - val_loss: 2.1157
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 358ms/step - loss: 2.0164 - val_loss: 1.9685
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 361ms/step - loss: 1.8138 - val_loss: 1.8402
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 359ms/step - loss: 1.6680 - val_loss: 1.7345
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 358ms/step - loss: 1.5589 - val_loss: 1.6741
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 359ms/step - loss: 1.4535 - val_loss: 1.6214
Epoch 7/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s

In [None]:
def generate_inference(input_string, encoder_model, decoder_model, word2idx_inputs, word2idx_outputs, idx2word_outputs, max_out_len, max_input_len=None):
    """Greedily decode a reply for a free-form input sentence.

    The sentence is normalized with ``clean_text`` (the same normalization
    applied to the training data), so punctuation attached to a word no
    longer turns it into an unknown token.

    Args:
        input_string: Raw user sentence.
        encoder_model: Model whose ``predict`` returns the states ``[h, c]``.
        decoder_model: Model whose ``predict`` takes ``[token, h, c]`` and
            returns ``(token probabilities, h, c)``.
        word2idx_inputs: Encoder vocabulary (word -> index).
        word2idx_outputs: Decoder vocabulary; must contain ``'<sos>'``.
        idx2word_outputs: Inverse decoder vocabulary (index -> word).
        max_out_len: Maximum number of decoding steps (tokens).
        max_input_len: Optional length to pad/truncate the input to. Defaults
            to the encoder's declared input length, which may be ``None``
            (no padding).

    Returns:
        The decoded words joined by single spaces, without ``<eos>``.
    """
    # Tokenize like the training pipeline: normalize, split on whitespace and
    # drop out-of-vocabulary words instead of mapping them to the padding index.
    tokens = clean_text(input_string).split()
    input_ids = [word2idx_inputs[token] for token in tokens if token in word2idx_inputs]
    if not input_ids:
        # Nothing recognizable: feed a single padding token so the encoder
        # still receives one timestep.
        input_ids = [0]

    maxlen = max_input_len if max_input_len is not None else encoder_model.input_shape[1]
    # Default ('pre') padding, matching how preprocess_data pads the encoder
    # inputs used for training.
    input_seq = pad_sequences([input_ids], maxlen=maxlen)

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']

    decoded_words = []
    # At most max_out_len decoding steps; this also guarantees termination.
    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + list(states_value), verbose=0)

        # Greedy choice of the next token.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding index and has no word; .get avoids a KeyError.
        sampled_word = idx2word_outputs.get(sampled_token_index)

        if sampled_word == '<eos>':
            break
        if sampled_word is not None:
            decoded_words.append(sampled_word)

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)


In [None]:
# Usage example: decode a reply for a free-form sentence with the trained models.
input_string = "Hello, can you help me?"
inference_result = generate_inference(
    input_string=input_string,
    encoder_model=encoder_model,
    decoder_model=decoder_model,
    word2idx_inputs=word2idx_inputs,
    word2idx_outputs=word2idx_outputs,
    idx2word_outputs=idx2word_outputs,
    max_out_len=max_out_len,
)
print('Input sentence:', input_string)
print('Decoded sentence:', inference_result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Input sentence: Hello, can you help me?
Decoded sentence: i am a vegan coach
