<a href="https://colab.research.google.com/github/jonathancagua/NLP/blob/main/EX/Desafio_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Datos
El objecto es utilizar datos disponibles del challenge ConvAI2 (Conversational Intelligence Challenge 2) de conversaciones en inglés. Se construirá un BOT para responder a preguntas del usuario (QA).\
[LINK](http://convai.io/data/)

In [50]:
import pandas as pd
import requests
import json

# URL del dataset ConvAI2
url = "http://convai.io/data/summer_wild_evaluation_dialogs.json"

try:
    # Descargar el dataset
    response = requests.get(url)
    response.raise_for_status()  # Lanza excepción si hubo error HTTP
    convai2_json = response.json()
except requests.exceptions.RequestException as e:
    raise SystemExit(f"Error al descargar el dataset: {e}")
except json.JSONDecodeError:
    raise SystemExit("Error al decodificar el JSON recibido.")

# Extraer pares pregunta-respuesta
questions, answers = [], []

for dialog in convai2_json:
    utterances = dialog.get('dialog', [])
    for i in range(len(utterances) - 1):
        q = utterances[i].get('text', '').strip()
        a = utterances[i + 1].get('text', '').strip()
        if q and a:  # Ignorar si alguno está vacío
            questions.append(q)
            answers.append(a)

# Crear DataFrame con los primeros 15,000 pares
df = pd.DataFrame({'question': questions, 'answer': answers})
df = df.iloc[:15000].copy()

# Vista previa
df.head()


Unnamed: 0,question,answer
0,I love iphone! i just bought new iphone!,"Thats good for you, i'm not very into new tech"
1,"Thats good for you, i'm not very into new tech",I am a college student and i am a college student
2,I am a college student and i am a college student,I am go to gym and live on donations
3,I am go to gym and live on donations,I am a vegan and i am in the midwest
4,I am a vegan and i am in the midwest,So vegan... i have dogs maybe i should told th...


In [2]:
!pip install --upgrade nltk



In [51]:
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [52]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

# Diccionario ampliado de contracciones
CONTRACTIONS = {
    "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "we're": "we are", "they're": "they are", "can't": "cannot",
    "won't": "will not", "don't": "do not", "didn't": "did not", "i've": "i have",
    "i'll": "i will", "you'll": "you will", "she'd": "she would", "should've": "should have",
    "there's": "there is", "we'd": "we would", "they'll": "they will", "wasn't": "was not",
    "isn't": "is not", "aren't": "are not", "couldn't": "could not", "wouldn't": "would not",
    "hasn't": "has not", "hadn't": "had not", "we'll": "we will", "they'd": "they would",
    "who's": "who is", "what's": "what is", "let's": "let us", "you've": "you have"
}

# Expande contracciones en el texto
def expand_contractions(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in CONTRACTIONS) + r')\b')
    return pattern.sub(lambda x: CONTRACTIONS[x.group()], text)

# Función de limpieza completa
def clean_text(text):
    text = text.lower()  # Minusculizar
    text = BeautifulSoup(text, "lxml").get_text()  # Eliminar HTML
    text = re.sub(r'http\S+', '', text)  # Eliminar URLs
    text = expand_contractions(text)  # Expandir contracciones
    text = re.sub(r'[^\w\s]', '', text)  # Eliminar puntuación
    text = re.sub(r'\d+', '', text)  # Eliminar números
    text = re.sub(r'\s+', ' ', text).strip()  # Normalizar espacios
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)  # Limitar letras repetidas (coooool → coool)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remover caracteres no ASCII
    text = re.sub(r'[\u2600-\u26FF\u263a-\U0001f645]', ' ', text)  # Remover emojis y símbolos Unicode
    tokens = word_tokenize(text)  # Tokenizar
    return ' '.join(tokens)

# Aplicar limpieza al DataFrame
df['question_clean'] = df['question'].astype(str).apply(clean_text)
df['answer_clean'] = df['answer'].astype(str).apply(clean_text)

# Agregar tokens de inicio y fin de secuencia
df['answer_clean'] = df['answer_clean'].apply(lambda x: '<sos> ' + x.strip() + ' <eos>')

# Filtrado y limpieza final
df = df.dropna(subset=['question_clean', 'answer_clean'])
df = df[(df['question_clean'].str.strip() != '') & (df['answer_clean'].str.strip() != '')]
df = df[df['question_clean'] != df['answer_clean']]
df = df[df['answer_clean'].str.split().apply(len) > 3]
df = df.drop_duplicates(subset=['question_clean', 'answer_clean'])

# Vista previa
print(f"Total de pares tras limpieza: {len(df)}")
display(df[['question_clean', 'answer_clean']].head())


Total de pares tras limpieza: 12647


Unnamed: 0,question_clean,answer_clean
0,i love iphone i just bought new iphone,<sos> thats good for you i am not very into ne...
1,thats good for you i am not very into new tech,<sos> i am a college student and i am a colleg...
2,i am a college student and i am a college student,<sos> i am go to gym and live on donations <eos>
3,i am go to gym and live on donations,<sos> i am a vegan and i am in the midwest <eos>
4,i am a vegan and i am in the midwest,<sos> so vegan i have dogs maybe i should told...


In [53]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parámetros
MAX_VOCAB_SIZE = 6000
MAX_SEQ_LENGTH = 25

# Tokenizadores
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='', lower=True, oov_token='<unk>')
tokenizer_inputs.fit_on_texts(df['question_clean'])
input_sequences = tokenizer_inputs.texts_to_sequences(df['question_clean'])

tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='', lower=True, oov_token='<unk>')
tokenizer_outputs.fit_on_texts(df['answer_clean'])
output_sequences = tokenizer_outputs.texts_to_sequences(df['answer_clean'])

# Diccionarios de vocabulario
word2idx_inputs = tokenizer_inputs.word_index
word2idx_outputs = tokenizer_outputs.word_index

# Asegurarse de respetar el vocabulario limitado
num_words_output = min(MAX_VOCAB_SIZE, len(word2idx_outputs) + 1)

# Longitudes máximas de secuencia
max_input_len = min(MAX_SEQ_LENGTH, max(len(seq) for seq in input_sequences))
max_output_len = min(MAX_SEQ_LENGTH, max(len(seq) for seq in output_sequences))

# Padding (relleno con ceros al final)
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
decoder_input_sequences = pad_sequences(output_sequences, maxlen=max_output_len, padding='post')

# Targets del decoder (one-hot desplazados)
decoder_targets = np.zeros((len(decoder_input_sequences), max_output_len, num_words_output), dtype='float32')

for i, seq in enumerate(decoder_input_sequences):
    for t in range(1, len(seq)):
        word_idx = seq[t]
        if word_idx < num_words_output:
            decoder_targets[i, t - 1, word_idx] = 1.0

# Prints informativos
print(f"Total de pares pregunta-respuesta: {len(df)}")
print(f"Vocabulario input: {min(len(word2idx_inputs), MAX_VOCAB_SIZE)}")
print(f"Vocabulario output: {num_words_output}")
print(f"Longitud máxima input: {max_input_len}")
print(f"Longitud máxima output: {max_output_len}")
print(f"Shape de decoder_targets: {decoder_targets.shape}")

# Ejemplo ilustrativo
idx = 0
print("\n🔎 Ejemplo:")
print(f"Pregunta original: {df['question_clean'].iloc[idx]}")
print(f"Secuencia tokenizada: {input_sequences[idx]}")
print(f"Secuencia padded: {encoder_input_sequences[idx]}")
print(f"Respuesta tokenizada: {output_sequences[idx]}")
print(f"Respuesta padded: {decoder_input_sequences[idx]}")


Total de pares pregunta-respuesta: 12647
Vocabulario input: 4042
Vocabulario output: 4167
Longitud máxima input: 25
Longitud máxima output: 25
Shape de decoder_targets: (12647, 25, 4167)

🔎 Ejemplo:
Pregunta original: i love iphone i just bought new iphone
Secuencia tokenizada: [2, 21, 854, 2, 39, 739, 145, 854]
Secuencia padded: [  2  21 854   2  39 739 145 854   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0]
Respuesta tokenizada: [2, 54, 28, 27, 5, 4, 6, 10, 53, 231, 154, 2231, 3]
Respuesta padded: [   2   54   28   27    5    4    6   10   53  231  154 2231    3    0
    0    0    0    0    0    0    0    0    0    0    0]


In [6]:
pip install pandas requests numpy==1.26.4 gensim==4.3.3 nltk tensorflow==2.16.1




In [54]:
import numpy as np

import gensim.downloader as api
import gensim
from tqdm import tqdm

# Parámetros
EMBEDDING_MODEL = 'fasttext-wiki-news-subwords-300'
EMBEDDING_DIM = 300

def load_fasttext_model():
    print("Cargando modelo FastText...")
    try:
        return api.load(EMBEDDING_MODEL)
    except Exception as e:
        print(f"Error al cargar FastText: {e}")
        return None

def build_embedding_matrix(tokenizer, embedding_model, vocab_size, embedding_dim):
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    found = 0
    missing_words = []

    print("Generando matriz de embeddings...")
    for word, i in tqdm(word_index.items(), total=len(word_index), desc="Procesando palabras"):
        if i >= vocab_size:
            continue
        if word in embedding_model:
            embedding_matrix[i] = embedding_model[word]
            found += 1
        else:
            missing_words.append(word)

    print(f"\n Palabras encontradas: {found}/{len(word_index)}")
    print(f"Palabras faltantes: {len(missing_words)} (ej: {missing_words[:10]})")
    print(f"Dimensión de la matriz: {embedding_matrix.shape}")

    return embedding_matrix

# Ejecución
fasttext_model = load_fasttext_model()
embedding_matrix = build_embedding_matrix(tokenizer_inputs, fasttext_model, MAX_VOCAB_SIZE, EMBEDDING_DIM)


Cargando modelo FastText...
Generando matriz de embeddings...


Procesando palabras: 100%|██████████| 4042/4042 [00:00<00:00, 209306.12it/s]


 Palabras encontradas: 3731/4042
Palabras faltantes: 311 (ej: ['<unk>', 'convai', 'whazzup', 'buongiorno', '_', 'poyou', 'zitah', 'orhun', 'wontice', 'hesnt'])
Dimensión de la matriz: (6000, 300)





### modelo Seq2Seq con atención Luong (dot-product)

In [55]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dot, Activation, Concatenate
from tensorflow.keras.optimizers import Adam

# Parámetros
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embedding = Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM,
                              weights=[embedding_matrix],
                              input_length=max_input_len,
                              trainable=False)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_output_len,))
decoder_embedding_layer = Embedding(num_words_output, latent_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Atención (Luong: dot product entre encoder_outputs y decoder_outputs)
attention = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs])         # (batch, dec_seq, enc_seq)
attention = Activation('softmax')(attention)                             # softmax sobre el encoder sequence
context = Dot(axes=[2,1])([attention, encoder_outputs])                 # contexto ponderado
decoder_combined_context = Concatenate(axis=-1)([context, decoder_outputs])

# Output final
output = Dense(256, activation='tanh')(decoder_combined_context)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(output)

# Modelo final con atención
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilar
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()



In [9]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Parámetros de entrenamiento
BATCH_SIZE = 64
EPOCHS = 30
VALIDATION_SPLIT = 0.2
MODEL_PATH = 'best_model.keras'

# Callback: detener si no mejora val_loss en N épocas
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

# Callback: guardar el mejor modelo basado en val_loss
checkpoint = ModelCheckpoint(
    filepath=MODEL_PATH,
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Entrenamiento
history = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
    callbacks=[early_stop, checkpoint],
    verbose=1
)


Epoch 1/30
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851ms/step - accuracy: 0.6453 - loss: 3.1649
Epoch 1: val_loss improved from inf to 1.75503, saving model to best_model.keras
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 993ms/step - accuracy: 0.6455 - loss: 3.1589 - val_accuracy: 0.7178 - val_loss: 1.7550
Epoch 2/30
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834ms/step - accuracy: 0.7259 - loss: 1.7033
Epoch 2: val_loss improved from 1.75503 to 1.58937, saving model to best_model.keras
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 965ms/step - accuracy: 0.7260 - loss: 1.7030 - val_accuracy: 0.7509 - val_loss: 1.5894
Epoch 3/30
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 849ms/step - accuracy: 0.7496 - loss: 1.5334
Epoch 3: val_loss improved from 1.58937 to 1.48444, saving model to best_model.keras
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 980ms

In [10]:
from google.colab import files

# Descargar el modelo guardado
files.download(MODEL_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#from google.colab import files

# Subir archivos desde tu computadora
#uploaded = files.upload()

### Inferencia

In [56]:
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

In [57]:

decoder_input_single = Input(shape=(1,))
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
encoder_outputs_input = Input(shape=(max_input_len, latent_dim))

decoder_embedding_inf = decoder_embedding_layer(decoder_input_single)


decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding_inf,
    initial_state=[decoder_state_input_h, decoder_state_input_c]
)


attention = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs_input])
attention = Activation('softmax')(attention)
context = Dot(axes=[2, 1])([attention, encoder_outputs_input])
decoder_combined_context = Concatenate(axis=-1)([context, decoder_outputs])

# Salida densa
decoder_tanh = Dense(256, activation='tanh')(decoder_combined_context)
decoder_output_probs = decoder_dense(decoder_tanh)

# Modelo final de inferencia
decoder_model = Model(
    [decoder_input_single, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
    [decoder_outputs, state_h, state_c]
)

In [58]:
reverse_word2idx_outputs = {idx: word for word, idx in word2idx_outputs.items()}


In [59]:
from collections import Counter

def decode_sequence_beam_super(input_seq, beam_width=3, max_repeat=3, min_prob=1e-6, length_penalty_alpha=0.6):
    """
    Decodifica una secuencia de entrada utilizando búsqueda Beam Search con penalización por longitud y control de repeticiones.

    Args:
        input_seq (np.array): Secuencia de entrada para el codificador (encoder).
        beam_width (int): Número de caminos alternativos a mantener en cada paso de decodificación.
        max_repeat (int): Número máximo de veces que una palabra puede repetirse antes de ser eliminada.
        min_prob (float): Probabilidad mínima permitida para considerar una predicción.
        length_penalty_alpha (float): Exponente de penalización por longitud. Un valor mayor favorece secuencias más largas.

    Returns:
        str: Secuencia de palabras decodificada.
    """

    # Ejecuta el codificador y obtiene la salida junto con los estados iniciales
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)

    # Inicializa la lista de secuencias con el token de inicio (<sos>)
    sequences = [([word2idx_outputs['<sos>']], 0.0, h, c)]

    # Itera hasta alcanzar la longitud máxima de salida permitida
    for _ in range(max_output_len):
        all_candidates = []

        # Evalúa cada secuencia actual en el beam
        for seq, score, h, c in sequences:
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = seq[-1]  # Última palabra decodificada

            # Predice el siguiente token y actualiza los estados
            output_tokens, h_new, c_new = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
            output_probs = output_tokens[0, -1, :]  # Distribución de probabilidad para el siguiente token

            # Selecciona los índices con mayor probabilidad (beam search)
            top_indices = output_probs.argsort()[-beam_width:][::-1]

            # Genera nuevos candidatos a partir de los mejores tokens
            for idx in top_indices:
                if idx not in reverse_word2idx_outputs:
                    continue
                word = reverse_word2idx_outputs[idx]
                prob = output_probs[idx]

                if prob < min_prob:
                    continue  # Descarta tokens poco probables

                new_seq = seq + [idx]

                # Si se genera el token de fin (<eos>), se decodifica la secuencia actual
                if word == '<eos>':
                    decoded = [reverse_word2idx_outputs.get(i, "<UNK>") for i in new_seq[1:-1]]  # omite <sos> y <eos>
                    return ' '.join(decoded)

                # Calcula penalización por longitud (para evitar secuencias muy cortas)
                length_penalty = ((5 + len(new_seq)) / 6) ** length_penalty_alpha
                candidate_score = (score - np.log(prob + 1e-10)) / length_penalty

                all_candidates.append((new_seq, candidate_score, h_new, c_new))

        if not all_candidates:
            break  # No hay más candidatos válidos

        # Selecciona los beam_width mejores candidatos
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:beam_width]

    # Si no se encuentra <eos>, se toma la mejor secuencia final
    final_sequence = sequences[0][0][1:]  # omite el <sos>
    decoded_words = [reverse_word2idx_outputs.get(i, "<UNK>") for i in final_sequence]

    # Elimina repeticiones excesivas de palabras
    word_counts = Counter(decoded_words)
    most_common_word, count = word_counts.most_common(1)[0]
    if count > max_repeat:
        decoded_words = list(dict.fromkeys(decoded_words))  # preserva el orden

    # Elimina triples consecutivos (ej: "hola hola hola")
    cleaned = []
    for w in decoded_words:
        if len(cleaned) < 2 or not (w == cleaned[-1] == cleaned[-2]):
            cleaned.append(w)

    return ' '.join(cleaned)


In [70]:
def answer_question_beam_super(question):
    question_clean = clean_text(question)
    seq = tokenizer_inputs.texts_to_sequences([question_clean])
    pad_seq = pad_sequences(seq, maxlen=max_input_len, padding='post')
    return decode_sequence_beam_super(pad_seq)

# Probar
test_questions = [
    "Do you like tv or radio?",
    "Why did you choose to become a vegan?"
]

for q in test_questions:
    print(f"Question: {q}")
    print(f"Answer: {answer_question_beam_super(q)}\n")



🧍 Question: Do you like tv or radio?
🤖 Answer: tv

🧍 Question: Why did you choose to become a vegan?
🤖 Answer: good end at see drive text gym living little

