<a href="https://colab.research.google.com/github/jpbeaud/language/blob/main/essai1_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow
!pip install keras-preprocessing

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


Création & préparation du modèle

In [2]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Définition des paramètres
num_encoder_tokens = 100
num_decoder_tokens = 100
latent_dim = 256

# Encodeur
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]
print(encoder_inputs)
# Décodeur
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Modèle Seq2Seq
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()


<KerasTensor shape=(None, None, 100), dtype=float32, sparse=False, name=keras_tensor>


Préparation des données

In [3]:

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import numpy as np



# Exemples de données (à remplacer par tes propres données)
input_texts = ["bonjour", "salut"]
target_texts = ["hello", "hi"]

# Tokenisation des textes
input_tokenizer = Tokenizer(num_words=num_encoder_tokens)
print("num_encorder_tokens = ", num_encoder_tokens)
print("num_decoder_tokens = ", num_decoder_tokens)
input_tokenizer.fit_on_texts(input_texts)
encoder_input_data = input_tokenizer.texts_to_sequences(input_texts)
encoder_input_data = pad_sequences(encoder_input_data, padding='post')

target_tokenizer = Tokenizer(num_words=num_decoder_tokens)
target_tokenizer.fit_on_texts(target_texts)
decoder_input_data = target_tokenizer.texts_to_sequences(target_texts)
decoder_input_data = pad_sequences(decoder_input_data, padding='post')


num_encorder_tokens =  100
num_decoder_tokens =  100


Entrainement du modèle avec les données préparées

In [4]:
# Convertir les données cibles en une représentation à une dimension
decoder_target_data = np.zeros((len(target_texts), max([len(t) for t in decoder_input_data]), num_decoder_tokens))
for i, seqs in enumerate(decoder_input_data):
    #print(i, seqs)
    for t, token in enumerate(seqs):
        #print(t, token)
        if token.all() > 0:
            decoder_target_data[i, t, token] = 1.0
# Convertir les données sources en une représentation à une dimension
encoder_input_data = np.zeros((len(input_texts), max([len(t) for t in encoder_input_data]), num_encoder_tokens))
for i, seqs in enumerate(encoder_input_data):
    #print(i, seqs)
    for t, token in enumerate(seqs):
        #print(t, token)
        if token.all() > 0:
            encoder_input_data[i, t, token] = 1.0
decoder_input_data = np.zeros((len(input_texts), max([len(t) for t in decoder_input_data]), num_decoder_tokens))
for i, seqs in enumerate(decoder_input_data):
    #print(i, seqs)
    for t, token in enumerate(seqs):
        #print(t, token)
        if token.all() > 0:
            decoder_input_data[i, t, token] = 1.0

# Entraîner le modèle
print(type(encoder_input_data))
print(decoder_input_data.shape)
print(decoder_target_data.shape)
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,batch_size=64,epochs=100,validation_split=0.2)


<class 'numpy.ndarray'>
(2, 1, 100)
(2, 1, 100)
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 4.6052 - val_loss: 4.6074
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 4.5767 - val_loss: 4.6091
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 4.5544 - val_loss: 4.6107
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 4.5339 - val_loss: 4.6122
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - loss: 4.5140 - val_loss: 4.6138
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - loss: 4.4943 - val_loss: 4.6153
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 4.4744 - val_loss: 4.6168
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - loss: 4.4542 - val_loss: 4.6183
Epoch 

<keras.src.callbacks.history.History at 0x791c7883ad70>

Utiliser le modèle pour la prédiction

In [5]:
# Définir les modèles encodeur et décodeur pour la prédiction
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Fonction pour générer les séquences
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_tokenizer.word_index['<start>']] = 1.0

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_char

        if (sampled_char == '<end>' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [h, c]

    return decoded_sentence

# Prédiction pour une nouvelle séquence d'entrée
new_input_seq = encoder_input_data[0:1]
decoded_sentence = decode_sequence(new_input_seq)
print('Decoded sentence:', decoded_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step


KeyError: '<start>'