<a href="https://colab.research.google.com/github/harshith-clg/nlp_07/blob/main/Nlp7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

(a) Data Preprocessing



In [4]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy parallel corpus: each entry is an (English, French) sentence pair
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I am fine", "je vais bien"),
    ("what is your name", "comment tu t'appelles"),
    ("my name is", "je m'appelle"),
    ("thank you", "merci"),
    ("goodbye", "au revoir"),
]

# One tuple of sentences per language, in corpus order
english_texts = tuple(pair[0] for pair in data)
french_texts = tuple(pair[1] for pair in data)

# English side: fit a word-level tokenizer and encode every sentence as word ids.
# The vocabulary size is one larger than the number of distinct words.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_texts)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_sequences = eng_tokenizer.texts_to_sequences(english_texts)
eng_max_length = max(map(len, eng_sequences))

# French side: same procedure with an independent tokenizer
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_texts)
fr_vocab_size = 1 + len(fr_tokenizer.word_index)
fr_sequences = fr_tokenizer.texts_to_sequences(french_texts)
fr_max_length = max(map(len, fr_sequences))

# Right-pad every sentence to the longest sentence of its language
eng_sequences = pad_sequences(eng_sequences, padding='post', maxlen=eng_max_length)
fr_sequences = pad_sequences(fr_sequences, padding='post', maxlen=fr_max_length)

(b) Build Seq2Seq Model



In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder: embed the source word ids and keep only the LSTM's final states.
# NOTE: despite its name, `encoder_lstm` is the LSTM's output tensor, not the layer.
encoder_inputs = Input(shape=(eng_max_length,))
encoder_embedding = Embedding(eng_vocab_size, 256)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: an LSTM initialised with the encoder states, followed by a softmax
# over the French vocabulary at every timestep.
# The time dimension is declared as None (variable length) instead of
# fr_max_length: the inference decoder is built on this same input tensor and is
# fed a single token per step, so the graph must not be tied to a fixed length.
# Training on sequences padded to fr_max_length is unaffected.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(fr_vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source ids, decoder input ids) -> per-step word probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

(c) Preparing the Data for Training



In [11]:
# Teacher forcing: at step t the decoder is fed the target word of step t-1 and
# must predict the word of step t. The decoder input is therefore the target
# sequence shifted right by one position. Feeding the unshifted sequence as
# both input and target would only teach the decoder to copy its input.
# Position 0 is filled with id 0, which acts as the start token; this matches
# translate_sentence, which seeds decoding with a zero and stops when the
# predicted id maps to no word.
decoder_input_sequences = np.zeros_like(fr_sequences)
decoder_input_sequences[:, 1:] = fr_sequences[:, :-1]

# Targets are the unshifted word ids with a trailing axis of size 1
fr_target_sequences = np.expand_dims(fr_sequences, -1)

# Prepare training input and output
X_train = [eng_sequences, decoder_input_sequences]
Y_train = fr_target_sequences

(d) Train the Model on the Dataset



In [7]:
# Fit the seq2seq model; validation_split holds out a fraction of the pairs
# for validation. Left as the cell's last expression, as before.
model.fit(
    X_train,
    Y_train,
    epochs=100,
    batch_size=2,
    validation_split=0.2,
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 257ms/step - accuracy: 0.2167 - loss: 2.6341 - val_accuracy: 0.5000 - val_loss: 2.5958
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.5708 - loss: 2.5536 - val_accuracy: 0.5000 - val_loss: 2.5248
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5917 - loss: 2.4693 - val_accuracy: 0.5000 - val_loss: 2.4273
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.5458 - loss: 2.3249 - val_accuracy: 0.5000 - val_loss: 2.2997
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5250 - loss: 2.1464 - val_accuracy: 0.5000 - val_loss: 2.1543
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4750 - loss: 2.1146 - val_accuracy: 0.5000 - val_loss: 2.1189
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x782a1dfecfa0>

(e) Inference Setup for Translation



In [8]:
# Encoder model for inference: source word ids -> final LSTM states [h, c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference: (decoder word ids, h, c) -> (word probabilities, new h, new c).
# It reuses the trained decoder_lstm and decoder_dense layers and shares
# decoder_inputs / decoder_embedding with the training graph, so the sequence
# length it accepts is whatever decoder_inputs declares.
# The state inputs take their width from the trained LSTM rather than repeating
# the literal, so the two cannot drift apart if the layer size is changed.
decoder_state_size = decoder_lstm.units
decoder_state_input_h = Input(shape=(decoder_state_size,))
decoder_state_input_c = Input(shape=(decoder_state_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE: decoder_outputs, state_h and state_c are rebound here and no longer
# refer to the training-graph tensors of the same names.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

(f) Translate New Sentences



In [10]:
def translate_sentence(input_text):
    """Greedily decode a French translation for an English sentence.

    The sentence is tokenized with ``eng_tokenizer``, padded to
    ``eng_max_length`` and run through ``encoder_model`` to obtain the initial
    decoder states. ``decoder_model`` is then called one token at a time, always
    keeping the most probable word, for at most ``fr_max_length`` steps.

    Args:
        input_text: English sentence as a single string.

    Returns:
        The decoded words joined by single spaces; an empty string when the
        first predicted id already maps to no word.
    """
    # Tokenize and pad input text
    input_seq = eng_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=eng_max_length, padding='post')

    # Encode input. list() guarantees the `[target_seq] + states_value`
    # concatenation below works whatever container predict returns;
    # verbose=0 keeps the per-call progress bar out of the cell output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Seed decoding with id 0. No dedicated start token is defined in this
    # notebook, so the zero id plays that role.
    target_seq = np.zeros((1, 1))
    translated_words = []

    for _ in range(fr_max_length):
        # Predict the next word from the previous token and the carried states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable id at the last timestep, cast to a plain
        # int for the index_word lookup
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index)

        # Stop on an id with no associated word or on an explicit 'end' marker
        if sampled_word is None or sampled_word == 'end':
            break
        translated_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ' '.join(translated_words)