In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Sample dataset
# Each entry is an (English, French) pair; keeping the two sides together
# makes it impossible for the parallel lists to drift out of alignment.
phrase_pairs = [
    ('hello', 'salut'),
    ('how are you', 'comment ça va'),
    ('good morning', 'bonjour'),
    ('good night', 'bonne nuit'),
    ('thank you', 'merci'),
    ('please', 's’il vous plaît'),
    ('yes', 'oui'),
    ('no', 'non'),
    ('what is your name', 'comment tu t’appelles'),
    ('my name is John', 'je m’appelle John'),
    ('where are you from', 'd’où viens-tu'),
    ('i am from india', 'je viens d’Inde'),
    ('do you speak english', 'parles-tu anglais'),
    ('i love you', 'je t’aime'),
    ('see you later', 'à plus tard'),
    ('i am fine', 'je vais bien'),
    ('welcome', 'bienvenue'),
    ('goodbye', 'au revoir'),
    ('excuse me', 'excusez-moi'),
    ('i don’t understand', 'je ne comprends pas'),
]

english_texts = [english for english, _ in phrase_pairs]

# Wrap every French sentence in <start>/<end> markers so the decoder has an
# explicit first input token and an explicit stop token.
french_texts = [f'<start> {french} <end>' for _, french in phrase_pairs]

# 2. Tokenization
# --- English (encoder side) ---
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(english_texts)
max_encoder_seq_length = max(map(len, eng_sequences))
encoder_input_data = pad_sequences(
    eng_sequences,
    maxlen=max_encoder_seq_length,
    padding='post',
)

# --- French (decoder side) ---
# filters='' disables character filtering so the <start>/<end> markers
# survive tokenization intact.
fre_tokenizer = Tokenizer(filters='')
fre_tokenizer.fit_on_texts(french_texts)
fre_sequences = fre_tokenizer.texts_to_sequences(french_texts)
max_decoder_seq_length = max(map(len, fre_sequences))

# Teacher forcing: the decoder input is the sentence without its last token
# and the target is the same sentence shifted left by one, so both are one
# step shorter than the longest French sequence.
decoder_steps = max_decoder_seq_length - 1
decoder_input_data = pad_sequences(
    [seq[:-1] for seq in fre_sequences],
    maxlen=decoder_steps,
    padding='post',
)
decoder_target_data = pad_sequences(
    [seq[1:] for seq in fre_sequences],
    maxlen=decoder_steps,
    padding='post',
)

# 3. Vocabulary sizes
# One more than the number of known words, leaving room for the index 0 that
# pad_sequences writes into padded positions.
num_encoder_tokens = 1 + len(eng_tokenizer.word_index)
num_decoder_tokens = 1 + len(fre_tokenizer.word_index)

# 4. Model Parameters
embedding_dim = 64   # size of each token embedding vector
latent_dim = 256     # number of LSTM units (size of the h and c states)

# 5. Encoder Model
# The encoder reads the English token ids and keeps only its final LSTM
# states; these become the decoder's initial state.
# NOTE(review): the Embedding layers are built without mask_zero, so padded
# positions are processed like ordinary tokens — confirm this is intended.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, embedding_dim)
enc_emb = encoder_embedding(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# 6. Decoder Model
# The decoder layers are kept in named variables because the inference
# decoder built later reuses the very same (trained) layer objects.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_hidden_seq, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_hidden_seq)

# 7. Define Full Model
# Training model: (English ids, shifted French ids) -> per-step distribution
# over the French vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Expand target data for sparse loss
# Append a trailing axis of length 1 so each target step is a single integer
# class id.
decoder_target_data = decoder_target_data[..., np.newaxis]

# 8. Train the Model
training_inputs = [encoder_input_data, decoder_input_data]
model.fit(
    training_inputs,
    decoder_target_data,
    batch_size=16,
    epochs=300,
    verbose=1,
)

# 9. Inference Encoder Model
# Maps English token ids straight to the encoder's final [h, c] states.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# 10. Inference Decoder Model
# At inference time the decoder is run step by step, so its LSTM state is
# fed in explicitly and returned after each call instead of being wired to
# the encoder.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the layer objects trained above (embedding, LSTM, dense).
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_lstm_out2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs,
)
decoder_outputs2 = decoder_dense(decoder_lstm_out2)
decoder_states2 = [state_h2, state_c2]

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)

# 11. Reverse Token Dictionaries
# fre_word_index maps word -> id; reverse_fre_index is its inverse (id -> word)
# and is used to turn predicted ids back into text.
fre_word_index = fre_tokenizer.word_index
reverse_fre_index = {index: word for word, index in fre_word_index.items()}

# 12. Decode Sequence Function
def decode_sequence(input_seq):
    """Greedily decode a French sentence from an encoded English sequence.

    The encoder states for ``input_seq`` seed the decoder, which is then run
    one token at a time starting from ``<start>``; at every step the most
    probable token is taken and fed back in as the next input.

    Args:
        input_seq: Padded array of English token ids holding a single
            sentence (one row), passed straight to ``encoder_model``.

    Returns:
        The decoded words joined by single spaces, without the ``<start>``
        and ``<end>`` markers. May be an empty string.
    """
    # list(...) so the states can be concatenated with [target_seq] below
    # whatever sequence type predict() returns; verbose=0 keeps the per-call
    # progress bar from flooding the console during step-by-step decoding.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    target_seq = np.array([[fre_word_index['<start>']]])
    decoded_words = []

    # Bound the number of decoding *steps*, not the number of decoded words:
    # a predicted index without a vocabulary entry adds no word, so a
    # word-count limit alone cannot guarantee that the loop terminates.
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_fre_index.get(sampled_token_index)

        # Stop on the explicit end marker, or on an index that maps to no
        # word (e.g. the padding index 0) — nothing useful can follow it.
        if sampled_word is None or sampled_word == '<end>':
            break

        decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ' '.join(decoded_words)

# 13. Translate Function
def translate(input_text):
    """Translate one English sentence to French with the trained model.

    Args:
        input_text: The English sentence as a plain string.

    Returns:
        The decoded French sentence returned by ``decode_sequence``.
    """
    # texts_to_sequences expects a list of texts, hence the one-element list.
    token_ids = eng_tokenizer.texts_to_sequences([input_text])
    # Bring the sequence to the fixed encoder length used during training.
    padded_ids = pad_sequences(
        token_ids,
        maxlen=max_encoder_seq_length,
        padding='post',
    )
    return decode_sequence(padded_ids)

# 14. User Interface
# Interactive prompt: read English sentences and print their translation
# until the user types 'exit' or the input stream is closed.
while True:
    try:
        input_text = input("\nEnter an English sentence to translate to French (or type 'exit'): ")
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave cleanly instead
        # of dying with a traceback.
        print("\nExiting translator...")
        break

    # Ignore surrounding whitespace so that ' exit ' is still recognised.
    input_text = input_text.strip()
    if not input_text:
        # Nothing to translate; ask again rather than decoding pure padding.
        continue
    if input_text.lower() == 'exit':
        print("Exiting translator...")
        break

    translation = translate(input_text)
    print("French Translation:", translation)


Epoch 1/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 5s 49ms/step - accuracy: 0.0642 - loss: 3.6846
Epoch 2/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 53ms/step - accuracy: 0.3742 - loss: 3.6105
Epoch 3/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 49ms/step - accuracy: 0.3742 - loss: 3.5052
Epoch 4/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 48ms/step - accuracy: 0.3825 - loss: 3.2623
Epoch 5/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 54ms/step - accuracy: 0.3700 - loss: 2.7097
Epoch 6/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 48ms/step - accuracy: 0.3742 - loss: 2.2813
Epoch 7/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 51ms/step - accuracy: 0.3825 - loss: 2.1508
Epoch 8/300
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 46ms/step - accuracy: 0.3742 - loss: 2.1621
Epoch 9/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3