In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Embedding, Dense, Bidirectional
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [3]:
data = [
    ("Hello", "नमस्ते"),
    ("How are you?", "आप कैसे हैं?"),
    ("Good morning", "सुप्रभात"),
    ("I love programming", "मुझे प्रोग्रामिंग पसंद है"),
    ("See you soon", "जल्द ही मिलते हैं")
]

In [4]:
# Convert into DataFrame
df = pd.DataFrame(data, columns=['English', 'Hindi'])

In [7]:
# Lowercase the text
df['English'] = df['English'].str.lower()
df['Hindi'] = df['Hindi'].str.lower()

In [8]:
df.head()

Unnamed: 0,English,Hindi
0,hello,नमस्ते
1,how are you?,आप कैसे हैं?
2,good morning,सुप्रभात
3,i love programming,मुझे प्रोग्रामिंग पसंद है
4,see you soon,जल्द ही मिलते हैं


### Tokenization and Padding

In [9]:
# Tokenizers for English and Hindi
eng_tokenizer = Tokenizer()
hin_tokenizer = Tokenizer()

# Fit tokenizers
eng_tokenizer.fit_on_texts(df['English'])
hin_tokenizer.fit_on_texts(df['Hindi'])

In [10]:
# Convert text to sequences
eng_sequences = eng_tokenizer.texts_to_sequences(df['English'])
hin_sequences = hin_tokenizer.texts_to_sequences(df['Hindi'])

# Vocabulary sizes
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

In [11]:
# Padding sequences
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_hin = max(len(seq) for seq in hin_sequences)

eng_sequences = pad_sequences(eng_sequences, maxlen=max_len_eng, padding='post')
hin_sequences = pad_sequences(hin_sequences, maxlen=max_len_hin, padding='post')

In [12]:
print("English Vocabulary Size:", eng_vocab_size)
print("Hindi Vocabulary Size:", hin_vocab_size)

English Vocabulary Size: 12
Hindi Vocabulary Size: 13


## Build Seq2Seq Model (RNN, LSTM, Bi-LSTM)

In [16]:
def build_model(cell_type='RNN'):
    embedding_dim = 64
    units = 128

    # Encoder
    encoder_inputs = Input(shape=(max_len_eng,))
    enc_emb = Embedding(eng_vocab_size, embedding_dim)(encoder_inputs)

    if cell_type == 'RNN':
        encoder = tf.keras.layers.SimpleRNN(units, return_state=True)
        encoder_outputs, state_h = encoder(enc_emb)
        encoder_states = [state_h]
    elif cell_type == 'LSTM':
        encoder = LSTM(units, return_state=True)
        encoder_outputs, state_h, state_c = encoder(enc_emb)
        encoder_states = [state_h, state_c]
    else:  # Bi-LSTM
        encoder = Bidirectional(LSTM(units, return_state=True, return_sequences=False))
        encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(enc_emb)
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(max_len_hin,))
    dec_emb_layer = Embedding(hin_vocab_size, embedding_dim)
    dec_emb = dec_emb_layer(decoder_inputs)

    if cell_type == 'RNN':
        decoder = tf.keras.layers.SimpleRNN(units, return_sequences=True, return_state=True)
        decoder_outputs, _ = decoder(dec_emb, initial_state=encoder_states)
    elif cell_type == 'LSTM':
        decoder = LSTM(units, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder(dec_emb, initial_state=encoder_states)
    else:  # Bi-LSTM
        decoder = LSTM(units * 2, return_sequences=True, return_state=True)  # Double units to match encoder's concatenation
        decoder_outputs, _, _ = decoder(dec_emb, initial_state=encoder_states)

    decoder_dense = Dense(hin_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model


### Train the Model

In [18]:
def train_model(cell_type='RNN'):
    model = build_model(cell_type)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    decoder_input_data = np.roll(hin_sequences, shift=1, axis=1)  # Shift for teacher forcing
    decoder_input_data[:, 0] = 0  # First token is always 0 (start token)

    model.fit([eng_sequences, decoder_input_data], hin_sequences, batch_size=16, epochs=50, verbose=1)
    return model

# Train RNN, LSTM, and Bi-LSTM models
rnn_model = train_model('RNN')
lstm_model = train_model('LSTM')
bilstm_model = train_model('Bi-LSTM')

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 2.5787
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.2500 - loss: 2.4926
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.6000 - loss: 2.4063
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.8500 - loss: 2.3162
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.8500 - loss: 2.2194
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 0.9000 - loss: 2.1131
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.9500 - loss: 1.9953
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.9000 - loss: 1.8647
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [19]:
def evaluate_translation(model, input_text):
    sequence = eng_tokenizer.texts_to_sequences([input_text.lower()])
    sequence = pad_sequences(sequence, maxlen=max_len_eng, padding='post')

    prediction = model.predict([sequence, np.zeros((1, max_len_hin))])  # Empty decoder input
    predicted_sequence = np.argmax(prediction, axis=-1)[0]

    output_words = [word for i in predicted_sequence if (word := hin_tokenizer.index_word.get(i))]
    return ' '.join(output_words)

# Example translations
test_sentences = ["Hello", "How are you?", "Good morning"]

for sentence in test_sentences:
    print(f"\n**Input:** {sentence}")
    print(f"**RNN Output:** {evaluate_translation(rnn_model, sentence)}")
    print(f"**LSTM Output:** {evaluate_translation(lstm_model, sentence)}")
    print(f"**Bi-LSTM Output:** {evaluate_translation(bilstm_model, sentence)}")


**Input:** Hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
**RNN Output:** नमस्ते
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362ms/step
**LSTM Output:** 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 554ms/step
**Bi-LSTM Output:** नमस्ते

**Input:** How are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
**RNN Output:** आप कैसे हैं
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
**LSTM Output:** 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
**Bi-LSTM Output:** आप हैं

**Input:** Good morning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
**RNN Output:** सुप्रभात
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
**LSTM Output:** 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
**Bi-LSTM Output:** नमस्ते


### Compute BLEU Score

In [20]:
def compute_bleu(model, test_pairs):
    bleu_scores = []
    for eng, hin in test_pairs:
        predicted = evaluate_translation(model, eng)
        reference = [hin.split()]  # Reference translation as a list of words
        candidate = predicted.split()
        bleu_scores.append(sentence_bleu(reference, candidate))
    return np.mean(bleu_scores)

print("\nBLEU Scores:")
print(f"RNN Model: {compute_bleu(rnn_model, data)}")
print(f"LSTM Model: {compute_bleu(lstm_model, data)}")
print(f"Bi-LSTM Model: {compute_bleu(bilstm_model, data)}")



BLEU Scores:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
RNN Model: 0.4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
LSTM Model: 3.703621506333808e-232
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Bi-LSTM Model

### Compare Teacher Forcing vs. No Teacher Forcing

In [21]:
def train_without_teacher_forcing(cell_type='RNN'):
    model = build_model(cell_type)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit([eng_sequences, hin_sequences], hin_sequences, batch_size=16, epochs=50, verbose=1)  # No teacher forcing
    return model

rnn_no_tf_model = train_without_teacher_forcing('RNN')

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.1500 - loss: 2.5682
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.4000 - loss: 2.4726
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.8500 - loss: 2.3764
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.9000 - loss: 2.2759
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.9000 - loss: 2.1679
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.9000 - loss: 2.0497
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.9500 - loss: 1.9198
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.9500 - loss: 1.7788
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

## Questions and Answers


1.   **Difference between RNN, LSTM, and Bi-LSTM in Machine Translation**

    *   RNN: Processes sequences sequentially but suffers from vanishing gradient issues.
    *   LSTM: Uses gates (input, forget, output) to retain long-term dependencies.
    *   Bi-LSTM: Processes sequences in both forward and backward directions, improving context understanding.


2.   **Why is LSTM preferred over vanilla RNNs for Seq2Seq tasks?**

    *   LSTM solves the vanishing gradient problem, allowing better retention of long-term dependencies.
    *   Helps in handling long sequences where RNNs struggle.


3.   **How does the bidirectional nature of Bi-LSTM improve translation accuracy?**

    *   Captures both past and future context, making translations more accurate.
    *   Provides better context awareness, especially for longer sentences.

4.   **What is teacher forcing, and how does it impact training?**

    *   Teacher Forcing: During training, instead of using the predicted word as the next input, the actual target word is fed.
    *   Impact:

        *   Faster convergence.
        *   Reduces error accumulation.

5. **How is BLEU Score calculated and its significance?**

    * BLEU Score (Bilingual Evaluation Understudy):

        *   Compares machine-translated output with reference translations.
        *   Uses n-gram precision and applies brevity penalty for short translations.

    * Significance:

        *   Measures how closely a model’s output matches human translations.
        *   Higher BLEU = Better translation quality.











