## **English → Tamil Translation using Seq2Seq (LSTM)**

### **Import Libraries**

In [156]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec


### **Load Dataset**

In [159]:
train = pd.read_csv("engtamilTrain.csv")
train = train.drop(["Unnamed: 0"], axis=1)

english_sentences = train["en"].astype(str).head(2000)
tamil_sentences   = train["ta"].astype(str).head(2000)


In [165]:
english_sentences.tail()

1995           Sreya is paired with Rajini in 'Sivaji'.\n
1996    It has opposed every form of Sinhalese chauvin...
1997    Yet his meat in his bowels is turned, it is th...
1998    Eight of the 26 are currently employed on full...
1999    And changed his prison garments: and he did ea...
Name: en, dtype: object

In [167]:
tamil_sentences.tail()

1995     'சிவாஜி'யில் ரஜினி ஜோடியாக ஷ்ரேயா நடிக்கிறார்.\n
1996    தமிழீழ விடுதலைப் புலிகளின் வங்குரோத்தானதும் ப...
1997    அவன் போஜனம் அவன் குடல்களில் மாறி, அவனுக்குள் வ...
1998    இந்த 26 பேரில் 8 பேர் முழுச் சம்பளத்தில் மீளச்...
1999    அவனுடைய சிறைச்சாலை வஸ்திரங்களை மாற்றினான்; அவன...
Name: ta, dtype: object

### **Add SOS> and <EOS (ONLY to Tamil)**

In [170]:
def add_sos_eos(sentences):
    return ["<SOS> " + s + " <EOS>" for s in sentences]

tamil_sentences = add_sos_eos(tamil_sentences)


### **Tokenization (Words → Numbers)**

In [173]:
english_tokenizer = Tokenizer(filters="")
tamil_tokenizer   = Tokenizer(filters="")

english_tokenizer.fit_on_texts(english_sentences) #eg-[1,2,3,4,5]
tamil_tokenizer.fit_on_texts(tamil_sentences)

english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
tamil_sequences   = tamil_tokenizer.texts_to_sequences(tamil_sentences)#eg-[2,3,4,2]

english_vocab_size = len(english_tokenizer.word_index) + 1
tamil_vocab_size   = len(tamil_tokenizer.word_index) + 1


In [175]:
print(english_vocab_size)
print(tamil_vocab_size)

11162
17357


### **Padding (Fix Sequence Length)**

In [178]:
max_input_len  = 20
max_output_len = 20

encoder_input_sequences = pad_sequences(
    english_sequences,
    maxlen=max_input_len,
    padding="post"
)

decoder_full_sequences = pad_sequences(
    tamil_sequences,
    maxlen=max_output_len,
    padding="post"
)


### **Teacher Forcing (Decoder Input & Target)**

In [181]:
decoder_input_sequences  = decoder_full_sequences[:, :-1]
decoder_target_sequences = decoder_full_sequences[:, 1:]


In [183]:
decoder_target_sequences

array([[ 3588,    63,  3589, ...,     0,     0,     0],
       [ 3595,    54,   679, ...,     2,     0,     0],
       [ 3604,  3605,   875, ...,     0,     0,     0],
       ...,
       [   20, 17338,    20, ...,     0,     0,     0],
       [    6, 17344,   497, ...,     0,     0,     0],
       [  171, 17350, 17351, ...,     0,     0,     0]])

### **Load Word2Vec Models**

In [186]:
# Already created engmodel and tammodel on wordembedding practice notebook and saved
# Import ting from there
eng_model = Word2Vec.load("engmodel.bin")
tam_model = Word2Vec.load("tammodel.bin")


### **Create Embedding Matrices**

In [189]:
def create_embedding_matrix(w2v_model, tokenizer, vocab_size):
    embedding_dim = w2v_model.vector_size
    matrix = np.zeros((vocab_size, embedding_dim))

    for word, idx in tokenizer.word_index.items():
        if word in w2v_model.wv:
            matrix[idx] = w2v_model.wv[word]

    return matrix

eng_embedding_matrix = create_embedding_matrix(
    eng_model, english_tokenizer, english_vocab_size
)

tam_embedding_matrix = create_embedding_matrix(
    tam_model, tamil_tokenizer, tamil_vocab_size
)


In [191]:
eng_embedding_matrix.shape

(11162, 100)

In [193]:
tam_embedding_matrix.shape

(17357, 100)

def create_seq2seq_model(input_vocab_size, output_vocab_size, input_seq_length, output_seq_length, hidden_units, eng_embedding_matrix, tam_embedding_matrix):
    # Encoder
    encoder_inputs = Input(shape=(input_seq_length,))
    encoder_embedding = Embedding(input_vocab_size, hidden_units, weights=[eng_embedding_matrix], trainable=False)(encoder_inputs)
    encoder_lstm, encoder_state_h, encoder_state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(output_seq_length,))
    decoder_embedding = Embedding(output_vocab_size, hidden_units, weights=[tam_embedding_matrix], trainable=False)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model


### **Build Seq2Seq Model (Encoder–Decoder)**

In [199]:
latent_dim = 100

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embed = Embedding(
    english_vocab_size,
    latent_dim,
    weights=[eng_embedding_matrix],
    trainable=False
)(encoder_inputs)

encoder_outputs, state_h, state_c = LSTM(
    latent_dim, return_state=True
)(encoder_embed)

# Decoder
decoder_inputs = Input(shape=(max_output_len - 1,))
decoder_embed = Embedding(
    tamil_vocab_size,
    latent_dim,
    weights=[tam_embedding_matrix],
    trainable=False
)(decoder_inputs)

decoder_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True
)

decoder_outputs, _, _ = decoder_lstm(
    decoder_embed,
    initial_state=[state_h, state_c]
)

decoder_dense = Dense(tamil_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


### **Compile & Train Model**

In [202]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

history = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_target_sequences,
    batch_size=32,
    epochs=50,
    validation_split=0.2
)


Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 252ms/step - accuracy: 0.2332 - loss: 9.3689 - val_accuracy: 0.2612 - val_loss: 7.6371
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 248ms/step - accuracy: 0.2423 - loss: 7.0159 - val_accuracy: 0.2612 - val_loss: 7.4407
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 256ms/step - accuracy: 0.2450 - loss: 6.7370 - val_accuracy: 0.2612 - val_loss: 7.5711
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 237ms/step - accuracy: 0.2494 - loss: 6.6375 - val_accuracy: 0.2646 - val_loss: 7.6433
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 230ms/step - accuracy: 0.2440 - loss: 6.6407 - val_accuracy: 0.2634 - val_loss: 7.7364
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 230ms/step - accuracy: 0.2519 - loss: 6.5405 - val_accuracy: 0.2642 - val_loss: 7.7825
Epoch 7/50
[1m50/50[

### **Inference Models (REAL TRANSLATION)**

In [205]:
encoder_model = Model(
    encoder_inputs,
    [state_h, state_c]
)
# Decoder Inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_embed,
    initial_state=decoder_states_inputs
)

decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs, state_h_dec, state_c_dec]
)


### **Translation Function**

In [208]:
def translate_sentence(sentence):
    seq = english_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_input_len, padding="post")

    state_h, state_c = encoder_model.predict(seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tamil_tokenizer.word_index['<sos>']

    decoded_sentence = []

    for _ in range(max_output_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, state_h, state_c]
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tamil_tokenizer.index_word.get(sampled_token_index)

        if sampled_word == '<EOS>' or sampled_word is None:
            break

        decoded_sentence.append(sampled_word)

        target_seq[0, 0] = sampled_token_index
        state_h, state_c = h, c

    return " ".join(decoded_sentence)


In [210]:
list(tamil_tokenizer.word_index.keys())[:20]


['<sos>',
 '<eos>',
 'மற்றும்',
 'ஒரு',
 'என்று',
 'இந்த',
 'அவர்',
 'அமெரிக்க',
 'அரசியல்',
 'என்ற',
 'அவர்கள்',
 'நான்',
 'இருந்து',
 'என',
 'அது',
 'உள்ள',
 'அந்த',
 'இது',
 'தனது',
 'அவன்']

In [214]:

print(translate_sentence("i love english"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
அன்பின் <eos> <eos>


“Basic Seq2Seq without attention compresses the entire input into a single context vector, causing information loss. As a result, the decoder often predicts <EOS> early. **This is a known limitation, and attention mechanisms were introduced to solve this.”**