## **English → Tamil Translation using Seq2Seq (LSTM)**

### **Import Libraries**

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec


### **Load Dataset**

In [39]:
train = pd.read_csv("engtamilTrain.csv")
train = train.drop(["Unnamed: 0"], axis=1)

english_sentences = train["en"].astype(str).head(1000)
tamil_sentences   = train["ta"].astype(str).head(1000)


In [43]:
english_sentences.head()

0    MMA vice president Qazi Hussain Ahmad declared...
1    Information has surfaced in recent years sugge...
2    And Azor begat Sadoc; and Sadoc begat Achim; a...
3    She says she knows what is going on, but can d...
4    And be it indeed that I have erred, my error r...
Name: en, dtype: object

In [45]:
tamil_sentences.head()

0    MMA கட்சியின் துணைத்தலைவர் க்வாஸி ஹுசேன் அகமத்...
1    சமீபகாலத்தில் சில தகவல்கள் யூலியஸ் ரோசன்பேர்க...
2    ஆசோர் சாதோக்கைப் பெற்றான்; சாதோக்கு ஆகீமைப் பெ...
3    என்ன நடக்கிறது என்பது தமக்கு தெரியும் என்றும் ...
4    நான் தப்பிநடந்தது மெய்யானாலும், என் தப்பிதம் எ...
Name: ta, dtype: object

### **Add SOS> and <EOS (ONLY to Tamil)**

In [47]:
def add_sos_eos(sentences):
    return ["<SOS> " + s + " <EOS>" for s in sentences]

tamil_sentences = add_sos_eos(tamil_sentences)


### **Tokenization (Words → Numbers)**

In [52]:
english_tokenizer = Tokenizer(filters="")
tamil_tokenizer   = Tokenizer(filters="")

english_tokenizer.fit_on_texts(english_sentences) #eg-[1,2,3,4,5]
tamil_tokenizer.fit_on_texts(tamil_sentences)

english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
tamil_sequences   = tamil_tokenizer.texts_to_sequences(tamil_sentences)#eg-[2,3,4,2]

english_vocab_size = len(english_tokenizer.word_index) + 1
tamil_vocab_size   = len(tamil_tokenizer.word_index) + 1


In [59]:
print(english_vocab_size)
print(tamil_vocab_size)

6977
9922


### **Padding (Fix Sequence Length)**

In [61]:
max_input_len  = 20
max_output_len = 20

encoder_input_sequences = pad_sequences(
    english_sequences,
    maxlen=max_input_len,
    padding="post"
)

decoder_full_sequences = pad_sequences(
    tamil_sequences,
    maxlen=max_output_len,
    padding="post"
)


### **Teacher Forcing (Decoder Input & Target)**

In [64]:
decoder_input_sequences  = decoder_full_sequences[:, :-1]
decoder_target_sequences = decoder_full_sequences[:, 1:]


In [68]:
decoder_target_sequences

array([[1836,   44, 1837, ...,    0,    0,    0],
       [1843,   69,  886, ...,    2,    0,    0],
       [1854, 1855,  887, ...,    0,    0,    0],
       ...,
       [9891,   86,  266, ..., 9899, 9900,    2],
       [9905, 9906, 9907, ..., 1036, 9919,    2],
       [ 195,   25,  263, ...,    0,    0,    0]])

### **Load Word2Vec Models**

In [76]:
# Already created engmodel and tammodel on wordembedding practice notebook and saved
# Import ting from there
eng_model = Word2Vec.load("engmodel.bin")
tam_model = Word2Vec.load("tammodel.bin")


### **Create Embedding Matrices**

In [78]:
def create_embedding_matrix(w2v_model, tokenizer, vocab_size):
    embedding_dim = w2v_model.vector_size
    matrix = np.zeros((vocab_size, embedding_dim))

    for word, idx in tokenizer.word_index.items():
        if word in w2v_model.wv:
            matrix[idx] = w2v_model.wv[word]

    return matrix

eng_embedding_matrix = create_embedding_matrix(
    eng_model, english_tokenizer, english_vocab_size
)

tam_embedding_matrix = create_embedding_matrix(
    tam_model, tamil_tokenizer, tamil_vocab_size
)


### **Build Seq2Seq Model (Encoder–Decoder)**

In [81]:
latent_dim = 100

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embed = Embedding(
    english_vocab_size,
    latent_dim,
    weights=[eng_embedding_matrix],
    trainable=False
)(encoder_inputs)

encoder_outputs, state_h, state_c = LSTM(
    latent_dim, return_state=True
)(encoder_embed)

# Decoder
decoder_inputs = Input(shape=(max_output_len - 1,))
decoder_embed = Embedding(
    tamil_vocab_size,
    latent_dim,
    weights=[tam_embedding_matrix],
    trainable=False
)(decoder_inputs)

decoder_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True
)

decoder_outputs, _, _ = decoder_lstm(
    decoder_embed,
    initial_state=[state_h, state_c]
)

decoder_dense = Dense(tamil_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


### **Compile & Train Model**

In [84]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

history = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_target_sequences,
    batch_size=32,
    epochs=50,
    validation_split=0.2
)


Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 167ms/step - accuracy: 0.2057 - loss: 9.1260 - val_accuracy: 0.2537 - val_loss: 8.3249
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 142ms/step - accuracy: 0.2458 - loss: 7.7123 - val_accuracy: 0.2534 - val_loss: 7.5384
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 138ms/step - accuracy: 0.2558 - loss: 6.6025 - val_accuracy: 0.2524 - val_loss: 7.4855
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 146ms/step - accuracy: 0.2502 - loss: 6.3903 - val_accuracy: 0.2524 - val_loss: 7.5378
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 143ms/step - accuracy: 0.2542 - loss: 6.2916 - val_accuracy: 0.2542 - val_loss: 7.6371
Epoch 6/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 136ms/step - accuracy: 0.2505 - loss: 6.2707 - val_accuracy: 0.2550 - val_loss: 7.7054
Epoch 7/50
[1m25/25[0m [3

### **Inference Models (REAL TRANSLATION)**

In [87]:
encoder_model = Model(
    encoder_inputs,
    [state_h, state_c]
)
# Decoder Inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_embed,
    initial_state=decoder_states_inputs
)

decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs, state_h_dec, state_c_dec]
)


### **Translation Function**

In [104]:
def translate_sentence(sentence):
    seq = english_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_input_len, padding="post")

    state_h, state_c = encoder_model.predict(seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tamil_tokenizer.word_index['<sos>']

    decoded_sentence = []

    for _ in range(max_output_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, state_h, state_c]
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tamil_tokenizer.index_word.get(sampled_token_index)

        if sampled_word == '<EOS>' or sampled_word is None:
            break

        decoded_sentence.append(sampled_word)

        target_seq[0, 0] = sampled_token_index
        state_h, state_c = h, c

    return " ".join(decoded_sentence)


In [106]:
list(tamil_tokenizer.word_index.keys())[:20]


['<sos>',
 '<eos>',
 'ஒரு',
 'மற்றும்',
 'என்று',
 'இந்த',
 'அவர்',
 'அமெரிக்க',
 'என்ற',
 'என',
 'நான்',
 'உள்ள',
 'அரசியல்',
 'இருந்து',
 'அந்த',
 'அது',
 'என்',
 'கட்சி',
 'அவர்கள்',
 'மக்கள்']

In [110]:

print(translate_sentence("Technology is changing the world"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
உண்மையிலேயே <eos> <eos>
