In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dot, Activation, Concatenate
from tensorflow.keras.models import Model

In [3]:
english_sentences = [
    "i eat rice",
    "i drink water",
    "you eat food",
    "you drink milk",
    "he goes home",
    "she reads books",
    "we play games",
    "they watch tv",
    "i like tea",
    "she likes coffee",
    "he loves music",
    "we love movies"
]

tamil_sentences = [
    " நான் சாதம் சாப்பிடுகிறேன் ",
    " நான் தண்ணீர் குடிக்கிறேன் ",
    " நீங்கள் உணவு சாப்பிடுகிறீர்கள் ",
    " நீங்கள் பால் குடிக்கிறீர்கள் ",
    " அவன் வீட்டிற்கு செல்கிறான் ",
    " அவள் புத்தகங்கள் படிக்கிறாள் ",
    " நாங்கள் விளையாட்டுகள் விளையாடுகிறோம் ",
    " அவர்கள் டிவி பார்க்கிறார்கள் ",
    " எனக்கு தேநீர் பிடிக்கும் ",
    " அவளுக்கு காபி பிடிக்கும் ",
    " அவனுக்கு இசை பிடிக்கும் ",
    " நாங்கள் திரைப்படங்களை நேசிக்கிறோம் "
]

In [4]:
src_tokenizer = Tokenizer(oov_token="")
src_tokenizer.fit_on_texts(english_sentences)
src_seq = src_tokenizer.texts_to_sequences(english_sentences)

tgt_tokenizer = Tokenizer(filters='', oov_token="")
tgt_tokenizer.fit_on_texts(tamil_sentences)
tgt_seq = tgt_tokenizer.texts_to_sequences(tamil_sentences)

src_vocab = len(src_tokenizer.word_index) + 1
tgt_vocab = len(tgt_tokenizer.word_index) + 1

max_src = max(len(s) for s in src_seq)
max_tgt = max(len(s) for s in tgt_seq)

encoder_input = pad_sequences(src_seq, maxlen=max_src, padding="post")

decoder_input = pad_sequences(
    [s[:-1] for s in tgt_seq],
    maxlen=max_tgt-1,
    padding="post"
)

decoder_target = pad_sequences(
    [s[1:] for s in tgt_seq],
    maxlen=max_tgt-1,
    padding="post"
)

In [5]:
embedding_dim = 32
latent_dim = 64

enc_inputs = Input(shape=(max_src,))
enc_emb = Embedding(src_vocab, embedding_dim)(enc_inputs)
enc_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
enc_out, h, c = enc_lstm(enc_emb)

dec_inputs = Input(shape=(max_tgt-1,))
dec_emb = Embedding(tgt_vocab, embedding_dim)(dec_inputs)
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_out, _, _ = dec_lstm(dec_emb, initial_state=[h, c])

score = Dot(axes=[2, 2])([dec_out, enc_out])
attention = Activation("softmax")(score)
context = Dot(axes=[2, 1])([attention, enc_out])

combined = Concatenate(axis=-1)([context, dec_out])
output = Dense(tgt_vocab, activation="softmax")(combined)

model = Model([enc_inputs, dec_inputs], output)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [6]:
model.fit(
    [encoder_input, decoder_input],
    np.expand_dims(decoder_target, -1),
    epochs=300,
    batch_size=2,
    shuffle=False,
    verbose=1
)

Epoch 1/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 3.4992
Epoch 2/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4083 - loss: 3.4839
Epoch 3/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4202 - loss: 3.4731
Epoch 4/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4482 - loss: 3.4617
Epoch 5/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4202 - loss: 3.4488
Epoch 6/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2988 - loss: 3.4335 
Epoch 7/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2988 - loss: 3.4149 
Epoch 8/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2988 - loss: 3.3916 
Epoch 9/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x7d97b4e86ab0>

In [7]:
reverse_tgt = {v: k for k, v in tgt_tokenizer.word_index.items()}

def translate(sentence):
    seq = src_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_src, padding="post")

    target = np.zeros((1, max_tgt-1))
    decoded = []

    for i in range(max_tgt-1):
        preds = model.predict([seq, target], verbose=0)
        word_id = np.argmax(preds[0, i])
        word = reverse_tgt.get(word_id, "")

        if word == "":
            break

        decoded.append(word)
        target[0, i] = word_id

    return " ".join(decoded)

In [8]:
print("English:", "i drink water")
print("Tamil  :", translate("i drink water"))

print("English:", "she reads books")
print("Tamil  :", translate("she reads books"))

English: i drink water
Tamil  : தண்ணீர் குடிக்கிறேன்
English: she reads books
Tamil  : புத்தகங்கள் படிக்கிறாள்
