In [1]:
import tensorflow as tf
# Pass an empty device list for the 'GPU' type so TensorFlow sees no GPUs
# and everything below runs on the CPU.
tf.config.set_visible_devices([], 'GPU')


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [3]:
def data_preprocessor(source_sentences,target_sentences):
    """Convert parallel sentence lists into padded integer-id sequences.

    Each language gets its own ``Tokenizer`` fitted only on that language's
    sentences. Every target sentence is wrapped as ``'start <sentence> end'``
    before tokenizing so the decoder has explicit begin/end marker words.

    Args:
        source_sentences: source-language strings.
        target_sentences: target-language strings, parallel to the source.

    Returns:
        A 4-tuple ``(source_padded, target_padded, source_tokenizer,
        target_tokenizer)``. Both padded results come from
        ``pad_sequences(..., padding='post')``, i.e. shorter sequences are
        filled at the end up to the longest sequence of that language.
    """
    def _fit_and_pad(sentences):
        # Build the vocabulary, map every word to its integer id, then pad.
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(sentences)
        id_sequences = tokenizer.texts_to_sequences(sentences)
        return pad_sequences(id_sequences, padding='post'), tokenizer

    source_padded, source_tokenizer = _fit_and_pad(source_sentences)

    # Add the begin/end marker words around every target sentence.
    wrapped_targets = []
    for sentence in target_sentences:
        wrapped_targets.append('start ' + sentence + ' end')
    target_padded, target_tokenizer = _fit_and_pad(wrapped_targets)

    return source_padded, target_padded, source_tokenizer, target_tokenizer
    

In [5]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [6]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the "cfilt/iitb-english-hindi" parallel corpus through datasets.load_dataset.
dataset = load_dataset("cfilt/iitb-english-hindi")

# Shuffle the training split with a fixed seed and keep the first 10,000 rows.
subset = dataset["train"].shuffle(seed=42).select(range(10000))

# Split every translation record into parallel English / Hindi sentence lists.
english_sentences = []
hindi_sentences = []
for item in subset:
    pair = item["translation"]
    english_sentences.append(pair["en"])
    hindi_sentences.append(pair["hi"])

In [8]:
# English is the source (encoder) side and Hindi the target (decoder) side.
# The tokenizers are kept for vocabulary sizes and for decoding predictions.
(
    encoder_input_data,
    target_data,
    source_tokenizer,
    target_tokenizer,
) = data_preprocessor(english_sentences, hindi_sentences)

In [9]:
# Inspect the padded target-side (Hindi) id matrix returned by data_preprocessor.
target_data

array([[   2, 8212, 5431, ...,    0,    0,    0],
       [   2, 8213,    1, ...,    0,    0,    0],
       [   2,   55,  427, ...,    0,    0,    0],
       ...,
       [   2, 1510, 1445, ...,    0,    0,    0],
       [   2, 1123,  542, ...,    0,    0,    0],
       [   2,  461,    9, ...,    0,    0,    0]], dtype=int32)

In [10]:
# Inspect the padded source-side (English) id matrix returned by data_preprocessor.
encoder_input_data

array([[   13,     1,  7819, ...,     0,     0,     0],
       [ 7820,  7821,     0, ...,     0,     0,     0],
       [   24,    10,    14, ...,     0,     0,     0],
       ...,
       [ 4958,   210, 17651, ...,     0,     0,     0],
       [ 1022,  1010,     0, ...,     0,     0,     0],
       [  196, 17652,  5865, ...,     0,     0,     0]], dtype=int32)

In [11]:
from tensorflow.keras.layers import Input,LSTM,Dense,Embedding,Concatenate
from tensorflow.keras.layers import AdditiveAttention as Attention
from tensorflow.keras.models import Model

# Model hyperparameters.
embedding_dim = 256
latent_dim = 512

# Vocabulary sizes. The +1 leaves room for id 0, which the padded matrices use as filler.
# The two tokenizers were fitted separately, so the two vocabularies differ.
num_encoder_tokens = len(source_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1

# ---- Encoder ----
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(
    num_encoder_tokens,
    embedding_dim,
    name="encoder_embedding",
)(encoder_inputs)

# return_sequences=True exposes the output at every time step (needed by attention);
# return_state=True additionally returns the final hidden and cell states.
encoder_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    name="encoder_lstm",
)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# ---- Decoder ----
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(
    num_decoder_tokens,
    embedding_dim,
    name='decoder_embedding',
)
decoder_embedding_output = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm",
)
# Teacher forcing: during training the decoder receives the whole ground-truth target
# sequence as input (rather than its own previous predictions) and starts from the
# encoder's final states.
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding_output,
    initial_state=encoder_states,
)

# ---- Attention: decoder outputs are the query, encoder outputs the value ----
attention_layer = Attention(name='attention_layer')
attention_output = attention_layer([decoder_outputs, encoder_outputs])

# Join each decoder output with its attention context along the feature axis.
decoder_concat = Concatenate(axis=-1, name="concat_layer")(
    [decoder_outputs, attention_output]
)

# Per-time-step softmax over the target vocabulary.
decoder_dense = Dense(
    num_decoder_tokens,
    activation='softmax',
    name='decoder_dense',
)
decoder_output_final = decoder_dense(decoder_concat)

# Training model: takes encoder and decoder inputs and produces output probabilities.
training_model = Model(
    [encoder_inputs, decoder_inputs],
    decoder_output_final,
)
training_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)





2025-03-03 09:26:38.735780: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-03-03 09:26:38.736320: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-03-03 09:26:38.736948: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [None]:
from tensorflow.keras.utils import to_categorical
# Teacher forcing: the decoder input is the target without its last token, and the
# training label is the same target shifted one step to the left.
decoder_input_data = target_data[:, :-1]
decoder_target_data = target_data[:, 1:]

# Train directly on integer token ids. One-hot encoding the labels with
# to_categorical would allocate a dense float array of shape
# (num_samples, target_len - 1, num_decoder_tokens), which is tens of gigabytes for a
# vocabulary this large. The sparse loss computes the same cross-entropy from the
# integer ids, so the model is recompiled here with it.
training_model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

training_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
)


Epoch 1/50


2025-03-03 09:26:48.627281: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2025-03-03 09:26:48.746238: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-03-03 09:26:48.746838: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-03-03 09:26:48.747384: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN

In [None]:
# Inference-time encoder: returns the per-step encoder outputs (consumed by attention)
# together with the final hidden and cell states (used to initialise the decoder).
encoder_model_inf = Model(
    inputs=encoder_inputs,
    outputs=[encoder_outputs, state_h, state_c],
)

In [None]:
# Placeholders fed at inference time: the decoder LSTM's hidden and cell state,
# and the encoder's per-step outputs that attention attends over.
decoder_state_input_h = Input(
    shape=(latent_dim,),
    name='decoder_state_input_h',
)
decoder_state_input_c = Input(
    shape=(latent_dim,),
    name='decoder_state_input_c',
)
encoder_outputs_input = Input(
    shape=(None, latent_dim),
    name='encoder_outputs_input',
)

In [None]:
# Rewire the decoder for step-by-step inference. The embedding, LSTM, attention and
# dense layers are the same layer objects used by training_model, so weights are shared.
decoder_inf_embeddings = decoder_embedding(decoder_inputs)
decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(
    decoder_inf_embeddings,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)
attention_inf = attention_layer([decoder_inf_outputs, encoder_outputs_input])
decoder_inf_concat = Concatenate(axis=-1)([decoder_inf_outputs, attention_inf])
decoder_inf_outputs_final = decoder_dense(decoder_inf_concat)

# Inputs: current token(s), encoder outputs, previous LSTM states.
# Outputs: token probabilities plus the updated LSTM states for the next step.
decoder_model_inf = Model(
    [decoder_inputs, encoder_outputs_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_inf_outputs_final, state_h_inf, state_c_inf],
)


In [None]:
def translate(input_text, max_decode_steps=50):
    """Greedily decode a translation of ``input_text``.

    The text is tokenized with ``source_tokenizer``, padded to the encoder's
    training length and encoded once. The decoder is then run one token at a
    time, starting from the ``'start'`` marker and feeding each predicted
    token back in, until it predicts ``'end'`` or the step budget runs out.

    Args:
        input_text: source-language sentence to translate.
        max_decode_steps: hard upper bound on the number of decoder steps.
            The loop is bounded by steps rather than by emitted words, because
            a predicted id with no vocabulary entry (e.g. padding id 0) emits
            no word and would otherwise never advance a word-count limit.

    Returns:
        The decoded words joined by single spaces (empty string if none).
    """
    input_seq = source_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=encoder_input_data.shape[1], padding='post')

    # verbose=0 keeps predict() from printing a progress bar on every call.
    enc_outs, state_h, state_c = encoder_model_inf.predict(input_seq, verbose=0)

    start_token_index = target_tokenizer.word_index['start']
    target_seq = np.array([[start_token_index]])

    decoded_words = []
    for _ in range(max_decode_steps):
        output_tokens, state_h, state_c = decoder_model_inf.predict(
            [target_seq, enc_outs, state_h, state_c], verbose=0
        )
        # [0, -1, :] -> only sample in the batch, most recent time step,
        # probabilities over the whole target vocabulary.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = target_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == 'end':
            break
        # Ids without a vocabulary entry map to '' and are skipped so they
        # do not leave stray spaces in the result.
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the prediction back in as the next decoder input.
        target_seq = np.array([[sampled_token_index]])

    return ' '.join(decoded_words)

In [None]:
# Try the inference loop on a single-word input.
translated_sentence=translate("hello")

In [None]:
# Show the decoded translation from the previous cell.
print(translated_sentence)

In [None]:
# Try a short multi-word sentence; this overwrites the previous translated_sentence.
translated_sentence=translate("have a good day?")

In [None]:
# Show the decoded translation from the previous cell.
print(translated_sentence)