In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the dataset files stored there
# can be read through the regular file system.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd gdrive/My Drive/dataset

/content/gdrive/My Drive/dataset


In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
# Quick look at the English side of the corpus.
# pandas treats a separator longer than one character as a regular expression,
# which only the python parser supports; naming the engine explicitly yields
# the same frame as the automatic fallback but without the ParserWarning.
data1 = pd.read_csv("source_test.txt", sep='   ', header=None,
                    names=['english_sentence'], engine='python')
data1.shape

  data1 = pd.read_csv("source_test.txt",sep='   ',header=None,names=['english_sentence'])


(2507, 1)

In [5]:
# Load the data.
# The multi-space separators are not expected to occur inside a sentence, so
# each line lands whole in the single named column. pandas treats a separator
# longer than one character as a regular expression, which only the python
# parser supports; naming the engine explicitly gives the same frames as the
# automatic fallback but without the ParserWarning.
data1 = pd.read_csv("source_test.txt", sep='   ', header=None,
                    names=['english_sentence'], engine='python')
data2 = pd.read_csv("target_test.txt", sep='    ', header=None,
                    names=['hindi_sentence'], engine='python')

# Separate the English and Hindi sentences, keeping the first 1500 rows of each.
# Row i of one series is paired with row i of the other during training.
eng_sentences = data1['english_sentence'][:1500]
# Wrap every target sentence in the 'start'/'end' markers: generate_translation()
# seeds the decoder with 'start' and stops on 'end', so both words must be part
# of the vocabulary the decoder is trained on.
hin_sentences = 'start ' + data2['hindi_sentence'][:1500] + ' end'

# Tokenize the English and Hindi sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(eng_sentences)
hin_tokenizer = Tokenizer()
hin_tokenizer.fit_on_texts(hin_sentences)

# Vocabulary sizes (+1 because word ids start at 1; id 0 is left for padding)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

# Encode the sentences as lists of word ids
eng_token_ids = eng_tokenizer.texts_to_sequences(eng_sentences)
hin_token_ids = hin_tokenizer.texts_to_sequences(hin_sentences)

# Maximum sentence lengths, measured on the tokenizer's own output.
# Counting whitespace-separated words instead can disagree with the tokenizer
# (it strips punctuation before splitting), and a too-small maxlen makes
# pad_sequences silently drop tokens from the front of the longest sentences.
eng_max_len = max(len(ids) for ids in eng_token_ids)
hin_max_len = max(len(ids) for ids in hin_token_ids)

# Pad every sentence with trailing zeros up to the common length
eng_seq = pad_sequences(eng_token_ids, maxlen=eng_max_len, padding='post')
hin_seq = pad_sequences(hin_token_ids, maxlen=hin_max_len, padding='post')

# One-hot encode the Hindi sentences (used as the training targets)
hin_onehot = to_categorical(hin_seq, num_classes=hin_vocab_size)


  data1 = pd.read_csv("source_test.txt",sep='   ',header=None,names=['english_sentence'])
  data2 = pd.read_csv("target_test.txt",sep='    ',header=None,names=['hindi_sentence'])


In [6]:
# Show the English-sentence DataFrame (long frames are rendered with the
# middle rows elided).
data1

Unnamed: 0,english_sentence
0,A black box in your car?
1,As America's road planners struggle to find th...
2,"The devices, which track every mile a motorist..."
3,The usually dull arena of highway planning has...
4,Libertarians have joined environmental groups ...
...,...
2502,It is noteworthy that both Nita and Isha are p...
2503,250 VIPs have been invited to this royal party.
2504,These include the Jodhpur royal family and Uma...
2505,"L.N. Mittal, Sachin Tendulkar, and Bollywood a..."


In [7]:
# Layer sizes. The decoder LSTM is initialised with the encoder's final states
# and its outputs are compared with the encoder outputs by the attention layer,
# so both LSTMs must share the same number of units.
EMBEDDING_DIM = 256
LSTM_UNITS = 256

# Encoder: embeds the padded English ids (id 0 is masked) and runs an LSTM that
# returns the per-step outputs plus its final hidden and cell states.
encoder_input = Input(shape=(eng_max_len,))
encoder_embedding = Embedding(eng_vocab_size, EMBEDDING_DIM, mask_zero=True)(encoder_input)
encoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: accepts Hindi id sequences of any length and starts from the
# encoder's final states.
# NOTE(review): unlike the encoder embedding, this one does not set mask_zero,
# so padded decoder positions are not masked - confirm this is intended.
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(hin_vocab_size, EMBEDDING_DIM)(decoder_input)
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention: decoder outputs are the queries, encoder outputs the values
attention = Attention()([decoder_output, encoder_output])

# Concatenate the attention output and decoder output along the feature axis
decoder_concat = Concatenate(axis=-1)([decoder_output, attention])

# Output: a softmax over the Hindi vocabulary at every decoder step
decoder_dense = Dense(hin_vocab_size, activation='softmax')
output = decoder_dense(decoder_concat)

# Model: (English ids, Hindi decoder-input ids) -> per-step word probabilities
model = Model([encoder_input, decoder_input], output)


In [None]:
# Configure training: Adam optimiser, categorical cross-entropy against the
# one-hot targets, and accuracy recorded in the history under the key 'acc'.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Teacher forcing: the decoder is fed each Hindi sequence without its last
# position and is asked to predict the same sequence shifted left by one.
teacher_forcing_inputs = hin_seq[:, :-1]
next_token_targets = hin_onehot[:, 1:]

results = model.fit(
    [eng_seq, teacher_forcing_inputs],
    next_token_targets,
    batch_size=64,
    epochs=100,
)


In [None]:

%matplotlib inline
import matplotlib.pyplot as plt
acc = results.history['acc']
# val_acc = results.history['val_accuracy']
loss = results.history['loss']
# val_loss = results.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'g', label='Training accuracy')
# plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training accuracy')
plt.legend(loc=0)
plt.savefig('wordA.png')
plt.figure()
plt.show()

plt.plot(epochs, loss, 'r', label='Training loss')
# plt.plot(epochs, val_loss, 'o', label='Validation loss')
plt.title('Training loss')
plt.legend(loc=0)
plt.savefig('wordL.png')
plt.figure()
plt.show()

In [None]:
# Register the special tokens in the Hindi tokenizer's word index WITHOUT
# renumbering the existing words: the model was trained on the ids the
# tokenizer produced, so shifting them would make every later lookup disagree
# with the trained embedding. Every statement here is idempotent, so the cell
# can be re-run safely.

# Id 0 is the padding value; expose it under the empty string.
hin_tokenizer.word_index[''] = 0

# Keep the id 'start' already has if it is part of the fitted vocabulary;
# otherwise give it the next free id.
# NOTE(review): an id assigned here was never seen in training and lies
# outside the decoder Embedding built earlier - the token has to be present in
# the training sentences for the decoder to learn it.
if 'start' not in hin_tokenizer.word_index:
    hin_tokenizer.word_index['start'] = max(hin_tokenizer.word_index.values()) + 1

# Update the Hindi vocabulary size: largest id + 1, which counts the padding id 0
hin_vocab_size = max(hin_tokenizer.word_index.values()) + 1


In [None]:
# Add the empty string token to the Hindi tokenizer's word index as the name of
# the padding id 0. The existing word ids are deliberately left untouched:
# renumbering them after training would desynchronise the tokenizer from the
# model, and would shift them again on every re-run of this cell.
hin_tokenizer.word_index[''] = 0

# Update the Hindi vocabulary size: largest id + 1, which counts the padding id 0
hin_vocab_size = max(hin_tokenizer.word_index.values()) + 1

def generate_translation(input_sentence):
    """Greedily decode a Hindi translation of one English sentence.

    The sentence is encoded with `eng_tokenizer` and padded to `eng_max_len`.
    The decoder input starts as the 'start' token followed by zeros; at each
    step the whole model is re-run and the most probable word for the next
    position is written into the decoder input.

    Decoding stops after `hin_max_len - 1` words, or earlier when the predicted
    id has no entry in `hin_tokenizer.index_word` (e.g. the padding id 0) or
    maps to the word 'end'.

    Args:
        input_sentence: English sentence as a plain string.

    Returns:
        The predicted Hindi words joined by single spaces (possibly empty).

    Raises:
        KeyError: if 'start' is missing from `hin_tokenizer.word_index`.
    """
    # Convert the input sentence to a padded sequence of integers
    input_sequence = eng_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=eng_max_len, padding='post')

    # Initialize the target sequence with the 'start' token. Word ids are
    # integers, so keep them in an integer array rather than float64.
    target_sequence = np.zeros((1, hin_max_len), dtype='int32')
    target_sequence[0, 0] = hin_tokenizer.word_index['start']

    # Generate the translation one token at a time, collecting the words as
    # they are produced instead of re-reading the sequence in a second pass.
    hin_words = []
    for i in range(1, hin_max_len):
        # Output position i-1 holds the prediction for target position i.
        # verbose=0 keeps Keras from printing a progress bar on every step.
        prediction = model.predict([input_sequence, target_sequence], verbose=0).argmax(axis=2)
        next_id = int(prediction[0, i - 1])
        target_sequence[0, i] = next_id

        # Stop on an id without a word (such as padding) or on the 'end' token
        hin_word = hin_tokenizer.index_word.get(next_id)
        if hin_word is None or hin_word == 'end':
            break
        hin_words.append(hin_word)

    # Convert the sequence of words to a string and return it
    return ' '.join(hin_words)




In [None]:



# Try the trained model on a handful of short English sentences and print
# each one next to the translation it produces.
test_sentences = [
    'I love eating pizza.',
    'The cat is sleeping on the sofa.',
    'What time is it?',
    'I have to go now.',
    'This is a nice surprise!',
]

for english_text in test_sentences:
    hindi_text = generate_translation(english_text)
    print(f'{english_text} => {hindi_text}')