In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

# Load and preprocess the dataset
train = pd.read_csv("engtamilTrain.csv").drop(["Unnamed: 0"], axis=1)
english_sentences = train["en"].head(1000)
tamil_sentences = train["ta"].head(1000)

# Function to add <SOS> and <EOS> tokens to each sentence
def addSosEos(seriesSentence):
    sos_token, eos_token = "<SOS>", "<EOS>"
    return [f"{sos_token} {sentence} {eos_token}" for sentence in seriesSentence]

# Apply the function to add tokens
english_sent_SE = addSosEos(english_sentences)
tamil_sent_SE = addSosEos(tamil_sentences)

# Tokenize the English and Tamil sentences
english_tokenizer = Tokenizer(filters="")
english_tokenizer.fit_on_texts(english_sent_SE)
english_vocab_size = len(english_tokenizer.word_index) + 1
english_sequences = english_tokenizer.texts_to_sequences(english_sent_SE)

tamil_tokenizer = Tokenizer(filters="")
tamil_tokenizer.fit_on_texts(tamil_sent_SE)
tamil_vocab_size = len(tamil_tokenizer.word_index) + 1
tamil_sequences = tamil_tokenizer.texts_to_sequences(tamil_sent_SE)

# Define maximum sequence lengths
max_input_seq_length = 20
max_output_seq_length = 20

# Pad sequences to a fixed length
input_sequences = pad_sequences(english_sequences, maxlen=max_input_seq_length, padding='post')
output_sequences = pad_sequences(tamil_sequences, maxlen=max_output_seq_length, padding='post')

# Prepare decoder input and output sequences for training
decoder_input_sequences = np.zeros_like(output_sequences)
decoder_input_sequences[:, 1:] = output_sequences[:, :-1]
decoder_input_sequences[:, 0] = tamil_tokenizer.word_index.get('<SOS>', 0)

decoder_output_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=tamil_vocab_size)

# Load pre-trained Word2Vec models
eng_model = Word2Vec.load('engmodel.bin')
tam_model = Word2Vec.load('tammodel.bin')

# Function to create embedding matrix from Word2Vec model
def create_embedding_matrix(word2vec_model, tokenizer, vocab_size):
    embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
    for word, i in tokenizer.word_index.items():
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]
    return embedding_matrix

# Create embedding matrices for both languages
eng_embedding_matrix = create_embedding_matrix(eng_model, english_tokenizer, english_vocab_size)
tam_embedding_matrix = create_embedding_matrix(tam_model, tamil_tokenizer, tamil_vocab_size)

# Function to create the Seq2Seq model
def create_seq2seq_model(input_vocab_size, output_vocab_size, input_seq_length, output_seq_length, hidden_units, eng_embedding_matrix, tam_embedding_matrix):
    # Encoder
    encoder_inputs = Input(shape=(input_seq_length,))
    encoder_embedding = Embedding(input_vocab_size, hidden_units, weights=[eng_embedding_matrix], trainable=False)(encoder_inputs)
    encoder_lstm, encoder_state_h, encoder_state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(output_seq_length,))
    decoder_embedding = Embedding(output_vocab_size, hidden_units, weights=[tam_embedding_matrix], trainable=False)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

# Create and compile the Seq2Seq model
model = create_seq2seq_model(english_vocab_size, tamil_vocab_size, max_input_seq_length, max_output_seq_length, 100, eng_embedding_matrix, tam_embedding_matrix)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 32
epochs = 500
model.fit([input_sequences, output_sequences], decoder_output_sequences, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Preprocessing and predicting
input_sentence = "<SOS> Finally, the columnist fails to tell us who among the political leaders of the bourgeoisie, past and present, he counts among the paragons of morality <EOS>"

# Convert the input sentence to sequence and pad it
input_sequence = english_tokenizer.texts_to_sequences([input_sentence])
input_sequence = pad_sequences(input_sequence, maxlen=max_input_seq_length, padding='post')

# Generate predictions
predictions = model.predict([input_sequence, np.zeros((1, max_output_seq_length))])

# Convert predictions to tokens and decode
predicted_tokens = np.argmax(predictions, axis=-1)[0]
tamil_index_word = {i: w for w, i in tamil_tokenizer.word_index.items()}
decoded_sentence = [tamil_index_word.get(token, '<unk>') for token in predicted_tokens if token != 0 and token != tamil_tokenizer.word_index.get('<EOS>', 0)]

# Join the words to form the decoded statement
decoded_statement = ' '.join(decoded_sentence)

# Print the decoded statement
print(decoded_statement)

# Print additional details for debugging
print("Predictions:", predictions)
print("Predicted tokens:", predicted_tokens)
print("Input sequence:", input_sequence)


Epoch 1/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 117ms/step - accuracy: 0.2352 - loss: 9.1269 - val_accuracy: 0.2722 - val_loss: 8.3259
Epoch 2/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.2716 - loss: 7.7006 - val_accuracy: 0.2695 - val_loss: 7.4607
Epoch 3/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.2828 - loss: 6.4871 - val_accuracy: 0.2722 - val_loss: 7.4995
Epoch 4/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.2712 - loss: 6.3620 - val_accuracy: 0.2722 - val_loss: 7.5577
Epoch 5/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.2898 - loss: 6.1533 - val_accuracy: 0.2722 - val_loss: 7.5752
Epoch 6/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 113ms/step - accuracy: 0.2682 - loss: 6.2509 - val_accuracy: 0.2722 - val_loss: 7.5409
Epoch 7/500
[1m25/25