<a href="https://colab.research.google.com/github/itzmevig/2203A51573-NLP/blob/main/NLP_Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Natural Language Processing**
# **Assignment - 7**

In [None]:
!pip install tensorflow numpy



# **(a) Data Preprocessing**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Example data (small English to French pairs)
data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

# Split the pairs into two parallel tuples: English inputs and French outputs
english_sentences, french_sentences = zip(*data)

# Tokenize English sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
eng_word_index = eng_tokenizer.word_index
max_eng_len = max(len(seq) for seq in eng_sequences)

# Wrap every French sentence in explicit 'start' / 'end' markers BEFORE the
# tokenizer is fit. decode_sequence() seeds the decoder with
# fr_word_index['start'] and stops when it predicts 'end', so both markers have
# to be part of the vocabulary the model is actually trained on; adding them
# only after training leaves the model with token ids that mean something else.
# `french_sentences` itself is left untouched so the translation section can
# rebuild exactly the same wrapped sentences (and therefore the same indices).
# NOTE: the two markers enlarge the French vocabulary by two entries; any
# hard-coded vocabulary size elsewhere must be at least `fr_vocab_size`.
# NOTE: 'start' / 'end' are assumed not to occur as real words in the corpus.
french_sentences_with_tokens = ['start ' + sent + ' end' for sent in french_sentences]

# Tokenize French sentences (with the markers included)
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences_with_tokens)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences_with_tokens)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(len(seq) for seq in fr_sequences)

# Pad on the right with 0 so every sequence has the same length
eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

# Vocabulary sizes (+1 because index 0 is reserved for padding)
eng_vocab_size = len(eng_word_index) + 1
fr_vocab_size = len(fr_word_index) + 1


# **(b) Build Seq2Seq Model**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder
latent_dim = 256

# The sequence axis is declared as None (variable length) instead of being
# pinned to the padded training length: the same layers are reused for
# inference, where the decoder is fed one token at a time.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)

# Only the final hidden/cell states are kept; they summarise the source
# sentence and are handed to the decoder as its initial state.
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(fr_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# return_sequences=True: one output per target position.
# return_state=True: the states are ignored while training but are needed
# when this layer is reused by the step-by-step inference decoder.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Per-position probability distribution over the French vocabulary
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq model: (English ids, French ids) -> next-token probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model; the sparse loss takes integer token ids as targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


# **(c) Preparing the Data for Training**

In [None]:
# Decoder targets: the French sequences moved one position to the left, with a
# 0 (padding) column appended on the right, so the target at position t is the
# decoder input token at position t + 1.
fr_padded_target = np.concatenate(
    [fr_padded[:, 1:], np.zeros_like(fr_padded[:, :1])],
    axis=1,
)

# Train/validation split: the first 80% of the rows (in their original order)
# are used for training, the remainder for validation.
train_size = int(0.8 * len(eng_padded))
eng_train, eng_val = np.split(eng_padded, [train_size])
fr_train, fr_val = np.split(fr_padded, [train_size])
fr_target_train, fr_target_val = np.split(fr_padded_target, [train_size])

# **(d) Train the model on the dataset**

In [None]:
# Training hyper-parameters
batch_size = 64
epochs = 100

# The model receives the English ids and the French decoder inputs, and is
# trained to predict the shifted French targets; the held-out rows are scored
# after every epoch.
history = model.fit(
    x=[eng_train, fr_train],
    y=fr_target_train,
    validation_data=([eng_val, fr_val], fr_target_val),
    epochs=epochs,
    batch_size=batch_size,
)


Epoch 1/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step - accuracy: 0.2000 - loss: 2.6371 - val_accuracy: 0.8333 - val_loss: 2.5655
Epoch 2/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 100ms/step - accuracy: 0.6000 - loss: 2.5902 - val_accuracy: 0.8333 - val_loss: 2.4991
Epoch 3/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 142ms/step - accuracy: 0.5333 - loss: 2.5405 - val_accuracy: 0.8333 - val_loss: 2.4193
Epoch 4/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 132ms/step - accuracy: 0.5333 - loss: 2.4825 - val_accuracy: 0.8333 - val_loss: 2.3176
Epoch 5/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 84ms/step - accuracy: 0.5333 - loss: 2.4107 - val_accuracy: 0.8333 - val_loss: 2.1843
Epoch 6/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 143ms/step - accuracy: 0.5333 - loss: 2.3187 - val_accuracy: 0.8333 - val_loss: 2.0079
Epoch 7/100
1/1 ━━━━━━━━━

# **(e) Inference Setup for Translation**

In [None]:
# Inference encoder: maps a source sequence to the LSTM's final [h, c] states
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder: unlike the training graph, its initial state is not wired
# to the encoder but supplied explicitly through two extra inputs, so the
# caller can feed the states returned by one step into the next step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained embedding / LSTM / dense layers (shared weights)
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs,
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# (token ids, h, c) -> (token probabilities, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)


# **(f) Translate New Sentences**

In [None]:
# Wrap each French sentence in 'start' / 'end' marker words
french_sentences_with_tokens = [f"start {sent} end" for sent in french_sentences]

# NOTE(review): this re-fits fr_tokenizer after the model has already been
# built and trained. The word indices produced here only agree with what the
# model learned if the tokenizer used for training was fit on exactly these
# same wrapped sentences -- confirm, otherwise decode_sequence() maps the
# model's predictions to the wrong words.
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences_with_tokens)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences_with_tokens)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(map(len, fr_sequences))

# Right-pad both languages with 0 up to their respective maximum lengths
eng_padded = pad_sequences(eng_sequences, padding='post', maxlen=max_eng_len)
fr_padded = pad_sequences(fr_sequences, padding='post', maxlen=max_fr_len)


In [None]:
def decode_sequence(input_seq):
    """Greedily decode one padded English sequence into a French sentence.

    NOTE(review): a later cell defines `decode_sequence` again; that later
    definition shadows this one once it has been executed.

    Args:
        input_seq: Padded integer token sequence for a single sentence, as
            produced by `pad_sequences` (the test cell passes one row padded
            to `max_eng_len`).

    Returns:
        The decoded words, each followed by a single space ('' if decoding
        stops on the very first step).
    """
    states_value = encoder_model.predict(input_seq)

    # Build the index -> word table once instead of scanning fr_word_index on
    # every decoding step.
    index_to_word = {index: word for word, index in fr_word_index.items()}

    # Start with the 'start' token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last (only) position
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = index_to_word.get(sampled_token_index)

        # An id with no word (e.g. the padding id 0) is treated as the end of
        # the sentence instead of crashing on `None + ' '`.
        if sampled_word is None or sampled_word == 'end':
            break
        # Bound the output by the number of WORDS produced, not by the number
        # of characters accumulated so far.
        if len(decoded_words) > max_fr_len:
            break
        decoded_words.append(sampled_word)

        # Feed the prediction and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(word + ' ' for word in decoded_words)


In [None]:
def decode_sequence(input_seq):
    """Greedily decode one padded English sequence into a French sentence.

    The encoder turns `input_seq` into its final LSTM states; the decoder is
    then run one token at a time, starting from the 'start' token and feeding
    each prediction (and the returned states) back in, until it predicts
    'end', predicts an id that has no word, or has produced more than
    `max_fr_len` words.

    Args:
        input_seq: Padded integer token sequence for a single sentence, as
            produced by `pad_sequences` (the test cell passes one row padded
            to `max_eng_len`).

    Returns:
        The decoded words, each followed by a single space ('' if decoding
        stops on the very first step).
    """
    states_value = encoder_model.predict(input_seq)

    # Invert the vocabulary once; the mapping does not change while decoding,
    # so there is no need to scan fr_word_index again on every step.
    index_to_word = {index: word for word, index in fr_word_index.items()}

    # Start with the 'start' token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    decoded_words = []
    stop_condition = False
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the index of the most likely next word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Retrieve the corresponding word (None when the index has no word,
        # which is the case for the padding index 0)
        sampled_word = index_to_word.get(sampled_token_index)

        if sampled_word is None:
            print(f"Warning: No word found for token index {sampled_token_index}")
            stop_condition = True
        elif sampled_word == 'end' or len(decoded_words) > max_fr_len:
            stop_condition = True
        else:
            decoded_words.append(sampled_word)

        # Update the target sequence to the predicted word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states for the next iteration
        states_value = [h, c]

    # Same format as before: every word is followed by one space
    return ''.join(word + ' ' for word in decoded_words)


In [None]:
# Translate a single sentence: encode it with the English tokenizer, pad it to
# the length used for training, then decode it greedily.
test_sentence = "hello"
test_sequence = pad_sequences(
    eng_tokenizer.texts_to_sequences([test_sentence]),
    maxlen=max_eng_len,
    padding='post',
)

translation = decode_sequence(test_sequence)
print(f"Translation: {translation}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
Translation: 


# **(g) Experimenting with and Improving the Model Using a Larger Dataset and Hyperparameter Tuning**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam

# Hyperparameters
latent_dim = 256     # Size of the LSTM hidden/cell state
embedding_dim = 128  # Size of the embedding vectors

# Derive the vocabulary sizes from the fitted tokenizers instead of hard-coding
# them: an Embedding / softmax layer that is smaller than the largest token id
# in the data fails (or silently misbehaves) as soon as the vocabulary grows.
# +1 because index 0 is reserved for padding.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1

# Define encoder: two stacked LSTMs; only the second one's final states are
# passed on to the decoder.
encoder_inputs = Input(shape=(None,))  # Variable-length sequences of token ids
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True)(encoder_embedding)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_lstm)
encoder_states = [state_h, state_c]

# Define decoder, initialised with the encoder's final states
decoder_inputs = Input(shape=(None,))  # Variable-length sequences of token ids
decoder_embedding = Embedding(input_dim=fr_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_outputs = Dense(fr_vocab_size, activation='softmax')(decoder_lstm)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model with a lower learning rate than Adam's default
opt = Adam(learning_rate=0.0005)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Check the shapes of the training data
# Ensure eng_train, fr_train, and fr_target_train have the appropriate shapes
print(f"eng_train shape: {eng_train.shape}, fr_train shape: {fr_train.shape}, fr_target_train shape: {fr_target_train.shape}")

# Train the model (reuses batch_size from the earlier training cell)
model.fit([eng_train, fr_train], fr_target_train, batch_size=batch_size, epochs=200, validation_data=([eng_val, fr_val], fr_target_val))


eng_train shape: (5, 4), fr_train shape: (5, 3), fr_target_train shape: (5, 3)
Epoch 1/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step - accuracy: 0.1333 - loss: 2.7057 - val_accuracy: 1.0000 - val_loss: 2.6874
Epoch 2/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 701ms/step - accuracy: 0.5333 - loss: 2.6935 - val_accuracy: 0.8333 - val_loss: 2.6733
Epoch 3/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 148ms/step - accuracy: 0.5333 - loss: 2.6808 - val_accuracy: 0.8333 - val_loss: 2.6580
Epoch 4/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 89ms/step - accuracy: 0.5333 - loss: 2.6671 - val_accuracy: 0.8333 - val_loss: 2.6406
Epoch 5/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 140ms/step - accuracy: 0.5333 - loss: 2.6519 - val_accuracy: 0.8333 - val_loss: 2.6205
Epoch 6/200
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 99ms/step - accuracy: 0.5333 - loss: 2.6344 -

<keras.src.callbacks.history.History at 0x7dda5dd22680>