<a href="https://colab.research.google.com/github/harshitkhanna16/Translator-Project/blob/main/Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
import re

# --- Set Project Hyperparameters (FAST SETTINGS) ---

# Size of the "thought vector" (Smaller = faster)
LATENT_DIM = 128

# Use fewer samples (Fewer = faster)
NUM_SAMPLES = 5000

# Training parameters
BATCH_SIZE = 64
EPOCHS = 10  # Reduced from 30 (Fewer = faster)

In [2]:
# --- RUN THIS CELL TO FIX THE ERROR ---

import os

print("Attempting to clear the cached dataset...")
# The -f (force) flag prevents errors if the files don't exist
!rm -f /root/.keras/datasets/spa-eng.zip
!rm -rf /root/.keras/datasets/spa-eng

print("✅ Cache cleared.")
print("Please re-run Cell 2 now.")

Attempting to clear the cached dataset...
✅ Cache cleared.
Please re-run Cell 2 now.


In [3]:
import os
import tensorflow as tf
import zipfile
import urllib.request

# --- Step 1: Define paths and clear cache ---
dataset_dir = "/root/.keras/datasets/"
zip_file_path = os.path.join(dataset_dir, "spa-eng.zip")
extracted_dir_path = os.path.join(dataset_dir, "spa-eng")
file_path = os.path.join(extracted_dir_path, "spa.txt") # This is the file we need

print("Clearing any old, broken files...")
# The -f (force) flag prevents errors if the files don't exist
!rm -f {zip_file_path}
!rm -rf {extracted_dir_path}
os.makedirs(dataset_dir, exist_ok=True) # Ensure the main directory exists
print("Cache cleared.")

# --- Step 2: Manually Download the file ---
url = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip'
print(f"Downloading dataset from {url}...")
try:
    urllib.request.urlretrieve(url, zip_file_path)
    print(f"Downloaded zip to: {zip_file_path}")
except Exception as e:
    print(f"An error occurred during download: {e}")

# --- Step 3: Manually extract the file ---
print("Extracting file...")
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_dir) # Extract to the /datasets/ directory
    print(f"Successfully extracted to: {extracted_dir_path}")
except Exception as e:
    print(f"An error occurred during extraction: {e}")

# --- Step 4: Load the data (The rest of your original cell) ---
# These lists will hold our sentences
input_texts = []    # English sentences (Input)
target_texts = []   # Spanish sentences (Target)

# Read the file
print(f"Reading file from: {file_path}")
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    print(f"Total lines in file: {len(lines)}")

    # Loop through the lines and process them
    # We use the NUM_SAMPLES variable defined in Cell 1
    for line in lines[:min(NUM_SAMPLES, len(lines) - 1)]:
        try:
            # Each line is "English \t Spanish \t Attribution"
            input_text, target_text, _ = line.split('\t')
        except ValueError:
            input_text, target_text = line.split('\t')

        # Add "start" and "end" tokens
        target_text = '\t' + target_text + '\n'

        input_texts.append(input_text)
        target_texts.append(target_text)

    print(f"Loaded {len(input_texts)} sentence pairs.")
    print("--- Example ---")
    print("Input (English):", input_texts[0])
    print("Target (Spanish):", target_texts[0].strip())
    print("\n✅ Cell 2 successfully fixed and executed.")

except FileNotFoundError:
    print("\n--- ❌ ERROR ---")
    print(f"File not found at: {file_path}")
    print("This means the extraction failed. Please check the output above for extraction errors.")
except Exception as e:
    print(f"\nAn error occurred: {e}")

Clearing any old, broken files...
Cache cleared.
Downloading dataset from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip...
Downloaded zip to: /root/.keras/datasets/spa-eng.zip
Extracting file...
Successfully extracted to: /root/.keras/datasets/spa-eng
Reading file from: /root/.keras/datasets/spa-eng/spa.txt
Total lines in file: 118965
Loaded 5000 sentence pairs.
--- Example ---
Input (English): Go.
Target (Spanish): Ve.

✅ Cell 2 successfully fixed and executed.


In [4]:
# --- Tokenize the Input (English) ---
input_tokenizer = Tokenizer(filters='')
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)

# --- Tokenize the Target (Spanish) ---
# Fit the tokenizer BEFORE adding start/end tokens
target_tokenizer = Tokenizer(filters='')
target_tokenizer.fit_on_texts([text.strip() for text in target_texts]) # Fit on stripped text

# Manually add the start and end tokens to the tokenizer's vocabulary
# Check if '\t' and '\n' are already in the vocabulary before adding
if '\t' not in target_tokenizer.word_index:
    target_tokenizer.word_index['\t'] = len(target_tokenizer.word_index) + 1
if '\n' not in target_tokenizer.word_index:
    target_tokenizer.word_index['\n'] = len(target_tokenizer.word_index) + 1

# Now convert texts to sequences AFTER adding tokens
target_sequences = target_tokenizer.texts_to_sequences(target_texts)


# --- Get Vocabulary Sizes ---
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1 # Update size after adding tokens


# --- Get Max Sequence Lengths ---
max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)

print(f"Input Vocab Size: {input_vocab_size}")
print(f"Target Vocab Size: {target_vocab_size}")
print(f"Max English sentence length: {max_encoder_seq_length}")
print(f"Max Spanish sentence length: {max_decoder_seq_length}")

# --- Pad Sequences ---
encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

print("--- Data Shapes ---")
print("Encoder Input Shape:", encoder_input_data.shape)
print("Decoder Input Shape:", decoder_input_data.shape)

Input Vocab Size: 1904
Target Vocab Size: 3778
Max English sentence length: 4
Max Spanish sentence length: 6
--- Data Shapes ---
Encoder Input Shape: (5000, 4)
Decoder Input Shape: (5000, 6)


In [5]:
# --- CRITICAL STEP: "Teacher Forcing" ---
# Create the "target" data, which is the decoder input data shifted by one timestep.

# 1. Create an array of zeros with the same shape
decoder_target_data = np.zeros_like(decoder_input_data)

# 2. Shift the sequences
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# 3. Add the final dimension
decoder_target_data = np.expand_dims(decoder_target_data, -1)

print("Decoder Target Shape (shifted):", decoder_target_data.shape)

Decoder Target Shape (shifted): (5000, 6, 1)


In [6]:
# --- 1. Encoder Layers ---
encoder_embedding_layer = Embedding(input_vocab_size, LATENT_DIM, name='encoder_embedding')
encoder_lstm_layer = LSTM(LATENT_DIM, return_state=True, name='encoder_lstm')

# --- 2. Decoder Layers ---
decoder_embedding_layer = Embedding(target_vocab_size, LATENT_DIM, name='decoder_embedding')
decoder_lstm_layer = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_dense_layer = Dense(target_vocab_size, activation='softmax', name='decoder_dense')

In [7]:
# --- 1. Define ENCODER ---
encoder_inputs = Input(shape=(None,), name='encoder_input')
encoder_embedding = encoder_embedding_layer(encoder_inputs)
_, state_h, state_c = encoder_lstm_layer(encoder_embedding)
encoder_states = [state_h, state_c]

# --- 2. Define DECODER ---
decoder_inputs = Input(shape=(None,), name='decoder_input')
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm_layer(decoder_embedding, initial_state=encoder_states)
decoder_outputs = decoder_dense_layer(decoder_outputs)

# --- 3. Build the final model ---
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("--- Model Built Successfully ---")
model.summary()

--- Model Built Successfully ---


In [8]:
print(f"Starting training for {EPOCHS} epochs...")

# This is where the learning happens!
# It's faster because EPOCHS=10, LATENT_DIM=128, and NUM_SAMPLES=5000
history = model.fit(
    [encoder_input_data, decoder_input_data],  # Our two inputs
    decoder_target_data,                       # Our one output
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

print("--- Training Complete ---")

Starting training for 10 epochs...
Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.9058 - loss: 4.3687 - val_accuracy: 0.9348 - val_loss: 0.6004
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9789 - loss: 0.2060 - val_accuracy: 0.9348 - val_loss: 0.5437
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9778 - loss: 0.1878 - val_accuracy: 0.9348 - val_loss: 0.5118
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9798 - loss: 0.1621 - val_accuracy: 0.9348 - val_loss: 0.4926
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.9796 - loss: 0.1538 - val_accuracy: 0.9348 - val_loss: 0.4795
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.9787 - loss: 0.1541 - val_accuracy: 0.9348 - val_loss: 0.4716

In [9]:
# --- 1. The Encoder Model ---
encoder_model = Model(encoder_inputs, encoder_states)

# --- 2. The Decoder Model ---
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='decoder_state_h')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='decoder_state_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inf = decoder_embedding_layer(decoder_inputs)

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_layer(
    decoder_embedding_inf, initial_state=decoder_states_inputs
)
decoder_states_inf = [state_h_inf, state_c_inf]

decoder_outputs_inf = decoder_dense_layer(decoder_outputs_inf)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,      # Inputs: word + old states
    [decoder_outputs_inf] + decoder_states_inf     # Outputs: word_prediction + new states
)

print("--- Inference Models Built Successfully ---")

--- Inference Models Built Successfully ---


In [10]:
# --- NEW, FIXED CELL 9 ---

# Create reverse-lookup dictionaries to turn numbers back into words
reverse_input_word_index = {v: k for k, v in input_tokenizer.word_index.items()}
reverse_target_word_index = {v: k for k, v in target_tokenizer.word_index.items()}

# Get the token IDs for our [START] and [END] tokens
start_token_id = target_tokenizer.word_index['\t']
end_token_id = target_tokenizer.word_index['\n']

def translate_sentence(input_seq):
    # 1. ENCODE the input sentence
    states_value = encoder_model.predict(input_seq, verbose=0)

    # 2. START the decoder
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token_id

    decoded_sentence = ''

    stop_condition = False
    while not stop_condition:
        # 3. PREDICT the next word
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # 4. GET the most likely word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, '')

        # 5. --- THIS IS THE FIX ---
        # Stop if we predict [END], the PADDING token (index 0),
        # or the sentence gets too long.
        if (sampled_word == '\n' or
            sampled_token_index == 0 or
            len(decoded_sentence.split()) > max_decoder_seq_length):
            stop_condition = True
        # --- END OF FIX ---
        else:
             decoded_sentence += ' ' + sampled_word

        # 6. UPDATE for the next loop
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()