In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [3]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z' ]", "", text)  # Remove non-alphabetic characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespaces
    return text
def preprocess_sentences(sentences):
    return [clean_text(sentence) for sentence in sentences]

# Load and preprocess data


In [4]:
def load_data(file_path):
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        raw_lines = file.readlines()
    # Filter blank lines and strip text
    raw_lines = [line.strip() for line in raw_lines if line.strip()]
    # Split English and French sentences
    english_sentences = raw_lines[0::3]
    french_sentences = raw_lines[1::3]
    english_sentences = preprocess_sentences(english_sentences)
    french_sentences = preprocess_sentences(french_sentences)
    return english_sentences, french_sentences


In [5]:

# Tokenize text
def tokenize_sentences(sentences, max_vocab_size):
    tokenizer = Tokenizer(num_words=max_vocab_size, filters='')
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    word_index = tokenizer.word_index
    return sequences, word_index, tokenizer

# One-hot encode sentences
def one_hot_encode(sequence, vocab_size):
    return tf.keras.utils.to_categorical(sequence, num_classes=vocab_size)

# Load dataset
path_translation = "./data/opus-2019-12-04.test.txt"
english_sentences, french_sentences = load_data(path_translation)




In [6]:
# Tokenize and preprocess
max_vocab_en = 5000  # Limit vocabulary size
max_vocab_fr = 5000
english_sequences, word_index_en, tokenizer_en = tokenize_sentences(english_sentences, max_vocab_en)
french_sequences, word_index_fr, tokenizer_fr = tokenize_sentences(french_sentences, max_vocab_fr)

# Determine sequence lengths and pad sequences
max_len_en = max(len(seq) for seq in english_sequences)
max_len_fr = max(len(seq) for seq in french_sequences)
max_len_fr = max(max_len_en, max_len_fr)
# Ensure input and target sequences have the same length
english_sequences = pad_sequences(english_sequences, maxlen=max_len_fr, padding='post')
french_sequences = pad_sequences(french_sequences, maxlen=max_len_fr, padding='post')

# Define simple RNN model
class SimpleRNN(tf.keras.Model):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super(SimpleRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(input_dim=input_dim, output_dim=hidden_dim)
        self.rnn = tf.keras.layers.SimpleRNN(units=hidden_dim, return_sequences=True)
        self.dense = tf.keras.layers.Dense(output_dim, activation='softmax')

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.rnn(x)
        output = self.dense(x)
        return output

# Hyperparameters
input_dim = len(word_index_en) + 1  # Vocabulary size of English
output_dim = len(word_index_fr) + 1  # Vocabulary size of French
hidden_dim = 128
batch_size = 32
epochs = 10

# Create model
model = SimpleRNN(input_dim, output_dim, hidden_dim)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare data for training
input_data = np.array(english_sequences)
target_data = np.array(french_sequences)  # No need to shift, it's already padded to the same length

# Train model
model.fit(input_data, target_data, batch_size=batch_size, epochs=epochs)

# Testing and prediction
def predict_translation(sentence, tokenizer, model, max_len):
    sequence = tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model(sequence)
    predicted_sequence = np.argmax(prediction, axis=-1)
    reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}
    return " ".join([reverse_word_index.get(idx, '') for idx in predicted_sequence[0]])

# Example usage
test_sentence = "Hello"
translation = predict_translation(test_sentence, tokenizer_en, model, max_len_fr)
print(f"Translation: {translation}")


Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 413ms/step - accuracy: 0.7910 - loss: 3.7035
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 415ms/step - accuracy: 0.8511 - loss: 1.0733
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 406ms/step - accuracy: 0.8572 - loss: 0.9942
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 412ms/step - accuracy: 0.8638 - loss: 0.9371
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 417ms/step - accuracy: 0.8655 - loss: 0.9075
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 411ms/step - accuracy: 0.8681 - loss: 0.8759
Epoch 7/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 387ms/step - accuracy: 0.8715 - loss: 0.8373
Epoch 8/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 423ms/step - accuracy: 0.8748 - loss: 0.7979
Epoch 9/10
[1m1

In [7]:
# Save the trained model to a file
model.save('simple_rnn_translation_model.h5')

