In [None]:
!pip install datasets



In [None]:
# from datasets import load_dataset

# # Available: ['ca-de', 'ca-en', 'ca-hu', 'ca-nl', 'de-en', 'de-eo', 'de-es', 'de-fr',
# # 'de-hu', 'de-it', 'de-nl', 'de-pt', 'de-ru', 'el-en', 'el-es', 'el-fr', 'el-hu',
# # 'en-eo', 'en-es', 'en-fi', 'en-fr', 'en-hu', 'en-it', 'en-nl', 'en-no', 'en-pl',
# # 'en-pt', 'en-ru', 'en-sv', 'eo-es', 'eo-fr', 'eo-hu', 'eo-it', 'eo-pt', 'es-fi',
# # 'es-fr', 'es-hu', 'es-it', 'es-nl', 'es-no', 'es-pt', 'es-ru', 'fi-fr', 'fi-hu',
# # 'fi-no', 'fi-pl', 'fr-hu', 'fr-it', 'fr-nl', 'fr-no', 'fr-pl', 'fr-pt', 'fr-ru',
# # 'fr-sv', 'hu-it', 'hu-nl', 'hu-no', 'hu-pl', 'hu-pt', 'hu-ru', 'it-nl', 'it-pt',
# # 'it-ru', 'it-sv']

# ds = load_dataset("opus_books", "de-en")
# pairs = [(ex["translation"]["en"].lower().strip(), ex["translation"]["de"].lower().strip()) for ex in ds["train"]]

# print("Loaded pairs:", len(pairs))
# for x in pairs[:10]:
#   print(x[0], "\n", x[1], "\n\n")


In [None]:
# ds = load_dataset("wmt14", "de-en", split="train[:5%]")  # small for teaching

# pairs = [(ex["translation"]["en"], ex["translation"]["de"]) for ex in ds]

# print(len(pairs), pairs[:5])

# Download English–German Parallel Corpus

In [None]:
!wget https://www.manythings.org/anki/deu-eng.zip && unzip -o deu-eng.zip


--2025-11-26 10:58:15--  https://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11638759 (11M) [application/zip]
Saving to: ‘deu-eng.zip’


2025-11-26 10:58:18 (5.38 MB/s) - ‘deu-eng.zip’ saved [11638759/11638759]

Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [None]:
#@title: Load Translation Pairs
# Build `pairs`, a list of (english, german) tuples of lower-cased sentences.
# Only the first two tab-separated fields of each line of deu.txt are used;
# any further fields on the line are ignored.
pairs = []
with open("deu.txt", "r", encoding="utf-8") as f:
    # Iterate the file lazily instead of materialising every line with readlines().
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) < 2:
            continue  # blank or malformed line: no translation column
        eng, deu = parts[0].strip().lower(), parts[1].strip().lower()
        if not eng or not deu:
            continue  # an empty source or target sentence is useless for training
        pairs.append((eng, deu))

print("Total sentence pairs:", len(pairs))
print(pairs[:5])


Total sentence pairs: 324282
[('go.', 'geh.'), ('hi.', 'hallo!'), ('hi.', 'grüß gott!'), ('run!', 'lauf!'), ('run.', 'lauf!')]


In [None]:
# Sanity check: number of sentence pairs currently held in `pairs`.
len(pairs)

324282

# Prepare Data

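Add BOS/EOS markers for the decoder: the decoder input is each German sentence prefixed with `<bos>`, and the training target is the same sentence suffixed with `<eos>`, so the target is the decoder input shifted by one token.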

In [None]:
# Split the (english, german) tuples into two parallel lists.
eng_texts, deu_texts = [], []
for src_sentence, tgt_sentence in pairs:
    eng_texts.append(src_sentence)
    deu_texts.append(tgt_sentence)

# Decoder input starts with <bos>; decoder target ends with <eos>.
deu_texts_in = ["<bos> " + sentence for sentence in deu_texts]
deu_texts_out = [sentence + " <eos>" for sentence in deu_texts]

# Tokenization + Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One tokenizer per language; filters='' means no characters are filtered out.
eng_tok, deu_tok = Tokenizer(filters=''), Tokenizer(filters='')

# Build the vocabularies. The German tokenizer is fit on both decoder variants
# so that it has seen every token of the input and of the target texts.
eng_tok.fit_on_texts(eng_texts)
deu_tok.fit_on_texts(deu_texts_in + deu_texts_out)

# Convert every sentence to a list of integer word ids.
eng_seqs = eng_tok.texts_to_sequences(eng_texts)
deu_seqs_in, deu_seqs_out = (
    deu_tok.texts_to_sequences(texts) for texts in (deu_texts_in, deu_texts_out)
)

# Vocabulary sizes: number of known words plus one.
eng_vocab = 1 + len(eng_tok.word_index)
deu_vocab = 1 + len(deu_tok.word_index)

# Longest sequence per language determines the padded width.
max_eng = len(max(eng_seqs, key=len))
max_deu = len(max(deu_seqs_out, key=len))

# Right-pad ("post") every sequence to the per-language maximum length.
X = pad_sequences(eng_seqs, maxlen=max_eng, padding="post")
decoder_in, decoder_out = (
    pad_sequences(seqs, maxlen=max_deu, padding="post")
    for seqs in (deu_seqs_in, deu_seqs_out)
)

X.shape, decoder_in.shape, decoder_out.shape


((324282, 101), (324282, 76), (324282, 76))

# Build Encoder–Decoder with Sequential Models

We use:

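- Bidirectional LSTM encoder (reads the source sentence in both directions; the final forward and backward states are concatenated and used to initialise the decoder)
- Standard LSTM decoder, twice as wide as one encoder direction so that it matches the concatenated states
- Dense softmax output over the German vocabulary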

In [None]:
#@title: Encoder
import tensorflow as tf

# Size of the source embedding vectors and of each LSTM direction's state.
latent_dim = 256

# Bidirectional LSTM encoder. Because the wrapped LSTM uses return_state=True,
# calling the encoder yields five tensors:
# output, forward_h, forward_c, backward_h, backward_c.
encoder = tf.keras.Sequential([
    # mask_zero=True marks id 0 (the value the sequences are padded with) as
    # padding, so the LSTM skips those timesteps and the final states reflect
    # the real sentence instead of a long run of trailing pad tokens.
    tf.keras.layers.Embedding(eng_vocab, latent_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(latent_dim, return_state=True)
    )
], name="encoder")


In [None]:
#@title: Decoder
# Container for the decoder's layers: embedding -> LSTM -> softmax over the
# German vocabulary. The LSTM is latent_dim*2 wide so that it can be
# initialised with the concatenated forward/backward encoder states.
decoder = tf.keras.Sequential([
    # mask_zero=True marks id 0 (the padding value) as masked. Keras propagates
    # the mask through the LSTM and Dense layers to the loss, so padded target
    # positions no longer contribute to (and dominate) the training loss.
    tf.keras.layers.Embedding(deu_vocab, latent_dim*2, mask_zero=True),
    tf.keras.layers.LSTM(latent_dim*2, return_sequences=True, return_state=True),
    tf.keras.layers.Dense(deu_vocab, activation="softmax")
], name="decoder")

In [None]:
#@title: Connect Encoder + Decoder (Functional)
# Wire the encoder and the decoder's individual layers into a single training
# model with the functional API. The model takes [source ids, decoder input ids]
# and outputs a softmax distribution over the German vocabulary per timestep.

# Inputs: padded id sequences of fixed length max_eng / max_deu.
encoder_inputs = tf.keras.Input(shape=(max_eng,))
decoder_inputs = tf.keras.Input(shape=(max_deu,))

# Encoder: the first returned tensor is discarded; only the final hidden (h)
# and cell (c) states of the forward and backward directions are kept.
_, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
# Concatenate both directions so each state is 2*latent_dim wide, which is the
# number of units of the decoder LSTM.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

encoder_states = [state_h, state_c]

# Decoder: the layers of the `decoder` Sequential are applied one by one
# (rather than calling `decoder` itself) so that the LSTM can be given the
# encoder states as its initial state.
dec_emb = decoder.layers[0](decoder_inputs)
decoder_lstm = decoder.layers[1]
decoder_dense = decoder.layers[2]

# The LSTM also returns its final h/c states; they are not needed for training.
dec_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
dec_outputs = decoder_dense(dec_outputs)

# Sparse categorical cross-entropy: targets are integer word ids, not one-hot.
model = tf.keras.Model([encoder_inputs, decoder_inputs], dec_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

model.summary()


In [None]:
# Train with teacher forcing: the model sees the source sentence plus the
# <bos>-prefixed German sentence and is fit against the <eos>-suffixed one.
train_inputs = [X, decoder_in]
train_targets = decoder_out[..., None]  # append a trailing axis of size 1

history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_split=0.1,
    epochs=20,
    batch_size=16,
)


Epoch 1/20
[1m18241/18241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2330s[0m 127ms/step - loss: 0.4275 - val_loss: 0.6429
Epoch 2/20
[1m 6458/18241[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m24:25[0m 124ms/step - loss: 0.1714

In [None]:
# Stand-alone encoder: maps a padded source sequence to the concatenated
# final hidden and cell states (the same tensors that initialise the decoder).
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=encoder_states)