In [1]:
from datasets import load_dataset

# Load the EN/ES parallel corpus (the log below shows the locally cached copy being used).
ds = load_dataset("loresiensis/corpus-en-es")

Using the latest cached version of the dataset since loresiensis/corpus-en-es couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/bupadhayay/.cache/huggingface/datasets/loresiensis___corpus-en-es/default/0.0.0/d9922426b78ec04e8bafde294da862add0431ec7 (last modified on Sat Feb  1 17:12:53 2025).


In [2]:
# Show the available splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['EN', 'ES'],
        num_rows: 9439
    })
    test: Dataset({
        features: ['EN', 'ES'],
        num_rows: 1049
    })
})

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    EncoderDecoderModel,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)

In [4]:
# 1. Load the dataset.
# This is the same corpus already loaded as `ds` in the first cell; `dataset` is the
# name the preprocessing step below works on.
dataset = load_dataset("loresiensis/corpus-en-es")
# The dataset has two splits: "train" and "test" with fields "EN" and "ES"


Using the latest cached version of the dataset since loresiensis/corpus-en-es couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/bupadhayay/.cache/huggingface/datasets/loresiensis___corpus-en-es/default/0.0.0/d9922426b78ec04e8bafde294da862add0431ec7 (last modified on Sat Feb  1 17:12:53 2025).


In [5]:
# A single BERT tokenizer, reused below for both the English sources and the Spanish targets.
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-small")


In [6]:
max_length = 128  # token cap used when tokenizing inputs/labels and stored later on the model config

In [7]:
def preprocess_function(examples):
    """
    Tokenize a batch of EN/ES sentence pairs into encoder inputs and decoder labels.

    Args:
        examples (dict): A batch as passed by `dataset.map(..., batched=True)`,
            with the keys "EN" (source sentences) and "ES" (target sentences).

    Returns:
        The tokenizer output for the English text with an additional "labels"
        entry holding the Spanish token ids, in which every padding position
        is replaced by -100.
    """
    # Tokenize the source texts (English).
    inputs = tokenizer(examples["EN"], padding="max_length", truncation=True, max_length=max_length)
    # Tokenize the target texts (Spanish). `text_target=` is the supported replacement
    # for the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(text_target=examples["ES"], padding="max_length", truncation=True, max_length=max_length)

    # Replace all pad token IDs in the labels by -100 so that they are ignored by the loss.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Tokenize every split in batches; the raw "EN"/"ES" text columns are dropped
# so that only model-ready features remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["EN", "ES"],
)

In [8]:
# Confirm that the text columns were replaced by the tokenized features and labels.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9439
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1049
    })
})

In [9]:
# 4. Build the Encoder-Decoder Model.
# The same BERT-small checkpoint initialises both the encoder and the decoder.
encoder_model_name = decoder_model_name = "prajjwal1/bert-small"

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_model_name,
    decoder_pretrained_model_name_or_path=decoder_model_name,
)


Some weights of BertLMHeadModel were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'ber

In [10]:
# Print the module tree to inspect the assembled architecture.
model

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elem

In [11]:
# The load-time warning above already lists newly initialised `crossattention` weights,
# so the decoder was built with cross-attention; these assignments restate that
# explicitly on the decoder config.
model.decoder.config.is_decoder = True           # Enable decoder mode.
model.decoder.config.add_cross_attention = True    # Add cross-attention layers.

In [13]:
# Sanity check: the id used below as the decoder start token decodes to [CLS].
tokenizer.decode(tokenizer.cls_token_id)

'[CLS]'

In [14]:
# Sanity check: the id used below as the end-of-sequence token decodes to [SEP].
tokenizer.decode(tokenizer.sep_token_id)

'[SEP]'

In [15]:
# Sanity check: the id used below as the padding token decodes to [PAD].
tokenizer.decode(tokenizer.pad_token_id)

'[PAD]'

In [16]:
# Register BERT's special tokens on the seq2seq config: [CLS] starts decoding,
# [SEP] ends a sequence and [PAD] is the padding token.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id  # you can adjust this if needed
model.config.pad_token_id = tokenizer.pad_token_id
# The decoder is the half that produces output tokens, so the top-level vocabulary
# size is taken from the decoder config rather than from the encoder config.
model.config.vocab_size = model.config.decoder.vocab_size

In [17]:
# Defaults stored on the model config; the length cap reuses the tokenization `max_length`.
# NOTE(review): these look like generation settings, and recent transformers releases
# expect such values on `model.generation_config` — confirm before relocating them,
# since the special tokens above are also set on `model.config`.
model.config.max_length = max_length
model.config.no_repeat_ngram_size = 3

In [18]:
# Folder passed to TrainingArguments and later used to save the final model and tokenizer.
output_dir = "./bert-small-translation_folder"

In [19]:
# Training configuration: a single epoch, with evaluation and checkpointing at the
# end of each epoch.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=8,  # can be decreased (e.g. to 2) if needed
    per_device_eval_batch_size=8,  # can be decreased (e.g. to 2) if needed
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy` argument
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",  # change to "wandb" or "tensorboard" if desired
)



In [20]:
# Collator that assembles the tokenized examples into padded batches for the Trainer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)


In [21]:
# Train on the "train" split and report validation loss on the "test" split.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [23]:
# Run the training loop configured above (one epoch, evaluated at the end of the epoch).
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.9812,3.67286




TrainOutput(global_step=148, training_loss=4.041983011606577, metrics={'train_runtime': 46.5879, 'train_samples_per_second': 202.606, 'train_steps_per_second': 3.177, 'total_flos': 217362705584640.0, 'train_loss': 4.041983011606577, 'epoch': 1.0})

In [None]:
# 9. Save the final model and tokenizer.
# Both are written to `output_dir`, the same folder given to TrainingArguments.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [25]:
import torch

def translate(text, max_length=128, num_beams=5):
    """
    Translate input text from English to Spanish using the trained encoder-decoder model.

    Relies on the notebook-level `model` and `tokenizer`.

    Args:
        text (str): The English input text to be translated.
        max_length (int): Token cap applied both when truncating the input and
            when generating the translation.
        num_beams (int): The number of beams for beam search.

    Returns:
        str: The translated Spanish text, with special tokens removed.
    """
    # Inference only: make sure dropout is disabled, whatever mode training left the model in.
    model.eval()

    # Tokenize the input text. A single sentence needs no padding, so it is only
    # truncated; padding it to `max_length` would just add masked positions to encode.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    # Move the tensors to the model's device.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Pass every special token explicitly so generation does not depend on how the
    # model/generation configs happen to be set at call time.
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_start_token_id=tokenizer.cls_token_id,  # [CLS] starts the decoder sequence.
        eos_token_id=tokenizer.sep_token_id,  # [SEP] ends the translation.
        pad_token_id=tokenizer.pad_token_id,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the best hypothesis, dropping [CLS]/[SEP]/[PAD].
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Example usage:
if __name__ == "__main__":
    # Re-assert the special-token ids on the top-level config before generating
    # (no device handling happens here).
    start_id = tokenizer.cls_token_id
    seq2seq_cfg = model.config
    seq2seq_cfg.decoder_start_token_id = start_id
    seq2seq_cfg.bos_token_id = start_id
    seq2seq_cfg.eos_token_id = tokenizer.sep_token_id
    seq2seq_cfg.pad_token_id = tokenizer.pad_token_id

    # Mirror the decoder flags and the start/BOS ids on the decoder's own config.
    decoder_cfg = model.decoder.config
    decoder_cfg.is_decoder = True
    decoder_cfg.add_cross_attention = True
    decoder_cfg.decoder_start_token_id = start_id
    decoder_cfg.bos_token_id = start_id

    # Translate one sample sentence and print the source next to the result.
    input_text = "Hello, what is the weather in New York?"
    translation = translate(input_text)
    print("Input:", input_text)
    print("Translation:", translation)


Input: Hello, what is the weather in New York?
Translation: ¿ es especial? es esperamos.. es que que no se ha haberia. que que se se pueden en la comunitaria de la union europea. estamos?? esta? ¿ ¿ ¿? " " ". " "?? es. ¿ ¿. el crea.? ¿ ¿ a a la comuncia. lo lo lo que que esta de la ue. puestro. si si es es es
