In [17]:
# 2025/7/3
# zhangzhong

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint name shared by the tokenizer and the seq2seq model.
model_name = "bigscience/mt0-large"

# Tokenizer and model weights are both resolved from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# WMT16 Romanian-English parallel corpus (train / validation / test splits).
dataset = load_dataset("wmt16", "ro-en")

In [20]:
# Display the loaded DatasetDict: its splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 610320
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
})

In [3]:
# The dataset contains 610,320 training, 1,999 validation and 1,999 test
# examples of Romanian-English translation pairs.
# Each example has the structure:
# {
#     'translation': {
#         'ro': 'Romanian text here',
#         'en': 'English text here'
#     }
# }

# Peek at the first training record.
dataset["train"][0]


{'translation': {'en': 'Membership of Parliament: see Minutes',
  'ro': 'Componenţa Parlamentului: a se vedea procesul-verbal'}}

In [7]:
# Slice the rows first, then pick the column: dataset["train"][:10] only reads
# the first ten rows, whereas dataset["train"]["translation"][:10] would load
# the entire 610,320-row column just to keep ten entries.
inputs = [pair["en"] for pair in dataset["train"][:10]["translation"]]
print(len(inputs))
print(inputs)

10
['Membership of Parliament: see Minutes', 'Approval of Minutes of previous sitting: see Minutes', 'Membership of Parliament: see Minutes', 'Verification of credentials: see Minutes', 'Documents received: see Minutes', 'Written statements and oral questions (tabling): see Minutes', 'Petitions: see Minutes', 'Texts of agreements forwarded by the Council: see Minutes', "Action taken on Parliament's resolutions: see Minutes", 'Agenda for next sitting: see Minutes']


In [22]:
# Print the first five Romanian/English pairs of the training split.
separator = "-" * 50
for i in range(5):
    example = dataset["train"][i]
    pair = example["translation"]
    # A single print with a newline separator emits the same four lines.
    print(
        f"Example {i+1}:",
        f"Romanian: {pair['ro']}",
        f"English: {pair['en']}",
        separator,
        sep="\n",
    )

Example 1:
Romanian: Componenţa Parlamentului: a se vedea procesul-verbal
English: Membership of Parliament: see Minutes
--------------------------------------------------
Example 2:
Romanian: Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal
English: Approval of Minutes of previous sitting: see Minutes
--------------------------------------------------
Example 3:
Romanian: Componenţa Parlamentului: a se vedea procesul-verbal
English: Membership of Parliament: see Minutes
--------------------------------------------------
Example 4:
Romanian: Verificarea prerogativelor: a se vedea procesul-verbal
English: Verification of credentials: see Minutes
--------------------------------------------------
Example 5:
Romanian: Depunere de documente: a se vedea procesul-verbal
English: Documents received: see Minutes
--------------------------------------------------


In [23]:
# Print the first three Romanian/English pairs of the validation split.
print("Validation examples:")
separator = "-" * 50
for i in range(3):
    example = dataset["validation"][i]
    pair = example["translation"]
    # A single print with a newline separator emits the same four lines.
    print(
        f"Val Example {i+1}:",
        f"Romanian: {pair['ro']}",
        f"English: {pair['en']}",
        separator,
        sep="\n",
    )

Validation examples:
Val Example 1:
Romanian: Fostul șef al cabinetului prezidențial brazilian este adus în fața instanței
English: Brazil's Former Presidential Chief-of-Staff to Stand Trial
--------------------------------------------------
Val Example 2:
Romanian: Marți, un judecător federal a acceptat acuzațiile aduse împotriva fostului șef al cabinetului prezidențial brazilian pentru presupusa implicare a acestuia într-o schemă masivă de corupție privind compania petrolieră de stat Petrobras.
English: A federal judge on Tuesday accepted the charges filed against Brazil's former presidential chief of staff for his alleged involvement in a massive corruption scheme at state-owned oil company Petrobras.
--------------------------------------------------
Val Example 3:
Romanian: Biroul procurorului federal a declarat că Jose Dirceu va fi trimis în judecată pentru acuzațiile de corupție, înșelătorie și spălare de bani aduse în această lună.
English: The federal prosecutor's office said 

In [24]:
# Spot-check three randomly chosen training pairs.
# NOTE: no seed is set, so a different trio is printed on every run.
import random

print("Random training examples:")
train_size = len(dataset["train"])
random_indices = random.sample(range(train_size), 3)

separator = "-" * 50
for i, idx in enumerate(random_indices):
    example = dataset["train"][idx]
    pair = example["translation"]
    # The header shows both the 1-based position and the dataset row index.
    print(
        f"Random Example {i+1} (index {idx}):",
        f"Romanian: {pair['ro']}",
        f"English: {pair['en']}",
        separator,
        sep="\n",
    )

Random training examples:
Random Example 1 (index 506517):
Romanian: Între 4 şi 6 ianuarie, Croaţia a găzduit slalomul premiat cu Trofeul Regina Zăpezii.
English: Croatia hosted the Snow Queen Trophy Slalom from January 4th to 6th.
--------------------------------------------------
Random Example 2 (index 40231):
Romanian: Transparenţa trebuie să se aplice la fiecare nivel administrativ în ceea ce priveşte documentele.
English: Transparency must apply at each administrative level with regard to documents.
--------------------------------------------------
Random Example 3 (index 381104):
Romanian: Avem două opţiuni.
English: We have two options.
--------------------------------------------------


In [25]:
# Dataset statistics: size of every split plus the overall total.
split_sizes = {name: len(dataset[name]) for name in ("train", "validation", "test")}

print("Dataset Statistics:")
print(f"Training examples: {split_sizes['train']:,}")
print(f"Validation examples: {split_sizes['validation']:,}")
print(f"Test examples: {split_sizes['test']:,}")
print(f"Total examples: {sum(split_sizes.values()):,}")

# Rough sentence lengths (whitespace-separated words) over the first ten
# training pairs only; this is a quick look, not a corpus-wide statistic.
sample_examples = [dataset["train"][row] for row in range(10)]
sample_pairs = [ex["translation"] for ex in sample_examples]
ro_lengths = [len(pair["ro"].split()) for pair in sample_pairs]
en_lengths = [len(pair["en"].split()) for pair in sample_pairs]

ro_avg = sum(ro_lengths) / len(ro_lengths)
en_avg = sum(en_lengths) / len(en_lengths)

print("\nSample sentence lengths (first 10 examples):")
print(f"Romanian avg length: {ro_avg:.1f} words")
print(f"English avg length: {en_avg:.1f} words")

Dataset Statistics:
Training examples: 610,320
Validation examples: 1,999
Test examples: 1,999
Total examples: 614,318

Sample sentence lengths (first 10 examples):
Romanian avg length: 7.6 words
English avg length: 6.0 words


In [26]:
# we need to preprocess the dataset and tokenize the dataset to be trained

def preprocess_function(examples):
    """Tokenize English sources and Romanian targets for seq2seq training.

    Args:
        examples: Either a column-oriented batch (a mapping whose
            ``"translation"`` entry is a list of ``{"en": ..., "ro": ...}``
            dicts, which is what ``Dataset.map(batched=True)`` passes in) or
            an iterable of row dicts shaped
            ``{"translation": {"en": ..., "ro": ...}}`` (a ``Dataset`` or a
            list of rows).

    Returns:
        A ``(model_inputs, labels)`` tuple: the tokenizer output for the
        English source texts and for the Romanian target texts respectively.
        Both are truncated to 128 tokens and left unpadded.
    """
    # Local import keeps this cell runnable on its own.
    from collections.abc import Mapping

    if isinstance(examples, Mapping):
        # Iterating a mapping yields its keys (plain strings), so a batched
        # input has to be read through its "translation" column instead.
        pairs = examples["translation"]
    else:
        pairs = [ex["translation"] for ex in examples]

    inputs = [pair["en"] for pair in pairs]
    targets = [pair["ro"] for pair in pairs]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding=False,  # no padding here; pad per batch later (e.g. in a data collator)
    )

    # `text_target=` tokenizes the texts as targets. It replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=128,
        truncation=True,
        padding=False,  # no padding here either
    )

    return model_inputs, labels


In [None]:
# Structure returned by tokenizer(inputs)
# model_inputs = {
#     "input_ids": [
#         [259, 899, 1, 0, 0, ...],  # token IDs for "Hello world" (shown padded to length 128; with padding=False the trailing 0s are absent)
#         [876, 432, 1, 0, 0, ...]   # token IDs for "Good morning"
#     ],
#     "attention_mask": [
#         [1, 1, 1, 0, 0, ...],      # 1 for real content, 0 for padding
#         [1, 1, 1, 0, 0, ...]
#     ]
# }

# Structure returned by tokenizer(targets)
# labels = {
#     "input_ids": [
#         [385, 765, 1, 0, 0, ...],  # token IDs for "Salut lume"
#         [654, 231, 1, 0, 0, ...]   # token IDs for "Bună dimineața"
#     ],
#     "attention_mask": [
#         [1, 1, 1, 0, 0, ...],
#         [1, 1, 1, 0, 0, ...]
#     ]
# }
#
# Surprisingly, only input_ids seems to be accessible...

In [27]:
# Smoke-test preprocess_function on a handful of rows only. Tokenizing all
# 610,320 training pairs in one eager call is slow and memory-hungry, and the
# full splits are tokenized through `dataset.map(...)` in a later cell.
SMOKE_TEST_ROWS = 8
sample_rows = [dataset['train'][i] for i in range(SMOKE_TEST_ROWS)]
model_inputs, labels = preprocess_function(sample_rows)

# Expect one tokenized source and one tokenized target per sampled row.
assert len(model_inputs["input_ids"]) == len(labels["input_ids"]) == SMOKE_TEST_ROWS

print("hello")



hello


In [None]:
# List the columns of the raw training split; this same list is passed as
# `remove_columns` when the dataset is tokenized with `dataset.map`.
dataset["train"].column_names



['translation']

In [None]:


def tokenize_batch(batch):
    """Adapt preprocess_function to what ``Dataset.map(batched=True)`` expects.

    ``map`` passes a column-oriented batch (``batch["translation"]`` is a list
    of ``{"en": ..., "ro": ...}`` dicts) and expects one dict of new columns
    back, whereas preprocess_function returns a ``(model_inputs, labels)``
    tuple.
    """
    # Re-shape the batch into row dicts, the form preprocess_function iterates over.
    rows = [{"translation": pair} for pair in batch["translation"]]
    model_inputs, labels = preprocess_function(rows)
    # Store the target token IDs as the "labels" column.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Without `remove_columns`, the dataset would have BOTH the old and new columns:
# {
#     'translation': {  # Original column still exists
#         'ro': 'Rezoluția se referă la o problemă importantă.',
#         'en': 'The resolution refers to an important problem.'
#     },
#     'input_ids': [259, 899, 1, ...],      # New tokenized data
#     'attention_mask': [1, 1, 1, ...],     # New tokenized data
#     'labels': [385, 765, 1, ...]          # New tokenized data
# }
# If it is not removed, the processed dataset still carries the 'translation'
# column shown above. Setting `remove_columns` drops it, so only the new
# tokenized columns remain:
# {
#     'input_ids': [259, 899, 1, ...],      # Tokenized input
#     'attention_mask': [1, 1, 1, ...],     # Attention mask
#     'labels': [385, 765, 1, ...]          # Tokenized targets
# }
tokenized_datasets = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Running tokenizer on dataset",
    load_from_cache_file=True,
)