В этом движке ограничение по длине перевода

In [1]:
!pip install torch transformers sentencepiece protobuf sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2

In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

class MultilingualTranslator:
    def __init__(self, model_name="t5-base"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)

    def translate(self, text, source_lang, target_lang):
        """Translate text from source language to target language"""
        # Make sure the source and target languages are supported
        supported_lang = ["English", "French", "German", "Spanish"]
        if source_lang not in supported_lang:
            raise ValueError(f"Unsupported source language: {source_lang}")
        if target_lang not in supported_lang:
            raise ValueError(f"Unsupported target language: {target_lang}")
        # Prepare the input text
        task_prefix = f"translate {source_lang} to {target_lang}"
        input_text = f"{task_prefix}: {text}"
        # Tokenize and generate translation
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        inputs = inputs.to(self.device)
        outputs = self.model.generate(**inputs, max_length=512, num_beams=4,
                                      length_penalty=0.6, early_stopping=True)
        # Decode and return translation
        translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translation

en_text = "Hello, how are you today?"
es_text = "¿Cómo estás hoy?"
translator = MultilingualTranslator("t5-base")

translation = translator.translate(en_text, "English", "French")
print(f"English: {en_text}")
print(f"French: {translation}")
print()

translation = translator.translate(en_text, "English", "German")
print(f"English: {en_text}")
print(f"German: {translation}")
print()

translation = translator.translate(es_text, "Spanish", "English")
print(f"Spanish: {es_text}")
print(f"English: {translation}")

Using device: cpu


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

English: Hello, how are you today?
French: Bonjour, comment vous êtes-vous aujourd'hui?

English: Hello, how are you today?
German: Hallo, wie sind Sie heute?

Spanish: ¿Cómo estás hoy?
English: Cómo estás hoy?


In [5]:
def translate(self, text, source_lang, target_lang):
    """Translate text and report the beam search scores"""
    supported_lang = ["English", "French", "German", "Spanish"]
    if source_lang not in supported_lang:
        raise ValueError(f"Unsupported source language: {source_lang}")
    if target_lang not in supported_lang:
        raise ValueError(f"Unsupported target language: {target_lang}")

    # Prepare the input text
    task_prefix = f"translate {source_lang} to {target_lang}"
    input_text = f"{task_prefix}: {text}"
    # Tokenize and generate translation
    inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(self.device)
    with torch.no_grad():
        outputs = self.model.generate(**inputs, max_length=512, num_beams=4*4, num_beam_groups=4,
                                      num_return_sequences=4, diversity_penalty=0.8,
                                      length_penalty=0.6, early_stopping=True,
                                      output_scores=True, return_dict_in_generate=True)
    # Decode and return translation
    translation = [self.tokenizer.decode(output, skip_special_tokens=True)
                    for output in outputs.sequences]
    return {
        "translation": translation,
        "score": [float(score) for score in outputs.sequences_scores],
    }

In [7]:
original_text = "This is an important message that needs accurate translation."
translator = MultilingualTranslator("t5-base")
output = translator.translate(original_text, "English", "French")
print(f"English: {original_text}")
print("French:")
for text, score in zip(output["translation"], output["score"]):
    print(f"- (score: {score:.2f}) {text}")

Using device: cpu
English: This is an important message that needs accurate translation.
French: Il s'agit d'un message important qui a besoin d'une traduction précise.


In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

input_text = "translate English to French: " + original_text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

outputs = model.generate(
    input_ids,
    num_return_sequences=3,  # Get 3 possible translations
    num_beams=5,  # Beam search width
    early_stopping=True,
    return_dict_in_generate=True,
    output_scores=True
)

print(f"English: {original_text}")
print("French:")
for i, output in enumerate(outputs.sequences):
    translation = tokenizer.decode(output, skip_special_tokens=True)
    print(f"- Option {i+1}: {translation}")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


English: This is an important message that needs accurate translation.
French:
- Option 1: Il s'agit d'un message important qui a besoin d'une traduc
- Option 2: Il s'agit d'un message important qui doit être traduit avec précision.
- Option 3: Il s'agit d'un message important qui a besoin d'être tradui


In [10]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import numpy as np

# Initialize model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Your input text
original_text = "This is an important message that needs accurate translation."

# Prepare inputs (tokenize the text)
input_text = "translate English to French: " + original_text  # T5 requires translation prefix
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate translations with beam search
outputs = model.generate(
    **inputs,
    max_length=512,
    num_beams=4*4,
    num_beam_groups=4,
    num_return_sequences=4,
    diversity_penalty=0.8,
    length_penalty=0.6,
    early_stopping=True,
    output_scores=True,
    return_dict_in_generate=True
)

# Compute transition scores
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=True
)

# Print results
print(f"English: {original_text}")
print("French translations:")
for idx, (out_tok, out_score) in enumerate(zip(outputs.sequences, transition_scores)):
    translation = tokenizer.decode(out_tok, skip_special_tokens=True)
    print(f"\nTranslation {idx+1}: {translation}")
    print("Token | Token string   | Logits  | Probability")
    for tok, score in zip(out_tok[1:], out_score):
        print(f"| {tok:5d} | {tokenizer.decode(tok):14s} | {score.numpy():.4f} | {np.exp(score.numpy()):.2%}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


English: This is an important message that needs accurate translation.
French translations:

Translation 1: Il s'agit d'un message important qui a besoin d'une traduction précise.
Token | Token string   | Logits  | Probability
|   802 | Il             | -0.7575 | 46.88%
|     3 |                | -0.0128 | 98.73%
|     7 | s              | -0.0068 | 99.32%
|    31 | '              | -0.3294 | 71.93%
|  5356 | agit           | -0.0033 | 99.67%
|     3 |                | -0.3863 | 67.96%
|    26 | d              | -0.0108 | 98.93%
|    31 | '              | -0.0005 | 99.95%
|   202 | un             | -0.0152 | 98.49%
|  1569 | message        | -0.0295 | 97.09%
|   359 | important      | -0.0227 | 97.75%
|   285 | qui            | -0.4194 | 65.74%
|     3 |                | -0.9924 | 37.07%
|     9 | a              | -0.1236 | 88.38%
|  6350 | besoin         | -0.0114 | 98.87%
|     3 |                | -0.1201 | 88.68%
|    26 | d              | -0.0006 | 99.94%
|    31 | '              

In [12]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import sacrebleu

model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

input_text = "translate English to French: " + sample_document
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)

outputs = model.generate(
    **inputs,
    max_length=512,
    num_beams=5,
    num_return_sequences=3,  # Get 3 different translations
    early_stopping=True,
    output_scores=True,
    return_dict_in_generate=True
)

print(f"English: {sample_document}")
print("French Translations:")
for i, seq in enumerate(outputs.sequences):
    translation = tokenizer.decode(seq, skip_special_tokens=True)
    bleu = sacrebleu.corpus_bleu([translation], [[reference_translation]])
    print(f"\n- Translation {i+1}:")
    print(f"  {translation}")
    print(f"  BLEU Score: {bleu.score:.2f}")

English: 
Machine translation has evolved significantly over the years. Early systems used
rule-based approaches that defined grammatical rules for languages.  Statistical
machine translation later emerged, using large corpora of translated texts to learn
translation patterns automatically.

French Translations:

- Translation 1:
  La traduction automatique a beaucoup évolué au fil des ans. Les premiers systèmes utilisaient des approches fondées sur des règles qui définissaient des règles grammaticales pour les langues. Plus tard, la traduction automatique statistique a vu le jour, en utilisant de vastes corpus de textes traduits pour apprendre automatiquement les schémas de traduction.
  BLEU Score: 47.94

- Translation 2:
  La traduction automatique a beaucoup évolué au fil des ans. Les premiers systèmes utilisaient des approches fondées sur des règles qui définissaient des règles grammaticales pour les langues. Plus tard, la traduction automatique statistique a vu le jour, utilisant