# **Installing dependencies**

In [None]:
!pip install datasets transformers onnx

Installing collected packages: xxhash, onnx, dill, multiprocess, huggingface-hub, datasets
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
Successfully installed datasets-2.19.1 dill-0.3.8 huggingface-hub-0.23.0 multiprocess-0.70.16 onnx-1.16.0 xxhash-3.4.1


# **Importing dependencies**

In [None]:
from transformers import MarianMTModel, MarianTokenizer, MarianConfig, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.parallel import DataParallel
import torch
from datasets import load_dataset
from tqdm import tqdm

# **Loading Pretrained Model and Tokenizer**

In [None]:
# Hand-written Marian configuration: vocabulary size, model width, number of
# encoder/decoder layers and attention heads, feed-forward size and padding id.
# NOTE(review): the following cell reassigns `config` with
# MarianConfig.from_pretrained(...), so when the cells run in order these values
# are discarded before any model is built — confirm whether this cell is still needed.
config = MarianConfig(
    vocab_size=30000,
    d_model=512,
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    encoder_ffn_dim=2048,
    decoder_ffn_dim=2048,
    pad_token_id=0
)

In [None]:
# Rebind `config` to the configuration published with the
# "Helsinki-NLP/opus-mt-en-ru" checkpoint.
config = MarianConfig.from_pretrained("Helsinki-NLP/opus-mt-en-ru")



In [None]:
# Build the model from `config` and fetch the tokenizer of the
# "Helsinki-NLP/opus-mt-en-ru" checkpoint.
# NOTE(review): only the tokenizer is created with from_pretrained; the model is
# constructed from the config object, which presumably means it does not start from
# the checkpoint's pretrained weights — confirm this is intended.
model = MarianMTModel(config)
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")



I used this code to load the partially trained model.

In [None]:
from google.colab import drive
from pathlib import Path

# Make Google Drive visible inside the Colab runtime.
drive.mount('/content/drive')

# Drive folder holding the previously saved checkpoint, and the two
# sub-directories inside it (model weights and tokenizer files).
folder_path = "/content/drive/MyDrive/Text2Text"
model_path = Path(folder_path).joinpath("translation_model")
tokenizer_path = Path(folder_path).joinpath("translation_tokenizer")

# Rebind `model` and `tokenizer` to the versions stored on Drive.
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(tokenizer_path)

Mounted at /content/drive




# **Loading and Preprocessing the Dataset**

In [None]:
# Download the English-Russian configuration of OPUS-100 from the Hugging Face Hub.
dataset = load_dataset("Helsinki-NLP/opus-100", "en-ru")

# Pull the three splits out of the returned mapping in one statement.
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "test", "validation")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/124M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Creating the Translation Dataset

In [None]:
class TranslationDataset(Dataset):
    '''
    Map-style dataset that tokenizes English -> Russian pairs on access.

    Every element of ``dataset`` must be indexable as
    ``example["translation"]["en"]`` and ``example["translation"]["ru"]``.
    '''

    def __init__(self, dataset, tokenizer=None, max_length=256):
        '''
        Store the raw examples and the tokenization settings.

        Args:
            dataset: Indexable, sized collection of translation examples.
            tokenizer: Tokenizer callable accepting ``text_target=``. When ``None``
                (the default) the notebook-level ``tokenizer`` is looked up each
                time an item is requested.
            max_length: Length every source/target sequence is padded or
                truncated to.
        '''
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        ''' Return the number of examples. '''
        return len(self.dataset)

    def __getitem__(self, idx):
        '''
        Tokenize the pair at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (English
            source) and ``labels`` (Russian target, padding replaced by -100).
        '''
        example = self.dataset[idx]
        source_text = example["translation"]["en"]  # source language: English
        target_text = example["translation"]["ru"]  # target language: Russian

        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        # One call encodes both sides; `text_target` makes the tokenizer treat the
        # Russian sentence as target-language text. The previous code passed it
        # through `prepare_seq2seq_batch` as if it were source text.
        encoded = tok(
            source_text,
            text_target=target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop the leading batch dimension of size 1.
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        labels = encoded["labels"].squeeze(0)

        # Padding positions must not contribute to the loss: -100 is the index
        # ignored by the cross-entropy loss.
        pad_id = tok.pad_token_id
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, -100)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Wrap the raw Hugging Face splits in TranslationDataset objects. The splits are
# read from `dataset[...]` rather than from the names being reassigned, so this
# cell can be re-run without wrapping an already-wrapped TranslationDataset.
train_dataset = TranslationDataset(dataset["train"])  # train ..
test_dataset = TranslationDataset(dataset["test"])  # .. test ..
val_dataset = TranslationDataset(dataset["validation"])  # .. and validation

# **Creating Data Loaders**

In [None]:
# Wrap each split in a DataLoader; only the training split is shuffled.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=8,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=8,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=8,
)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **Training the Model**

In [None]:
# Training the model
num_epochs = 1  # number of passes over the training set
learning_rate = 1e-5  # optimizer step size

# Wrap for multi-GPU use only once: re-running this cell must not nest
# DataParallel wrappers (later cells access the network through `model.module`).
if not isinstance(model, DataParallel):
    model = DataParallel(model)
model = model.to(device)
model.train()  # switch to train mode

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are spelled out to mirror the defaults of the transformers
# implementation, so the update rule stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)

for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch: {epoch+1}/{num_epochs}")

    # Count batches explicitly: tqdm only refreshes its `n` attribute
    # periodically, so it is not a reliable divisor for the running average.
    for num_batches, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()  # clear gradients from the previous step

        # Move the batch to the training device.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # With several GPUs DataParallel gathers one loss per replica; reduce to a
        # scalar so backward() and item() work. A no-op for a single loss value.
        loss = outputs.loss.mean()

        loss.backward()  # back-propagation
        optimizer.step()  # parameter update

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"Loss": batch_loss, "Avg Loss": total_loss / num_batches})

    # Report the mean loss over the epoch rather than the last batch's loss.
    epoch_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {epoch_loss}")

  self.pid = os.fork()
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, .

KeyboardInterrupt: 

I stopped the training process manually.


In [None]:
# Read the sentence to translate from standard input.
sentence = input()

# Use the underlying network whether or not `model` is wrapped in DataParallel.
translator = getattr(model, "module", model)
# Generation must run in eval mode; otherwise dropout left on by training
# perturbs the output.
translator.eval()

# Tokenize and place the ids on the same device as the model's weights.
model_device = next(translator.parameters()).device
input_ids = tokenizer.encode(sentence, return_tensors='pt').to(model_device)

# Generate the translation without tracking gradients.
with torch.no_grad():
    output = translator.generate(input_ids)

# Decode the first (only) generated sequence back to text.
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print it in console
print(output_text)

Hello, world!
Привет, мир!


# **Evaluating the Model**

In [None]:
# Evaluating the model on the test set
model.eval()  # disable dropout for evaluation

# Send batches to wherever the model's weights live, so the cell works whether or
# not the model was wrapped / moved by the training cell.
eval_device = next(model.parameters()).device

total_loss = 0.0
num_batches = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(eval_device)
        attention_mask = batch["attention_mask"].to(eval_device)
        labels = batch["labels"].to(eval_device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # A DataParallel model on several GPUs returns one loss per replica;
        # reduce to a scalar before calling item().
        loss = outputs.loss.mean()
        total_loss += loss.item()
        num_batches += 1

# Mean of the per-batch losses; guarded so an empty loader does not divide by zero.
average_loss = total_loss / max(num_batches, 1)
print(f"Test Loss: {average_loss}")

  self.pid = os.fork()
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, .

Test Loss: 0.33141735443472864


# **Saving the Trained Model**

Deprecated. Don't run this cell.


In [None]:
import torch

# The ONNX exporter does not support DataParallel-wrapped models, so export the
# wrapped module itself (or `model` directly when it is not wrapped).
export_model = getattr(model, "module", model)
export_model.eval()  # export in inference mode
export_device = next(export_model.parameters()).device

# Step 1: tokenize an example sentence
text = "Example for tokenizer."
tokens = tokenizer.tokenize(text)
# Step 2: Encoding
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# Step 3: Adding attention mask (every position is a real token)
attention_mask = [1] * len(input_ids)

# Step 4: Converting to tensors with a batch dimension of 1
input_ids = torch.tensor([input_ids], dtype=torch.long)
attention_mask = torch.tensor([attention_mask], dtype=torch.long)

# Step 5: Moving the inputs to the device that holds the model's weights
input_ids = input_ids.to(export_device)
attention_mask = attention_mask.to(export_device)

# Example decoder token identifiers, created on the same device as the other
# inputs. (Decoder input embeddings could be supplied instead of ids, but they
# are not used by this export.)
decoder_input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long, device=export_device)

# Sanity-check forward pass with the example inputs; no gradients are needed.
with torch.no_grad():
    outputs = export_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
    )

# Save in ONNX Format
torch.onnx.export(
    model=export_model,
    args=(input_ids, attention_mask, decoder_input_ids),
    f="model.onnx",  # Path to ONNX-model
    input_names=["input_ids", "attention_mask", "decoder_input_ids"],  # Names of inputs
    output_names=["output"],  # Names of outputs
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},  # Dynamic axes
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "decoder_input_ids": {0: "batch_size", 1: "decoder_length"}
    },
    opset_version=12  # ONNX opset version
)

  if causal_mask.shape[1] < attention_mask.shape[1]:


In [None]:
# Save the underlying model (unwrapping DataParallel when present, so the cell
# also works on a model that was never wrapped) and the tokenizer to local
# directories.
getattr(model, "module", model).save_pretrained("translation_model")
tokenizer.save_pretrained("translation_tokenizer")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


('translation_tokenizer/tokenizer_config.json',
 'translation_tokenizer/special_tokens_map.json',
 'translation_tokenizer/vocab.json',
 'translation_tokenizer/source.spm',
 'translation_tokenizer/target.spm',
 'translation_tokenizer/added_tokens.json')

In [None]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Local directory to copy. Despite the variable name this is a directory, not an
# ONNX file — presumably the one written by the save_pretrained cell; verify.
onnx_model_path = '/content/translation_model'

# Destination directory on Google Drive
destination_path = '/content/drive/MyDrive/Text2Text2Epoch/translation_model'

# Copy the directory tree to Drive. dirs_exist_ok=True lets the cell be re-run
# after an earlier copy instead of failing because the destination exists.
shutil.copytree(onnx_model_path, destination_path, dirs_exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Text2Text2Epoch/translation_model'

In [None]:

# Local directory to copy. Despite the variable name this is the tokenizer
# directory, not an ONNX model.
onnx_model_path = '/content/translation_tokenizer'

# Destination directory on Google Drive
destination_path = '/content/drive/MyDrive/Text2Text2Epoch/translation_tokenizer'

# Copy the directory tree to Drive. dirs_exist_ok=True lets the cell be re-run
# after an earlier copy instead of failing because the destination exists.
shutil.copytree(onnx_model_path, destination_path, dirs_exist_ok=True)

'/content/drive/MyDrive/Text2Text2Epoch/translation_tokenizer'

# Summary: I have trained a MarianMT model for seq2seq translation