# **Training an mBART-50 model for Chukchi-to-Russian translation**

In [None]:
!pip install -U datasets==2.14.4

Collecting datasets==2.14.4
  Downloading datasets-2.14.4-py3-none-any.whl.metadata (19 kB)
Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.6.0
    Uninstalling datasets-3.6.0:
      Successfully uninstalled datasets-3.6.0
Successfully installed datasets-2.14.4


In [None]:
import torch

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBartForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os

## **Preparing Data**

In [None]:
# Read the parallel corpus from disk; fields in the file are separated by ';'
data_path = 'ckt-ru_filtered.csv'
df = pd.read_csv(data_path, sep=';')

In [None]:
# Hold out the last 1% of rows for validation (rows are taken in file order, no shuffling).
# The split uses an explicit boundary instead of negative slicing: for a corpus with
# fewer than 100 rows val_size is 0, and `df[:-0]` would give an EMPTY training set
# while `df[-0:]` would put every row into validation.
val_size = int(len(df) * 0.01)
split_at = len(df) - val_size
train_df = df.iloc[:split_at]
val_df = df.iloc[split_at:]

In [None]:
# Wrapping sentence pairs in the fixed instruction / answer templates
def create_prompts(data, start_idx, end_idx):
    """Build prompt and answer strings for rows ``start_idx:end_idx`` of ``data``.

    Args:
        data: DataFrame with a ``ckt`` column (sentence to translate) and a
            ``ru`` column (its Russian translation).
        start_idx: Position of the first row to use (inclusive).
        end_idx: Position one past the last row to use; positions beyond the
            end of ``data`` are clipped by ``iloc`` slicing.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and two string columns.
        Beware that the column names do not describe their contents:
        ``rus_text`` holds the model INPUT (Russian instruction wrapping the
        ``ckt`` sentence) and ``ckt_text`` holds the TARGET (the ``ru``
        sentence wrapped in the answer template).
    """
    data_part = data.iloc[start_idx:end_idx]

    # Iterate over the two columns directly instead of running iterrows() twice:
    # iterrows() materialises a Series per row and is far slower on large frames.
    return pd.DataFrame({
        'rus_text': [
            f'Переведи это предложение с чукотского на русский: "{ckt}"'
            for ckt in data_part["ckt"]
        ],
        'ckt_text': [
            f'Вот перевод на русский: "{ru}"'
            for ru in data_part["ru"]
        ],
    })

In [None]:
# Building the training prompts in (roughly) five consecutive chunks.
# NOTE: create_prompts applies the same template to every chunk, so the chunking
# does not by itself produce different prompt variations.
# max(1, ...) keeps the step non-zero: with fewer than 5 training rows
# `len(train_df)//5` is 0 and range() would raise "arg 3 must not be zero".
chunk_size = max(1, len(train_df) // 5)
train_parts = [
    create_prompts(train_df, start, start + chunk_size)
    for start in range(0, len(train_df), chunk_size)
]

In [None]:
# Merge the chunks into one frame. Every chunk carries its own 0..k-1 index, so a
# plain concat would repeat index values and Dataset.from_pandas would then store
# them as an extra `__index_level_0__` column. ignore_index=True rebuilds a clean
# 0..n-1 index and preserve_index=False makes sure no index column is emitted.
train_data = pd.concat(train_parts, ignore_index=True)
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)

In [None]:
# The validation prompts come from the whole held-out frame in a single call
val_prompts = create_prompts(val_df, start_idx=0, end_idx=len(val_df))
val_dataset = Dataset.from_pandas(val_prompts)

In [None]:
# Fetch the pretrained checkpoint and its tokenizer, then move the model
# to the GPU when CUDA is available (it stays on the CPU otherwise)
model_ckpt = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = MBartForConditionalGeneration.from_pretrained(model_ckpt)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
# Function for converting text to tokens
def convert_examples_to_features(example_batch, max_length=1024):
    """Tokenize a batch of prompt/answer strings for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with ``"rus_text"`` (model inputs) and
            ``"ckt_text"`` (training targets); with ``batched=True`` each value
            is a list of strings.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; every
        entry is a list with one UNPADDED token-id list per example.

    Padding is deliberately not applied here. Padding every example to
    ``max_length`` makes each sequence 1024 tokens long regardless of content,
    and padding the targets with the pad token id means the loss is also
    computed on padding positions. Batches are instead padded by the data
    collator (DataCollatorForSeq2Seq pads labels with -100, which the loss
    ignores), so this function must be used together with such a collator.
    """
    input_encodings = tokenizer(
        example_batch["rus_text"],
        max_length=max_length,
        truncation=True
    )

    target_encodings = tokenizer(
        example_batch["ckt_text"],
        max_length=max_length,
        truncation=True
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }

In [None]:
# Converting data: tokenize both splits in batches.
# All original columns are dropped via `column_names` rather than a hard-coded pair,
# so any extra column carried by the dataset (for example an index column added by
# Dataset.from_pandas) does not leak into the tokenized output.
train_dataset_tf = train_dataset.map(
    convert_examples_to_features,
    batched=True,
    remove_columns=train_dataset.column_names
)
val_dataset_tf = val_dataset.map(
    convert_examples_to_features,
    batched=True,
    remove_columns=val_dataset.column_names
)

Map:   0%|          | 0/66586 [00:00<?, ? examples/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

In [None]:
# Batch collator handed to the trainer; built from the tokenizer and the model
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## **Model Training**

In [None]:
# Setting training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir='mbart_ckt_rus',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy='steps',
    save_strategy='steps',
    eval_steps=500,
    # Stated explicitly: load_best_model_at_end needs checkpoints that line up
    # with the evaluation steps.
    save_steps=500,
    # Log at the evaluation cadence so every row of the progress table carries a
    # fresh training loss (1000 produced "No log" / repeated values).
    logging_steps=500,
    weight_decay=0.01,
    push_to_hub=False,
    # Mixed precision only when CUDA is present: the model-loading cell falls back
    # to the CPU, where an unconditional fp16=True is rejected.
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    optim="adafactor",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

In [None]:
# Creating trainer
# NOTE(review): `tokenizer=` is deprecated on the Trainer classes in recent
# transformers releases in favour of `processing_class=`; this assumes the installed
# version already accepts `processing_class` — confirm against the pinned environment.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=train_dataset_tf,
    eval_dataset=val_dataset_tf
)

  trainer = Seq2SeqTrainer(


In [None]:
# Launch fine-tuning; evaluation, logging and checkpointing follow training_args
trainer.train()

Step,Training Loss,Validation Loss
500,No log,0.03937
1000,0.735400,0.036687
1500,0.735400,0.034321
2000,0.044000,0.032933
2500,0.044000,0.032218
3000,0.039500,0.031851
3500,0.039500,0.030652
4000,0.037700,0.030698
4500,0.037700,0.0307
5000,0.038000,0.030217


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=66586, training_loss=0.03981141058969622, metrics={'train_runtime': 67356.9562, 'train_samples_per_second': 0.989, 'train_steps_per_second': 0.989, 'total_flos': 1.443005027277865e+17, 'train_loss': 0.03981141058969622, 'epoch': 1.0})

In [None]:
# Write the model held by the trainer to its final output directory
model_save_path = "mbart_ckt_rus_final"
trainer.save_model(output_dir=model_save_path)

## **Model Testing**

In [None]:
# Sanity check: translate one sentence using the same prompt template as in training
test_prompt = 'Переведи это предложение с чукотского на русский: "Аройыръыка қынур арака."'

# generate() does not switch the module mode itself; put the model in eval mode so
# dropout is disabled and the output does not vary between runs.
model.eval()

# Keep input_ids and attention_mask together and place them on whatever device the
# model actually lives on, instead of guessing from CUDA availability.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Вот перевод на русский: "Без всякой еды."


## **Uploading model to Hugging Face**

In [None]:
from huggingface_hub import notebook_login, HfApi, login
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Interactive authentication against the Hugging Face Hub
login()

# Loading trained model (if needed)
model_path = "mbart_ckt_rus_final"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Setting repository information
repo_name = "mbart50-ckt-rus"
organization = "HSE-Chukchi-NLP"
# The `organization=` keyword of push_to_hub is deprecated; the namespace belongs
# in the repository id itself ("<organization>/<name>").
repo_id = f"{organization}/{repo_name}"

# Creating repository and uploading model
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/HSE-Chukchi-NLP/mbart50-ckt-rus/commit/682a0018d1f4e8d6cfc74c06aa4513c3bbfde95f', commit_message='Upload tokenizer', commit_description='', oid='682a0018d1f4e8d6cfc74c06aa4513c3bbfde95f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HSE-Chukchi-NLP/mbart50-ckt-rus', endpoint='https://huggingface.co', repo_type='model', repo_id='HSE-Chukchi-NLP/mbart50-ckt-rus'), pr_revision=None, pr_num=None)