# **Training mBART-50 model for Russian to Chukchi translation**

In [None]:
!pip install -U datasets==2.14.4



In [None]:
import torch

# Report whether a CUDA device is visible. The device name is only queried
# when one exists, because torch.cuda.get_device_name(0) raises on a
# CPU-only runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))

True
Tesla T4


In [None]:
import os
import shutil

import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    MBartForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
)

In [None]:
from transformers import set_seed

# Fix the global random seed so that repeated runs are comparable.
set_seed(seed=42)

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and dataset paths used
# later in this notebook live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Checkpoint directories: one on the mounted Google Drive, one on local disk.
DRIVE_PATH = "/content/drive/MyDrive/mbart_rus_ckt_checkpoints"
LOCAL_PATH = "/content/mbart_rus_ckt_checkpoints"

# Create both directories up front; existing ones are left as they are.
for checkpoint_dir in (DRIVE_PATH, LOCAL_PATH):
    os.makedirs(checkpoint_dir, exist_ok=True)

## **Preparing Data**

In [None]:
# Read the parallel corpus; the CSV uses ';' as its field separator.
data_path = '/content/ckt-ru_filtered.csv'
df = pd.read_csv(data_path, sep=';')

In [None]:
# Splitting data into training and validation sets (last 1% of rows, in file
# order, for validation).
val_size = int(len(df) * 0.01)
# Slice at an explicit position: with val_size == 0 the negative-index form
# df[:-0] / df[-0:] would yield an empty training set and a validation set
# containing every row.
split_at = len(df) - val_size
train_df = df.iloc[:split_at]
val_df = df.iloc[split_at:]

In [None]:
# Wrapping source/target sentence pairs in fixed instruction-style prompts
def create_prompts(data, start_idx, end_idx):
    """Build prompt-formatted source and target texts for a slice of rows.

    Rows ``start_idx:end_idx`` (positional, end-exclusive) of ``data`` are
    read. Each row's "ru" value is embedded in the Russian instruction prompt
    and its "ckt" value in the answer prompt.

    Returns:
        A new DataFrame with the columns "rus_text" and "ckt_text".
    """
    rus_prompts = []
    ckt_prompts = []
    # One pass over the slice fills both prompt lists in row order.
    for _, pair in data.iloc[start_idx:end_idx].iterrows():
        rus_prompts.append(f'Переведи это предложение с русского на чукотский: "{pair["ru"]}"')
        ckt_prompts.append(f'Вот перевод на чукотский: "{pair["ckt"]}"')

    prompts = pd.DataFrame()
    prompts['rus_text'] = rus_prompts
    prompts['ckt_text'] = ckt_prompts
    return prompts

In [None]:
# Creating the training prompts chunk by chunk: five roughly equal chunks, plus
# a remainder chunk when the row count is not divisible by five. Every chunk
# goes through the same create_prompts call.
train_parts = []
# Clamp to 1: with fewer than five rows the integer division gives 0 and
# range() rejects a zero step.
chunk_size = max(1, len(train_df) // 5)
for i in range(0, len(train_df), chunk_size):
    part = create_prompts(train_df, i, i + chunk_size)
    train_parts.append(part)

In [None]:
# Stack the chunks into one frame. ignore_index=True rebuilds a clean 0..n-1
# index; otherwise each chunk's own index values repeat, and
# Dataset.from_pandas would carry that non-range index into the dataset as an
# extra "__index_level_0__" column.
train_data = pd.concat(train_parts, ignore_index=True)
train_dataset = Dataset.from_pandas(train_data)

In [None]:
# Build the validation prompts from every held-out row and wrap them in a Dataset.
val_prompts = create_prompts(val_df, 0, val_df.shape[0])
val_dataset = Dataset.from_pandas(val_prompts)

In [None]:
# Load the pretrained mBART-50 tokenizer and model; the model goes to the GPU
# when one is available, otherwise it stays on the CPU.
# NOTE(review): the checkpoint-resume cell that follows assigns `model` again
# in both of its branches, so this initial load looks redundant; confirm
# before removing it.
model_ckpt = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = MBartForConditionalGeneration.from_pretrained(model_ckpt)
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(target_device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
# Check if there's a checkpoint to resume training from
last_checkpoint = None
if os.path.isdir(DRIVE_PATH):
    # Rank checkpoints by their numeric step. A plain string sort would put
    # "checkpoint-9500" after "checkpoint-10000" and pick the wrong one.
    checkpoint_steps = {}
    for entry in os.listdir(DRIVE_PATH):
        prefix, _, step = entry.rpartition("-")
        is_checkpoint_dir = (
            prefix == "checkpoint"
            and step.isdecimal()
            and os.path.isdir(os.path.join(DRIVE_PATH, entry))
        )
        if is_checkpoint_dir:
            checkpoint_steps[int(step)] = entry
    if checkpoint_steps:
        last_checkpoint = os.path.join(DRIVE_PATH, checkpoint_steps[max(checkpoint_steps)])
        print(f"Found checkpoint to resume from: {last_checkpoint}")

# Start from the discovered checkpoint when there is one, otherwise from the
# base model.
model_source = last_checkpoint if last_checkpoint else model_ckpt
model = MBartForConditionalGeneration.from_pretrained(model_source).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Function for converting text to tokens
def convert_examples_to_features(example_batch):
    """Tokenize a batch of prompt pairs into model inputs and labels.

    Args:
        example_batch: Mapping with the keys "rus_text" (source prompts) and
            "ckt_text" (target prompts).

    Returns:
        Dict with "input_ids" and "attention_mask" taken from the source
        prompts and "labels" holding the token ids of the target prompts.

    Sequences are truncated to 1024 tokens but deliberately left unpadded.
    Padding to max_length here would write pad-token ids into "labels",
    where the loss scores them like real tokens, and would make every example
    1024 tokens long. Padding is left to the batch collator instead.
    """
    input_encodings = tokenizer(
        example_batch["rus_text"],
        max_length=1024,
        truncation=True,
    )
    target_encodings = tokenizer(
        example_batch["ckt_text"],
        max_length=1024,
        truncation=True,
    )
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

In [None]:
# Tokenize both splits in batches and drop the raw text columns afterwards.
text_columns = ("rus_text", "ckt_text")
train_dataset_tf = train_dataset.map(
    convert_examples_to_features,
    batched=True,
    remove_columns=list(text_columns),
)
val_dataset_tf = val_dataset.map(
    convert_examples_to_features,
    batched=True,
    remove_columns=list(text_columns),
)

Map:   0%|          | 0/66586 [00:00<?, ? examples/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

## **Saving dataset to Google Drive**

In [None]:
from datasets import load_from_disk

# Google Drive locations for the tokenized (mapped) datasets
TRAIN_DATASET_PATH = "/content/drive/MyDrive/mbart_datasets/train_dataset_tf"
VAL_DATASET_PATH = "/content/drive/MyDrive/mbart_datasets/val_dataset_tf"

# Write each tokenized split to its Drive location, training split first.
for mapped_dataset, target_path in (
    (train_dataset_tf, TRAIN_DATASET_PATH),
    (val_dataset_tf, VAL_DATASET_PATH),
):
    mapped_dataset.save_to_disk(target_path)

Saving the dataset (0/2 shards):   0%|          | 0/66586 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/672 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk

# Reload the tokenized splits from Google Drive (presumably so a fresh session
# can skip the mapping step; the paths are spelled out so this cell does not
# depend on the saving cell).
train_dataset_tf, val_dataset_tf = (
    load_from_disk(dataset_path)
    for dataset_path in (
        "/content/drive/MyDrive/mbart_datasets/train_dataset_tf",
        "/content/drive/MyDrive/mbart_datasets/val_dataset_tf",
    )
)

  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model;
# it is handed to the Seq2SeqTrainer as its data_collator further down.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## **Model Training**

In [None]:
class DriveCheckpointCallback(TrainerCallback):
    """Mirror each saved checkpoint to Google Drive and prune older copies.

    Args:
        max_to_keep: How many of the highest-step checkpoints to keep in
            DRIVE_PATH. A value of 0 or less disables pruning.
    """

    def __init__(self, max_to_keep=2):
        self.max_to_keep = max_to_keep

    def on_save(self, args, state, control, **kwargs):
        """Copy the checkpoint of the current step to Drive, then prune old ones."""
        if not state.is_world_process_zero:
            return

        checkpoint_name = f"checkpoint-{state.global_step}"
        checkpoint_path = os.path.join(LOCAL_PATH, checkpoint_name)
        drive_checkpoint_path = os.path.join(DRIVE_PATH, checkpoint_name)

        # Copy in-process rather than through a shell command: paths need no
        # quoting and a failed copy is noticed instead of being reported as
        # a success.
        try:
            shutil.copytree(checkpoint_path, drive_checkpoint_path, dirs_exist_ok=True)
        except OSError as err:
            # Keep the existing Drive checkpoints untouched when the new copy
            # did not complete.
            print(f"\nFailed to copy checkpoint to Google Drive: {err}")
            return
        print(f"\nCheckpoint saved to Google Drive: {drive_checkpoint_path}")

        # Removing old checkpoints if exceeding limit. Only directories named
        # "checkpoint-<number>" are considered, ordered by numeric step.
        numbered = []
        for entry in os.listdir(DRIVE_PATH):
            prefix, _, step = entry.rpartition("-")
            if (
                prefix == "checkpoint"
                and step.isdecimal()
                and os.path.isdir(os.path.join(DRIVE_PATH, entry))
            ):
                numbered.append((int(step), entry))
        numbered.sort()

        excess = len(numbered) - self.max_to_keep
        if self.max_to_keep > 0 and excess > 0:
            for _, ckpt in numbered[:excess]:
                path_to_remove = os.path.join(DRIVE_PATH, ckpt)
                shutil.rmtree(path_to_remove)
                print(f"Removed old checkpoint: {path_to_remove}")

In [None]:
# Setting training parameters
training_args = Seq2SeqTrainingArguments(
    # Output location and run length
    output_dir=LOCAL_PATH,  # Saving checkpoints locally
    num_train_epochs=1,
    # Batch sizes
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Optimisation
    optim="adafactor",
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Evaluation and checkpointing share the same 500-step cadence
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    save_total_limit=3,  # Maximum 3 local checkpoints
    logging_steps=100,
    # Best-model selection: lowest validation loss wins
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Reporting / publishing
    report_to="none",
    push_to_hub=False,
    # NOTE(review): per the transformers docs this field is not acted on by
    # Trainer itself; resuming needs the path passed to trainer.train(). Confirm.
    resume_from_checkpoint=last_checkpoint or None,
)

In [None]:
# Creating trainer; the Drive callback mirrors every saved checkpoint and keeps
# the two most recent copies.
drive_sync_callback = DriveCheckpointCallback(max_to_keep=2)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tf,
    eval_dataset=val_dataset_tf,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    callbacks=[drive_sync_callback],
)

  trainer = Seq2SeqTrainer(


In [None]:
# Pass the discovered checkpoint so that optimizer, scheduler and step counter
# are restored along with the weights; with None this is an ordinary fresh run.
trainer.train(resume_from_checkpoint=last_checkpoint)

Step,Training Loss,Validation Loss
500,0.0647,0.055141
1000,0.06,0.051106
1500,0.0773,0.047944
2000,0.0481,0.045748
2500,0.0543,0.044041
3000,0.0554,0.0437
3500,0.046,0.04306
4000,0.0514,0.042001
4500,0.0557,0.041205
5000,0.0475,0.039979





Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-500

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-1000

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-1500
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-500

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-2000
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-1000

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-2500
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-1500

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-3000
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-2000

Чекпоинт сохранен в Google Drive: /content/dr

Step,Training Loss,Validation Loss
500,0.0647,0.055141
1000,0.06,0.051106
1500,0.0773,0.047944
2000,0.0481,0.045748
2500,0.0543,0.044041
3000,0.0554,0.0437
3500,0.046,0.04306
4000,0.0514,0.042001
4500,0.0557,0.041205
5000,0.0475,0.039979



Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-16000
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-15000

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-16500
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-15500

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-17000
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-16000

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-17500
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-16500

Чекпоинт сохранен в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-18000
Удален старый чекпоинт: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/checkpoint-17000

Чекпоинт сохранен в Google Drive: /content/d

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=66586, training_loss=0.047764518145445294, metrics={'train_runtime': 73609.3625, 'train_samples_per_second': 0.905, 'train_steps_per_second': 0.905, 'total_flos': 1.443005027277865e+17, 'train_loss': 0.047764518145445294, 'epoch': 1.0})

In [None]:
# Store the trained model and the tokenizer side by side in one Drive directory.
final_model_path = os.path.join(DRIVE_PATH, "final_model")
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(final_model_path)
print(f"Final model saved to Google Drive: {final_model_path}")

Финальная модель сохранена в Google Drive: /content/drive/MyDrive/mbart_rus_ckt_checkpoints/final_model


## **Model Testing**

In [None]:
import torch

# Translate one sample sentence as a quick smoke test of the trained model.
test_prompt = 'Переведи это предложение с русского на чукотский: "Привет, как дела?"'
encoded_prompt = tokenizer(test_prompt, return_tensors="pt")
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = encoded_prompt.input_ids.to(inference_device)
outputs = model.generate(inputs, max_length=100)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Результат перевода:", translation)

Результат перевода: Вот перевод на чукотский: "эйвэ, микыри?"


## **Uploading model to Hugging Face**

In [None]:
from huggingface_hub import notebook_login, HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from huggingface_hub import login

# Interactive authentication; no token is stored in the notebook.
login()

# Loading trained model (if needed)
# model = AutoModelForSeq2SeqLM.from_pretrained(final_model_path)
# tokenizer = AutoTokenizer.from_pretrained(final_model_path)

# Setting repository information
repo_name = "mbart50-rus-ckt"
organization = "HSE-Chukchi-NLP"
# The namespace is part of the repo id; the separate `organization` keyword of
# push_to_hub is deprecated in transformers.
repo_id = f"{organization}/{repo_name}"

# Creating repository and uploading model
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HSE-Chukchi-NLP/mbart50-rus-ckt/commit/937f2272291417aa9528b221073fec26d3189914', commit_message='Upload tokenizer', commit_description='', oid='937f2272291417aa9528b221073fec26d3189914', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HSE-Chukchi-NLP/mbart50-rus-ckt', endpoint='https://huggingface.co', repo_type='model', repo_id='HSE-Chukchi-NLP/mbart50-rus-ckt'), pr_revision=None, pr_num=None)