In [None]:
%%capture
!pip3 install torch>=1.3.0
!pip3 install subword-nmt &> log

!wget https://www.dropbox.com/s/qak4r5dtrktx5cu/data.txt?dl=0 -O data.txt # hotel data
!wget https://www.dropbox.com/s/wdc6jeaaprw0rey/ru-en-names-augment.txt?dl=0 -O ru-en-names-augment.txt # ru-en names augmented data
!wget https://www.dropbox.com/s/g1zqqhfb0jlw42h/vocab.py?dl=0 -O vocab.py
!wget https://www.dropbox.com/s/zu3vesqi107pph0/utils.py?dl=0 -O utils.py

In [None]:
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# BPE Tokenizing

In [None]:
# Tokenizing & applying BPE rules

tokenizer = WordPunctTokenizer()
def tokenize(x):
    return ' '.join(tokenizer.tokenize(x.lower()))

# split and tokenize the data
with open('train.en', 'w') as f_src,  open('train.ru', 'w') as f_dst:
    for line in open('data.txt', 'r'):
        src_line, dst_line = line.strip().split('\t')
        f_src.write(tokenize(src_line) + '\n')
        f_dst.write(tokenize(dst_line) + '\n')


# build and apply bpe vocs
bpe = {}
for lang in ['en', 'ru']:
    learn_bpe(open('./train.' + lang), open('bpe_rules.' + lang, 'w'), num_symbols=8000)  # 1. learn_bpe rules
    bpe[lang] = BPE(open('./bpe_rules.' + lang))  # 2. create instance of BPE class
    
    with open('train.bpe.' + lang, 'w') as f_out:
        for line in open('train.' + lang):
            f_out.write(bpe[lang].process_line(line.strip()) + '\n')  # 3. apply BPE tokenization to our data        


# Building vocabularies

data_inp = np.array(open('./train.bpe.ru').read().split('\n'))
data_out = np.array(open('./train.bpe.en').read().split('\n'))

train_inp, dev_inp, train_out, dev_out = train_test_split(data_inp, data_out, test_size=3000,
                                                          random_state=42)
for i in range(3):
    print('inp:', train_inp[i])
    print('out:', train_out[i], end='\n\n')            

from vocab import Vocab

inp_voc = Vocab.from_lines(train_inp) # creates an instance of Vocab class from input lines (ru (input) vocab here)
out_voc = Vocab.from_lines(train_out) # en (output) vocab

print(f'Length of input (Russian) BPE vocabulary = {len(inp_voc)}')
print(f'Length of output (English) BPE vocabulary = {len(out_voc)}')

100%|██████████| 8000/8000 [00:09<00:00, 827.81it/s]
100%|██████████| 8000/8000 [00:09<00:00, 818.20it/s] 


inp: на территории обустроена бесплатная частная парковка .
out: free private parking is available on site .

inp: кроме того , в 5 минутах ходьбы работают многочисленные бары и рестораны .
out: guests can find many bars and restaurants within a 5 - minute walk .

inp: отель san mi@@ gu@@ el расположен в центре мор@@ ели@@ и , в 750 метрах от главной площади города и кафедрального собора .
out: hotel san miguel is located in central more@@ lia , 750 metres from the city ’ s main square and cathedral .

Length of input (Russian) BPE vocabulary = 8048
Length of output (English) BPE vocabulary = 7801


In [None]:
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] wandb

In [None]:
import transformers

from transformers import AutoTokenizer
    
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

# Hotels data prep

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to('cuda') # 'cuda' 

In [None]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset, load_metric

def parse_ruencorp(train_inp=train_inp, dev_inp=dev_inp, train_out=train_out, dev_out=dev_out, mode='train'):
    if mode == 'train':
        for src_line, dst_line in zip(train_inp, train_out):
            yield {"translation": {"ru": src_line, "en": dst_line}}
    elif mode == 'test':
        for src_line, dst_line in zip(dev_inp, dev_out):
            yield {"translation": {"ru": src_line, "en": dst_line}}

In [None]:
train_df = pd.DataFrame(parse_ruencorp(mode='train'))
test_df = pd.DataFrame(parse_ruencorp(mode='test'))

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

raw_datasets = DatasetDict({'train': train_dataset, 
                            'test': test_dataset})

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "ru"
target_lang = "en"
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/47001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    logging_strategy="steps",
    logging_steps=150,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    num_train_epochs=5,
    predict_with_generate=True,
    report_to="wandb",
)
    

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric
from torchmetrics.text.rouge import ROUGEScore
from nltk.translate.meteor_score import single_meteor_score
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # Calculate BLEU score
    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}
    # Calculate METEOR
    meteor = np.mean([single_meteor_score(tokenizer.tokenize(ref[0]), tokenizer.tokenize(pred)) for ref, pred in zip(decoded_labels, decoded_preds)])
    result['meteor'] = meteor

    # Calculate ROUGE score
    rouge_score = ROUGEScore()
    rouge = rouge_score(decoded_preds, decoded_labels)
    # result["rouge"] = rouge
    for k,v in rouge.items():
      result[k] = v.item()

    # Calculate predicted lengths
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Inference & BLEU before fine-tuning

## Translations & BLEU before fine-tuning

In [None]:
def translate(src_text):
    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True).to('cuda'))
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated] 

In [None]:
translate('Водохранилище Селиховское') # Плохо справляется с транслитерацией: Правильный ответ - Vodokhranilishche Selikhovskoye

['Selichovsky Water Reservoir']

In [None]:
translate('поездка до аэропорта домодедово займет 50 минут .')

["It's gonna take 50 minutes to get to the airport."]

In [None]:
translate('по утрам гостям сервируют сладкий завтрак « шведский стол »')

['In the morning, the guests are served with a sweet breakfast of the Swedish table.']

In [None]:
translate('трехзвездочный отель idea piacenza расположен всего в 200 метрах от съезда')

['The three-star idea piacenza hotel is only 200 metres from the exit.']

In [None]:
translate('до красивого средневекового города ротенбург - об - дер - таубер - всего 7 км .')

['To the beautiful medieval city of Rothenburg - o-der-tauber - only 7 km.']

In [None]:
translate('расстояние до бангкока составляет 180 км .')

['The distance to the bancock is 180 km.']

In [None]:
translate('на территории также имеется бассейн с большой солнечной террасой и принадлежности для барбекю .')

['The Territory also has a large solar terrace basin and barbecue supplies.']

In [None]:
from tqdm import tqdm

references = []
predictions = []

with open('no-finetune-all_3k_eval_translations.txt','w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp, dev_out)):
        translated_inp = translate(inp_line)
        inp = inp_line.replace('@@ ', '')
        trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
        predictions.append(trans)

        out = out_line.replace('@@ ', '')
        references.append(out)
        
        f_trans.write(f'input line: {inp}' + '\n')
        f_trans.write(f'translated line: {trans}' + '\n')
        f_trans.write(f'target line: {out}' + '\n')
        f_trans.write('\n')

3000it [22:07,  2.26it/s]


In [None]:
!head -n 100 no-finetune-all_3k_eval_translations.txt

input line: в распоряжении гостей общая кухня и общая гостиная .
translated line: a common kitchen and a common living room are available to guests.
target line: a shared equipped kitchen and a common living room are provided to guests .

input line: на территории виллы shengsi huajing находится сад и терраса .
translated line: in villa shengsi huajing is the garden and terrace.
target line: at shengsi huajing villa you will find a garden and a terrace .

input line: расстояние от отеля libuše до ближайшей станции метро kobylisy ( линия с ), от которой можно добраться до центрального железнодорожного вокзала праги и центра города , составляет 500 метров .
translated line: the distance from the hotel libuše to the nearest subway station kobylisy (line c ) from which it is possible to reach the main railway station of praga and the city centre is 500 metres.
target line: the nearest metro station at kobylisy , on line c , is set 500 metres from hotel libuše , and it offers connections to

In [None]:
!du -h no-finetune-all_3k_eval_translations.txt

1.2M	no-finetune-all_3k_eval_translations.txt


In [None]:
pip install torchmetrics

In [None]:
import sacrebleu

bleu_score = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU score: ", bleu_score.score)

BLEU score:  11.613923846171362


In [None]:
from nltk.translate.meteor_score import single_meteor_score
meteor_score = np.mean([single_meteor_score(tokenizer.tokenize(ref), tokenizer.tokenize(pred)) for ref, pred in zip(references, predictions)])
print("METEOR score: ", meteor_score)

0.6563934647583041

In [None]:
rouge = rouge_score(predictions, references)

{'rouge1_fmeasure': tensor(0.4570),
 'rouge1_precision': tensor(0.4454),
 'rouge1_recall': tensor(0.4874),
 'rouge2_fmeasure': tensor(0.2017),
 'rouge2_precision': tensor(0.1969),
 'rouge2_recall': tensor(0.2155),
 'rougeL_fmeasure': tensor(0.3859),
 'rougeL_precision': tensor(0.3764),
 'rougeL_recall': tensor(0.4113),
 'rougeLsum_fmeasure': tensor(0.3900),
 'rougeLsum_precision': tensor(0.3804),
 'rougeLsum_recall': tensor(0.4156)}

In [None]:
meteor_score([tokenizer.tokenize(references[0])], tokenizer.tokenize(predictions[0]))

0.6563934647583041

# Fine-tuning on ``hotel data``

In [None]:
pip install wandb --upgrade

In [None]:
import wandb

In [None]:
wandb.init()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Meteor,Rouge1 Fmeasure,Rouge1 Precision,Rouge1 Recall,Rouge2 Fmeasure,Rouge2 Precision,Rouge2 Recall,Rougel Fmeasure,Rougel Precision,Rougel Recall,Rougelsum Fmeasure,Rougelsum Precision,Rougelsum Recall,Gen Len
1000,1.4406,1.272107,32.5697,0.6621,0.6113,0.6373,0.6006,0.3863,0.404,0.3792,0.5406,0.5635,0.531,0.545,0.5682,0.5353,26.73
2000,1.2624,1.126208,34.9626,0.676,0.628,0.653,0.6179,0.4094,0.4263,0.4027,0.5594,0.5812,0.5505,0.5639,0.586,0.5548,26.6533
3000,1.1174,1.046594,37.3224,0.6918,0.6365,0.6522,0.6344,0.4237,0.4349,0.4224,0.5688,0.5824,0.5671,0.5739,0.5877,0.572,27.277
4000,1.0605,1.007228,38.1726,0.6975,0.6481,0.6661,0.6433,0.4355,0.4483,0.4322,0.5821,0.5979,0.5779,0.5873,0.6033,0.583,27.1387
5000,1.0199,0.970884,38.6454,0.6982,0.649,0.671,0.641,0.4366,0.4518,0.4315,0.5804,0.5994,0.5736,0.5864,0.6057,0.5794,26.919
6000,0.9655,0.946799,39.2004,0.7025,0.6529,0.6717,0.6476,0.4419,0.4551,0.4385,0.5852,0.6014,0.5807,0.5903,0.6069,0.5856,27.106
7000,0.9293,0.930782,39.3803,0.7041,0.6543,0.6713,0.6502,0.4439,0.4557,0.4413,0.5857,0.6005,0.5822,0.5917,0.6069,0.588,26.9773
8000,0.9013,0.914293,40.1263,0.7079,0.661,0.6803,0.655,0.4529,0.4662,0.4493,0.594,0.6106,0.589,0.5996,0.6165,0.5943,26.952
9000,0.8765,0.902407,40.499,0.7082,0.66,0.679,0.6543,0.4522,0.4657,0.4483,0.5936,0.61,0.5888,0.5989,0.6157,0.594,27.0417
10000,0.8865,0.89297,40.4705,0.7083,0.6616,0.6819,0.6548,0.4538,0.4682,0.4493,0.5941,0.6117,0.5882,0.6,0.6179,0.5938,26.8973


TrainOutput(global_step=14690, training_loss=1.012709327941853, metrics={'train_runtime': 6961.4023, 'train_samples_per_second': 33.758, 'train_steps_per_second': 2.11, 'total_flos': 5564188318629888.0, 'train_loss': 1.012709327941853, 'epoch': 5.0})

translations after fine-tuning

In [None]:
translate('поездка до аэропорта домодедово займет 50 минут .')

['domodedovo airport is a 50 - minute drive away.']

In [None]:
translate('по утрам гостям сервируют сладкий завтрак « шведский стол »')

['a sweet breakfast buffet is served every morning.']

In [None]:
translate('трехзвездочный отель idea piacenza расположен всего в 200 метрах от съезда')

['the 3 - star idea pic@@ enza hotel is just 200 metres from the exit.']

In [None]:
translate('до красивого средневекового города ротенбург - об - дер - таубер - всего 7 км .')

['the beautiful medieval town of rot@@ en@@ burg - o@@ der tab@@ er is just 7 km away.']

In [None]:
translate('расстояние до бангкока составляет 180 км .')

['bangkok is 180 km away.']

In [None]:
translate('на территории также имеется бассейн с большой солнечной террасой и принадлежности для барбекю .')

['there is also a swimming pool with a large sun terrace and barbecue facilities.']

In [None]:
from tqdm import tqdm

references = []
predictions = []

with open('finetune-all_3k_eval_translations.txt','w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp, dev_out)):
        translated_inp = translate(inp_line)
        inp = inp_line.replace('@@ ', '')
        trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
        predictions.append(trans)

        out = out_line.replace('@@ ', '')
        references.append(out)
        
        f_trans.write(f'input line: {inp}' + '\n')
        f_trans.write(f'translated line: {trans}' + '\n')
        f_trans.write(f'target line: {out}' + '\n')
        f_trans.write('\n')

3000it [15:23,  3.25it/s]


## Saving fine-tuned model weights to drive.google

In [None]:
torch.save(model.state_dict(), 'fine-tuned-model.pt')

In [None]:
import pickle

# Get the model weights as a dictionary
weights = model.state_dict()

# Save the weights as a pickle file
with open("transformer_weights_after_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

### Augmentation JW300 En data

In [None]:
!wget https://www.dropbox.com/s/yk5s1x5wf7xxfue/JW300-en-ru-train.tsv?dl=0 -O JW300-en-ru-data-augmentation.tsv # data for augmentation

--2023-04-25 06:10:47--  https://www.dropbox.com/s/yk5s1x5wf7xxfue/JW300-en-ru-train.tsv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6025:18::a27d:4512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/yk5s1x5wf7xxfue/JW300-en-ru-train.tsv [following]
--2023-04-25 06:10:48--  https://www.dropbox.com/s/raw/yk5s1x5wf7xxfue/JW300-en-ru-train.tsv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc25aa95f3f0d8991d41bb415cb3.dl.dropboxusercontent.com/cd/0/inline/B61SS6QSx1_l2yDoVtABC9mKuFC_Ktlj-HzdxJbenwmDOxv6GioVkE6wFq1kWJECiuPrymFlCc_UsaC7R7ITx3M5p8mAM6c-gVwBmVBWfY_7M0co3O7R-DlCvn88Cua136eIEaUJT2ew1S37F56c3kjqKxudrcriH-zz_ERq6nFt3g/file# [following]
--2023-04-25 06:10:48--  https://uc25aa95f3f0d8991d41bb415cb3.dl.dropboxusercontent.com/cd/0/inline/B61SS6QSx1_l2yDoVtABC9mKuFC_Ktlj-HzdxJbenwmDOxv

In [None]:
import pandas as pd
augmentation_data = pd.read_csv('JW300-en-ru-data-augmentation.tsv', delimiter='\t', header=None, names=['en','ru'])
# pd.read_csv('ru-en-names-augment.txt', delimiter='\t', header=None, names=['ru','en'])

In [None]:
translated_augmentation_data = []

In [None]:
for sent in translated_augmentation_data.to_list():
  print(sent)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Возможно, что инфляция лишила тебя привычки позволять себе кое - что вне очереди.
Может быть, в твоем распоряжении еще никогда не было « дополнительных » денег, или потребности твоей семьи важнее твоих собственных.
Некоторые женщины хотят ограничить свою работу по специальности, чтобы посвящать больше времени особой добровольной деятельности — что уже сделали многие христианки, чтобы стать полновременными проповедницами библейской благой вести.
Как можно тогда, несмотря на ограниченные средства, быть изящно одетой и чувствовать себя довольной?
Не робей, этого можно добиться.
Сначала обратимся к шкафу для одежды
Давай сначала сделаем инвентаризацию.
Лучше всего опорожнить весь шкаф.
Рассортируй одежду на две кучи: на любимые и на нелюбимые.
Любимые — это те платья, которые ты часто одеваешь, в которых ты чувствуешь себя хорошо и которые тебе идут, а нелюбимые — это те платья, которые или устарели или больше тебе не впору.


KeyboardInterrupt: ignored

In [None]:
for sent in augmentation_data.to_list():
  print(sent)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
It was not uncommon for children to have to work off the company - store debts inherited from their father.
Note, for example, a part of an editorial, appearing in a New York newspaper in 1872: “Sometimes generation after generation works to pay back debts begun by their grandfathers.
Those who have a few coins in their pockets earn them by menial labor after working long hours in the earth. ” So it was that with no other place to go and no money to leave, the miners became slaves to the mineowners.
Since child - labor laws were not then known, mine operators took advantage of young males, sending them into the mines at a very early age to work long hours in cramped spaces where only their small bodies would fit.
Some as young as five would work topside separating coal from the slate as it moved along on conveyor belts, their fingers and hands often crushed out of shape.
Others, exhausted from 14 hours of work, fell into 

KeyboardInterrupt: ignored

In [None]:
augmentation_data = augmentation_data.dropna()

In [None]:
train_inp_augmented, dev_inp_augmented, train_out_augmented, dev_out_augmented = train_test_split(augmentation_data['ru'], 
                                                                                                  augmentation_data['en'],
                                                                                                  test_size=15000, 
                                                                                                  random_state=42)

In [None]:
train_inp_augmented

148555    Один фермер жаловался: « Не деревья, а колы то...
647330              Теперь оно тоже должно формироваться *.
53071                                            Честность.
821852    Из любви Бог предоставил выкуп, открывающий пу...
786097                        [ Иллюстрация на странице 20]
                                ...                        
267976    Дети вскоре замечают, что животные спариваются...
380780    Родственники были встревожены и пытались воспр...
135414                                                 ПЕРУ
692379    Именно любовь к духовным братьям и сестрам поб...
125211    Как по телефону можно позвонить в другой конец...
Name: ru, Length: 979868, dtype: object

In [None]:
train_out_augmented

148555    One grower lamented: “The trees are just spike...
647330                      These too must now be molded. *
53071                                             N., Japan
821852    Love moved God to provide a ransom that would ...
786097                                [ Picture on page 20]
                                ...                        
267976    The process of mating is soon noticed, followe...
380780    Our relatives were alarmed and began to oppose...
135414                                                ANDES
692379    Love for his spiritual brothers and sisters mo...
125211    Just as a telephone enables you to talk to som...
Name: en, Length: 979868, dtype: object

In [None]:
dev_inp_augmented

831082             10 — Как Эдом был « истреблен навсегда »?
689117     « А слушающий меня [Божью мудрость] будет жить...
348215     Попробуй взглянуть на ситуацию так: представь,...
1020651                             Видения пророка Захарии.
426313     Затем Иегова начал „творить чудеса среди них“,...
                                 ...                        
519901     Они действительно принесли всю десятину в дом ...
874976            У него было пристрастие к табаку и гашишу.
849106     Одной сестре, которая активно служила Иегове о...
233903     ДЖОН, о котором упоминалось в первой статье, п...
544567     Она настаивала, что вы – коммунисты, о чем гов...
Name: ru, Length: 15000, dtype: object

In [None]:
dev_out_augmented

831082     vs 10 ​ — How was Edom “cut off to time indefi...
689117     “ As for the one listening to [godly wisdom], ...
348215     Look at it this way: Suppose you’re going on a...
1020651             Chariots and a Crown Safeguard You, Oct.
426313     Jehovah then started to ‘ do wonderful things ...
                                 ...                        
519901     They have truly brought the whole tithe into t...
874976     He was addicted to tobacco and hashish, which ...
849106     For instance, a sister who served Jehovah acti...
233903     JOHN, mentioned in the preceding article, beca...
544567     She insisted that you are Communists, as the C...
Name: en, Length: 15000, dtype: object

**Back-translation**

In [None]:
sample_0 = augmentation_data['en'][0]
sample_0

'World War I'

In [None]:
reversed_model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
reversed_model = AutoModelForSeq2SeqLM.from_pretrained(reversed_model_checkpoint).to('cuda') # 'cuda' 

In [None]:
reversed_model.translate(sample_0)

In [None]:
def parse_ruencorp(train_inp=train_inp_augmented, dev_inp=dev_inp_augmented, train_out=train_out_augmented, dev_out=dev_out_augmented, mode='train'):
    if mode == 'train':
        for ru_sample, en_sample in zip(train_inp_augmented, train_out_augmented):
            yield {"translation": {"ru": ru_sample, "en": en_sample}}
    elif mode == 'test':
        for src_line, dst_line in zip(dev_inp_augmented, dev_out_augmented):
            yield {"translation": {"ru": src_line, "en": dst_line}}

In [None]:
train_df = pd.DataFrame(parse_ruencorp(mode='train'))
test_df = pd.DataFrame(parse_ruencorp(mode='test'))

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

raw_datasets_augmentation = DatasetDict({'train': train_dataset, 
                                         'test': test_dataset})

In [None]:
raw_datasets_augmentation

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 979868
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 15000
    })
})

In [None]:
tokenized_datasets_augmentation = raw_datasets_augmentation.map(preprocess_function, batched=True)

In [None]:
tokenized_datasets_augmentation

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 994868
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
raw_datasets_augmentation['train'][0]

{'translation': {'en': 'World War I', 'ru': 'Первая мировая война'}}

## Dataloaders for augmentation

In [None]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-augmented-{source_lang}-to-{target_lang}",
    logging_strategy="steps",
    logging_steps=150,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    num_train_epochs=2,
    predict_with_generate=True,
    report_to="wandb",
)
    

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## Fine-tuning on autgmented data

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_augmentation["train"],
    eval_dataset=tokenized_datasets_augmentation["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# train on augmented (999k), validation on hotel descriptions (3k)
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Meteor,Rouge1 Fmeasure,Rouge1 Precision,Rouge1 Recall,Rouge2 Fmeasure,Rouge2 Precision,Rouge2 Recall,Rougel Fmeasure,Rougel Precision,Rougel Recall,Rougelsum Fmeasure,Rougelsum Precision,Rougelsum Recall,Gen Len
1000,2.1972,1.664838,25.6799,0.6478,0.6129,0.6321,0.6095,0.3949,0.4077,0.3931,0.5459,0.5628,0.5429,0.5503,0.5675,0.5469,23.092
2000,2.1101,1.897227,24.4881,0.6402,0.6067,0.6269,0.6014,0.3849,0.3981,0.3819,0.5389,0.5566,0.5344,0.5433,0.5614,0.5385,22.9893


Step,Training Loss,Validation Loss,Bleu,Meteor,Rouge1 Fmeasure,Rouge1 Precision,Rouge1 Recall,Rouge2 Fmeasure,Rouge2 Precision,Rouge2 Recall,Rougel Fmeasure,Rougel Precision,Rougel Recall,Rougelsum Fmeasure,Rougelsum Precision,Rougelsum Recall,Gen Len
1000,2.1972,1.664838,25.6799,0.6478,0.6129,0.6321,0.6095,0.3949,0.4077,0.3931,0.5459,0.5628,0.5429,0.5503,0.5675,0.5469,23.092
2000,2.1101,1.897227,24.4881,0.6402,0.6067,0.6269,0.6014,0.3849,0.3981,0.3819,0.5389,0.5566,0.5344,0.5433,0.5614,0.5385,22.9893
3000,2.0304,2.018572,22.9299,0.631,0.5945,0.615,0.5892,0.3667,0.3798,0.3637,0.5249,0.5431,0.52,0.529,0.5476,0.5239,23.1243
4000,2.0259,2.127951,21.751,0.624,0.5856,0.607,0.5795,0.3543,0.3678,0.3505,0.5164,0.5355,0.5106,0.5207,0.5402,0.5147,23.062
5000,1.9803,2.198995,21.2035,0.6176,0.58,0.5997,0.5758,0.3503,0.3634,0.3474,0.5115,0.5291,0.5075,0.5156,0.5336,0.5113,23.214
6000,1.9111,2.299289,20.5521,0.6143,0.5741,0.5938,0.5698,0.3403,0.353,0.3376,0.5055,0.523,0.5015,0.5092,0.5271,0.505,23.1997


KeyboardInterrupt: ignored

translations after augmenting

In [None]:
translate('поездка до аэропорта домодедово займет 50 минут .')

['It takes 50 minutes to drive to Domodedadogo airport.']

In [None]:
translate('по утрам гостям сервируют сладкий завтрак « шведский стол »')

['A sweet breakfast is served buffet buffet']

In [None]:
translate('трехзвездочный отель idea piacenza расположен всего в 200 метрах от съезда')

['3 - star lea piacenza hotel is located only 200 meters [200 m] from the convention']

In [None]:
translate('до красивого средневекового города ротенбург - об - дер - таубер - всего 7 км .')

['The beautiful medieval town of Rottenburg - ob - der - Tauber is only 7 km away.']

In [None]:
translate('расстояние до бангкока составляет 180 км .')

['The distance to bangcock is 180 km.']

In [None]:
translate('на территории также имеется бассейн с большой солнечной террасой и принадлежности для барбекю .')

['There is also a pool with a large sun terrace and barbecue facilities.']

In [None]:
from tqdm import tqdm

references = []
predictions = []

with open('augmentation-all_3k_eval_translations.txt','w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp, dev_out)):
        translated_inp = translate(inp_line)
        inp = inp_line.replace('@@ ', '')
        trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
        predictions.append(trans)

        out = out_line.replace('@@ ', '')
        references.append(out)
        
        f_trans.write(f'input line: {inp}' + '\n')
        f_trans.write(f'translated line: {trans}' + '\n')
        f_trans.write(f'target line: {out}' + '\n')
        f_trans.write('\n')

3000it [15:04,  3.32it/s]


# Fine-tuning on augmented ``ru-en-names_data``

## Data prep

In [None]:
ru_en_names = pd.read_csv('ru-en-names-augment.txt', delimiter='\t', header=None, names=['ru','en'])
ru_en_names.head(10)

Unnamed: 0,ru,en
0,Явидово,Yavidovo
1,Ступнево,Stupnevo
2,Стукшино,Stukshino
3,Строевичи,Stroyevichi
4,Страна Советов,Strana Sovetov
5,Урочище Ставрово,Urochishche Stavrovo
6,Старое Малиново,Staroye Malinovo
7,Старое Китово,Staroye Kitovo
8,Сотский,Sotskiy
9,Слапихино,Slapikhino


In [None]:
ru_names = ru_en_names.ru.to_list()
en_names = ru_en_names.en.to_list()

In [None]:
print(ru_names[:5])
print(en_names[:5])

['Явидово', 'Ступнево', 'Стукшино', 'Строевичи', 'Страна Советов']
['Yavidovo', 'Stupnevo', 'Stukshino', 'Stroyevichi', 'Strana Sovetov']


In [None]:
from datasets import load_metric
metric = load_metric("sacrebleu")

from datasets import Dataset, DatasetDict
from datasets import load_dataset, load_metric

def parse_ruencorp(train_inp=ru_names, dev_inp=dev_inp, train_out=en_names, dev_out=dev_out, mode='train'):
    if mode == 'train':
        for ru_name, en_name in zip(ru_names, en_names):
            yield {"translation": {"ru": ru_name, "en": en_name}}
    elif mode == 'test':
        for src_line, dst_line in zip(dev_inp, dev_out):
            yield {"translation": {"ru": src_line, "en": dst_line}}

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
train_df = pd.DataFrame(parse_ruencorp(mode='train'))
test_df = pd.DataFrame(parse_ruencorp(mode='test'))

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

raw_datasets_augmentation = DatasetDict({'train': train_dataset, 
                                         'test': test_dataset})

In [None]:
raw_datasets_augmentation

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 187459
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [None]:
raw_datasets_augmentation['train'][0]

{'translation': {'en': 'Yavidovo', 'ru': 'Явидово'}}

In [None]:
# check len distribution after 

In [None]:
max_input_length = 128
max_target_length = 128
source_lang = "ru"
target_lang = "en"
def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples['translation']]
    targets = [ex[target_lang] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets_augmented = raw_datasets_augmentation.map(preprocess_function, batched=True)

Map:   0%|          | 0/187459 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets_augmented

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 187459
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

## Dataloaders for fine-tuning

In [None]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    logging_strategy="steps",
    logging_steps=150,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    num_train_epochs=2,
    predict_with_generate=True,
    report_to="wandb",
)
    

PyTorch: setting up devices


In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Fine-tuning on autgmented data

In [None]:
translate('Водохранилище Селиховское')

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['the sil@@ kh@@ ov@@ skaya water storage room']

In [None]:
ru_names[:5]

['Явидово', 'Ступнево', 'Стукшино', 'Строевичи', 'Страна Советов']

In [None]:
translate(ru_names[:5])

['I can see it.',
 'Stopnevo',
 'Stukchino',
 'Strojevichs',
 'Country of the Councils']

In [None]:
ru_names[-5:]

['Озеро Полевое', 'Урочище Битюг', 'Гнилушка', 'Урочище Деево', 'Аркаим']

In [None]:
translate(ru_names[-5:])

['Field Lake',
 'The Beatug Beautiful',
 "You're a rotten bastard.",
 "Dejevo's Daylight.",
 'Arkaim']

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_augmented["train"],
    eval_dataset=tokenized_datasets_augmented["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 187459
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 23434
  Number of trainable parameters = 76147712
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Bleu,Gen Len
6000,0.3407,2.899605,11.8269,26.5223


The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 16
Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,

KeyboardInterrupt: ignored

In [None]:
for inp_line, out_line in tqdm(zip(dev_inp[:500], dev_out[:500])):
    translated_inp = translate(inp_line)
    inp = inp_line.replace('@@ ', '')
    trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
    predictions.append(trans)

    out = out_line.replace('@@ ', '')
    references.append(out)
    
    print(f'input line: {inp}' + '\n')
    print(f'translated line: {trans}' + '\n')
    print(f'target line: {out}' + '\n')
    print()
    # f_trans.write(f'input line: {inp}' + '\n')
    # f_trans.write(f'translated line: {trans}' + '\n')
    # f_trans.write(f'target line: {out}' + '\n')
    # f_trans.write('\n')

0it [00:00, ?it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.30it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в распоряжении гостей общая кухня и общая гостиная .

translated line: ostanovochnyy punkt punkt

target line: a shared equipped kitchen and a common living room are provided to guests .




2it [00:01,  1.13it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на территории виллы shengsi huajing находится сад и терраса .

translated line: villy shengsi i huazhing features a garden and terrasa.

target line: at shengsi huajing villa you will find a garden and a terrace .




3it [00:02,  1.05s/it]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние от отеля libuše до ближайшей станции метро kobylisy ( линия с ), от которой можно добраться до центрального железнодорожного вокзала праги и центра города , составляет 500 метров .

translated line: stantsiya libuge station kobylisy station (line c ) is 500 metres away.

target line: the nearest metro station at kobylisy , on line c , is set 500 metres from hotel libuše , and it offers connections towards prague main train station and the centre of the city .




4it [00:03,  1.10it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: во всех зонах общественного пользования предоставляется бесплатный wifi .

translated line: in all public areas, free wifi access is available.

target line: free wi - fi access is available in all public areas .




5it [00:04,  1.08it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в нескольких минутах ходьбы от отеля расположены центр кемера и порт .

translated line: punkt kemera i port within a 5 - minute walk of the property.

target line: within walking distance , guests can reach the centre and harbour of kemer .




6it [00:06,  1.21s/it]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: персонал предоставит вам полезную туристическую информацию и внимательное обслуживание , необходимое для расслабленного отдыха .

translated line: ostanovochnyy punkt ostanovochnyy punkt

target line: staff provide lots of helpful tourist information and an attentive service , making you feel at home .




7it [00:06,  1.00s/it]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на всей территории апартаментов работает бесплатный wi - fi .

translated line: partamentov features free wifi throughout the property.

target line: free wifi is offered throughout the property .




8it [00:07,  1.27it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до монако составляет 1 км . для гостей организуются услуги трансфера .

translated line: monako is 1 km away and shuttle services are available for guests.

target line: monaco is 1 km from the apartments and there is a shuttle service available .




9it [00:07,  1.61it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: к услугам гостей ресторан , а также номера с телевизором с плоским экраном и бесплатным доступом в интернет .

translated line: ostanovochnyy punkt punkt

target line: featuring a restaurant , it offers rooms with flat - screen tvs and free internet .




10it [00:08,  1.70it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: этот отель расположен в 10 минутах езды от границы с сша и международного аэропорта тихуаны .

translated line: ostanovochnyy punkt

target line: this hotel is within a 10 - minute drive of the us border and from the tijuana international airport .




11it [00:08,  1.99it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в числе других удобств - продуктовые поставки и снэк - бар .

translated line: ukhdobstov-techniy postavki i snekbar

target line: other facilities offered at the property include grocery deliveries and a snack bar .




12it [00:08,  2.17it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: за 5 минут можно дойти до гондольного подъемника каскейд .

translated line: gondol'nogo punkt kaskeyd can be reached within 5 minutes.

target line: the cascade gondola is 5 minutes ' walk away .




13it [00:09,  1.98it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: кондиционер , сейф , фен , телефон с прямым набором номера , а также круглосуточная стойка регистрации .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt

target line: bathroom , air - co , private safe , hairdryer , direct dial phone , plus a 24 hr reception .




14it [00:10,  1.65it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в номерах отеля residencial melba есть ванная комната и спутниковое телевидение .

translated line: ostanovochnyy punkt ostanovochnyy punkt vannanyy punkt and satellite tv.

target line: guest rooms of the residencial melba have a private bathroom and satellite tv .




15it [00:10,  1.75it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

16it [00:10,  2.19it/s]

input line: апартаменты располагают гостиной зоной , телевизором и балконом .

translated line: ostanovochnyy punkt ostanovochnyy punkt

target line: accommodation will provide you with a tv , a balcony and a seating area .


input line: до парка « биртри » можно доехать за 30 минут .

translated line: birtri park is 30 minutes'drive away.

target line: bear tree park is a 30 - minute drive away .




Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

17it [00:11,  2.38it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: каждое утро в шале сервируют континентальный завтрак , состоящий из яиц и домашнего джема .

translated line: chale serves a continental breakfast consisting of yaits and homemade jams every morning.

target line: a continental breakfast including eggs and homemade jams is provided every morning .




18it [00:11,  2.68it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в общей ванной комнате установлена ванна или душ .

translated line: dovgoye vannoyal includes a vanna or shower.

target line: there is a shared bathroom with a bath or shower .




19it [00:11,  3.05it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до аэропорта риека составляет 10 км .

translated line: rirekha airport is 10 km away.

target line: the nearest airport is rijeka airport , 10 km from the property .




20it [00:12,  2.37it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: мотель inn adams находится примерно в 4 , 8 км от тематического парка adventureland и в 8 , 8 км от поля для гольфа highland oaks .

translated line: motel inn-aadams is around 4. 8 km from thematic park advenureland and 8. 8 km from gol'fa gikhland oaks

target line: inn adams is about 3 miles from adventureland theme park , and is 5 . 5 miles from highland oaks golf course .




21it [00:12,  2.63it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в распоряжении гостей отдельные номера в разных зданиях и доступ в интернет на всей территории .

translated line: there are separate rooms in different buildings and internet access throughout the property.

target line: internet access is available in all areas .




22it [00:13,  2.60it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в ванной комнате предоставляется фен и бесплатные туалетно - косметические принадлежности .

translated line: vannoy room includes a hairdryer and free toiletnonono-kosmeticheskiy.

target line: the bathroom includes a hairdryer and free toiletries .




23it [00:13,  2.31it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: гостевой дом casa brasileira с общей кухней , прачечной и лаунджем расположен в центре города куритиба , всего в 200 метрах от площади эспанья .

translated line: gostevoy kasabrasileira is located in the centre of kuritiba, just 200 metres from espan'ya square.

target line: centrally located in curitiba , just 200 metres from espanha square , casa brasileira features a communal kitchen , laundry and living room .




24it [00:13,  2.36it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: по предварительному заказу гости могут принять сенные ванны и посетить другие косметические процедуры .

translated line: pristan' stantsiya punkt punkt vannyes and kosmet'iches

target line: hay baths and cosmetic treatments can also be booked .




25it [00:14,  2.54it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: к услугам гостей ежедневный домашний завтрак и общий зал с телевизором .

translated line: it features a daily homemade breakfast and a shared room with a tv.

target line: a daily homemade breakfast is offered and there is a shared tv room .




26it [00:15,  1.91it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: поместье dunkeld manor расположено в зеленом пригороде йоханнесбурга данкельде и предлагает гостям тихий отдых всего в 10 минутах езды от центра йоханнесбурга .

translated line: pomenst'ye dunkeld manor is located in greennyy prigo-yokhanesburga dankel'de.

target line: situated in the tree - lined suburbs of dunkeld , dunkeld manor offers a quiet retreat just a 10 - minute drive from the johannes city centre .




27it [00:15,  1.84it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на окрестных улицах размещаются бары , рестораны и кафе , а на пляже работают пункты проката велосипедов и снаряжения для серферов .

translated line: okrest'nyy street with bars, restaurants and cafes. punkty bike and serferov equipment are available on the beach.

target line: bars , restaurants and cafés can be found in the surrounding streets . surf and bike rental shops can be found along the beach .




28it [00:16,  2.09it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в распоряжении гостей также бесплатная частная парковка и номера с чайником .

translated line: there is free private parking at the property.

target line: it offers free private parking and rooms with a kettle .




29it [00:16,  2.28it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на территории обустроена бесплатная частная парковка .

translated line: ooustroena provides free private parking onsite.

target line: free private parking is available on site .




30it [00:16,  2.06it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: эти апартаменты с собственной кухней находятся в городе савонлинна , в 3 км от крепости олафсборг .

translated line: these self-yukhney apartments are located in savonlinna, 3 km from olafsburg.

target line: this savonlinna property offers self - catering accommodation within 3 km of olavinlinna castle .




31it [00:18,  1.51it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: дом сконструирован таким образом , что стоит непосредственно над водой . юные гости весело проведут время на детской игровой площадке .

translated line: skonstruirovanka-obrazom, which stands directly above the water.

target line: it features a lovely construction over the water and a children ’ s playground .




32it [00:18,  1.55it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: номера в отеле regina оснащены собственной ванной комнатой с ванной или душем . гости могут посмотреть программы кабельного телевидения .

translated line: nomers at the regina have a private bathroom with a vannoy or shower, while cablenogo tv is available.

target line: the guest rooms at the regina are equipped with cable tv and a private bathroom with bath or shower .




33it [00:18,  1.79it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: к услугам гостей бесплатный wifi на всей территории и бесплатная ночная автомобильная экскурсия по городу палермо .

translated line: ostanovochnyy punkt punkt pallermo

target line: it features self - catering accommodation with free wi - fi throughout and a free car tour of palermo city by night .




34it [00:19,  2.08it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: этот отель находится в киеве , в 250 метрах от станции метро « осокорки ».

translated line: stantsiya osokokhki station, set in kiyev'ye.

target line: located in kiev , this hotel is 250 metres from osokorky metro station .




35it [00:19,  2.06it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: апартаменты ela находятся в 1 , 9 км от музея античного стекла и дворца генерал - губернатора .

translated line: ostanovochnyy punkt gubernatara

target line: the museum of ancient glass is 1 . 9 km from apartment ela , while palace of the governor general is 1 . 9 km away .




36it [00:20,  2.47it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до международного аэропорта пхукета составляет 35 км .

translated line: pkhuketa international airport is 35 km away.

target line: phuket international airport is 35 km away .




37it [00:20,  2.66it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: по вечерам организуется живая музыка и развлечения .

translated line: zhivaya music and razvlecheniy

target line: live music and entertainment are organised during the evening .




38it [00:20,  2.15it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: поездка до музея искусств , науки и техники в здании бывшего бассейна занимает 18 минут .

translated line: stantsiya ostanovochnyy punkt uchebtsativnyy punkt ostanovochnyy punkt punkt 18 minutes.

target line: la piscine art and science museum is a 18 - minute drive away .




39it [00:21,  2.30it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: также в распоряжении гостей телевизор и собственная ванная комната с душем и биде .

translated line: there is also a tv and a private bathroom with shower andbide.

target line: a tv is available . there is a private bathroom with a bidet and shower .




40it [00:21,  2.25it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: центр бодрума находится в 23 км от комплекса , а расстояние до аэропорта милас - бодрум составляет 60 км .

translated line: bedruma center is 23 km from the property, while milas-bodrum airport is 60 km away.

target line: bodrum city centre is 23 km from the property . milas - bodrum airport is 60 km away .




41it [00:22,  2.64it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: гостям предоставляется бесплатная парковка на территории отеля .

translated line: gost'ye offers free parking on site.

target line: free private parking is possible on site .




42it [00:22,  2.37it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: среди прочих удобств сад , бесплатный wifi и бесплатная частная парковка на территории .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt

target line: additional features include a garden , free wi - fi access and free private parking on site .




43it [00:22,  2.74it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: гостям апартаментов « на шарикоподшипниковской » предоставляются полотенца и постельное белье .

translated line: gostii ostanovochnyy polotentsa

target line: towels and bed linen are available at apartment na sharikopodshipnikovskoy .




44it [00:23,  2.62it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние от номеров aleksandra с проживанием в семье до пляжа яз составляет 3 , 6 км , а до курорта свети - стефан — 6 км .

translated line: ostanovochnyy punkt alek-sandra is 3. 6 km away and 6 km away.

target line: jaz beach is 3 . 6 km from homestay aleksandra , while sveti stefan is 6 km from the property .




45it [00:23,  2.36it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: комплекс privat maria находится в городе липтовски ян , в 1 км от термального комплекса termal paradise и располагает апартаментами с собственной кухней , садом с принадлежностями для приготовления барбекю и террасой .

translated line: priivat maria is located in liptovski-yan, 1 km away from termal podise teratel'nyy and offers self - catering accommodation with gardens with barbekus and terrasoy.

target line: situated in the town of liptovský ján and the thermal paradise liptovsky jan reachable within 1 km , privat mária offers a self - catered accommodation , a garden with barbecue facilities and a terrace .




46it [00:23,  2.70it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: все номера отеля on chekhova оформлены в классическом стиле и оснащены рабочим столом и ванной комнатой с душем .

translated line: otel' on khekhova has a work desk and ennogo room with shower.

target line: each room at hotel on chekhova is decorated in a classic style and includes a work desk . a shower is provided in the bathrooms .




47it [00:24,  3.04it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: все номера уникально оформлены . в них работает бесплатный wi - fi .

translated line: stantsiya unikal'no is decorated with free wifi.

target line: all unique guest rooms offer free wi - fi .




48it [00:24,  2.91it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: поездка до городов котка и хамина занимает 1 час , а до городов ловийса и коувола — полчаса .

translated line: kot'ka i khamina is 1 hour  drive away. loviysa and koupol'a are 1 hours  drive away.

target line: kotka and hamina can be reached by car in about 1 hour and loviisa kouvola in half an hour .




49it [00:24,  3.16it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

50it [00:25,  3.62it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: некоторые из них имеют собственные ванные комнаты , некоторые располагают доступом к общей ванной комнате .

translated line: some have private bathrooms and some have access to shared bathrooms.

target line: some have private and some shared bathroom facilities .


input line: расстояние до аэропорта вроцлав - коперник составляет 88 км .

translated line: vtrotslav-kopernik airport is 88 km away.

target line: wroclaw – copernicus airport is 88 km away .




51it [00:25,  3.80it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на террасе также можно заказать напитки и полюбоваться видом на горы .

translated line: ostanovochnyy punkt ostanovochnyy punkt terasa

target line: you can also relax with drinks and enjoy the mountain views .




52it [00:25,  4.06it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до ближайших ресторанов составляет 50 метров .

translated line: stantsaranov is 50 metres away.

target line: restaurants are located 50 metres away .




53it [00:25,  4.08it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в числе удобств бесплатный wi - fi и бесплатная парковка .

translated line: udobstove free vi-fi and free parking spaces.

target line: just 10 minutes from newcastle , there is free wi - fi and free parking .




54it [00:26,  3.25it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

55it [00:26,  3.83it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: апартаменты victoria one находятся в пределах пешей досягаемости от некоторых театров , а также всего в нескольких минутах ходьбы от площади винсент - сквер .

translated line: ostanovochnyy punkt victoriye onee is within walking distance of some teatrovs. vinsent-skver square is just minutes  walk away.

target line: a selection of theatres are within easy walking distance from victoria one , and vincent square is just a few minutes ’ walk away .


input line: расстояние до знаменитых ворот адриана составляет менее 100 метров .

translated line: stantsiya adriana

target line: antalya ' s iconic hadrianus gate can be found less than 100 metres away .




56it [00:26,  3.29it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на круглосуточной стойке регистрации работает камера хранения багажа , а с 20 : 00 до 02 : 00 можно заказать услуги массажа .

translated line: stantsiya ostanovochnyy punkt 24 hours a day. massage service can be arranged from 20 : 00 to 06 : 00 : 00.

target line: massages can be arranged from 20 : 00 - 02 : 00 at the 24 - hour reception , which can also store luggage for guests .




57it [00:27,  2.90it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: к услугам гостей крытый бассейн , который открыт круглый год , и гидромассажная ванна .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt

target line: a year round indoor pool and hot tub are offered on - site .




58it [00:27,  2.96it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние от гостевого дома fazenda virá до водопада прудентополис составляет 70 км .

translated line: urochishche urochishche prudentopolis 70 km

target line: prudentópolis waterfall is 70 km from fazenda virá .




59it [00:27,  3.06it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до мемориального парка с кораблем вмс сша « алабама » составляет 16 км .

translated line: memorial'nogo parks korablem vmass ssha alabama is 16 km away.

target line: the uss alabama battleship park is 10 miles away .




60it [00:28,  3.00it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до аэропорта лангкави составляет 8 , 9 км , а поездка до паромного терминала лангкави займет 30 минут .

translated line: langkav'i airport is 8. 9 km away and it takes 30 minutes by car to ferogogonyy pangkavy

target line: langkawi airport is 8 . 9 km from the property while langkawi ferry terminal is a 30 - minute drive away .




61it [00:28,  2.68it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в апарт - отеле olympic tower somerset к услугам гостей красиво оформленные апартаменты с открытой планировкой , качественной мебелью в минималистском стиле , роскошным декором и полностью оборудованной кухней .

translated line: olympictototoversomerset provides charmingly decorated apartments with an open plan.

target line: coming with open - plan layout and quality furniture of minimalist style , the beautifully furnished apartments at olympic tower somerset feature luxurious décor and full kitchen facilities .




62it [00:29,  2.45it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: отель springhill suites находится в 16 км от парка аттракционов ноттс - берри - фарм и в 6 , 4 км от стадиона эйнджел в городе анахайме .

translated line: springill suites is 16 km from attrapritsativnyov notts-berri-pharm, 6. 4 km from eyndzhel' in anakhayme

target line: knott ' s berry farm is 16 km away from springhill suites . angel stadium of anaheim is 6 . 4 km away .




63it [00:29,  2.31it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

64it [00:29,  2.78it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: просторные виллы курорта payogan в балийском стиле находятся в окружении тропических садов . они оснащены телевизорами с плоским экраном , спутниковым телевидением и dvd - плеерами .

translated line: prosynyy villas of payogan in balisk styla, surrounded by tropical rays, come with flat - screen tvs, satellite tvs and dvd-leyerami.

target line: surrounded by tropical gardens , the spacious balinese - style villas at payogan come with flat - screen satellite tvs and dvd players .


input line: апартаменты также располагают кухней с обеденной зоной .

translated line: ostanovochnyy punkt gukhney.

target line: the apartments also include a kitchen with dining area .




65it [00:29,  3.23it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

66it [00:30,  3.67it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: напитки можно заказать в баре в течение дня .

translated line: beveretsy can be enjoyed at the bar during the day.

target line: there is also a bar offering drinks during the day .


input line: музей « уолтонс маунтин » находится в 30 км от мотеля .

translated line: voltons mountin museum is 30 km from motel'

target line: the walton ' s mountain museum is 19 miles from the motel .




67it [00:30,  2.44it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: четырехзвездочный отель holiday inn bursa расположен в 16 км от исторического центра бурсы , в горукле — студенческом городке университета улудаг . окна с одной стороны отеля выходят на сосновый лес , а с другой — на гору улудаг .

translated line: 4 4zvezdochnyy hotel kholiday inn bursa is 16 km from the historic centre of bursy. ingoerle-studendechskkom of uludag university

target line: located 16km from the historical centre of bursa , the holiday inn is a 4 - star hotel on the görükle campus of uludag university . the hotel is surrounded by a pine forest on one side and overlooks uludag mountain on the other .




68it [00:31,  2.71it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: предоставляется постельное белье .

translated line: ostanovochnyy punkt zadel'noye

target line: bed linen is featured .




69it [00:31,  3.04it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние от апартаментов terranova до аэропорта трапани составляет 12 км .

translated line: tapani airport is 12 km away.

target line: the nearest airport is trapani airport , 12 km from terranova appartamenti .




70it [00:31,  2.79it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: апартаменты « на невского , 51а » находятся в калининграде , в 800 м от района марауненхоф .

translated line: ostanovochnyy punkt aartement na nepskogo 51a is located in kaliningrade, 800 metres from maraunenkhof.

target line: apartment at nevskogo 51a offers accommodation in kaliningrad . the apartment is 800 metres from maraunenhof .




71it [00:32,  2.57it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: отель president kovilovo расположен в тихом месте , в 8 км от центра белграда . к услугам гостей открытый бассейн , спа - салон и оздоровительный центр .

translated line: pristan' president kovilovo is set in a quiet setting 8 km from downtown belgrada. it features an outdoor pool, spa salon and wellness centre.

target line: offering an outdoor swimming pool and a spa and wellness centre , hotel president kovilovo enjoys a quiet location , 8 km from the centre of belgrade .




72it [00:33,  1.83it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: она дает право на бесплатное пользование канатной дорогой и автобусами на всей территории долины отцталь , а также на бесплатное посещение развлекательного парка ареа 47 , бассейнов , поля для мини - гольфа и других достопримечательностей .

translated line: it offers free use of the canatnoy dorogoy, as well as the autobusam throughout the bol'nogo ottstal'. there is also free entry to the entertainment park area 47, the swimming pools, minimali-gol'fa course and other sites of interest.

target line: it entitles the holder to free use of cable cars and buses in the entire ötztal valley and offers free access to the area 47 adventure park , swimming pools , mini golf and much more .




73it [00:34,  1.47it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: семейный отель la maison borann расположен в городе сиемреап . в этом отеле прекрасно сочетаются кхмерское и европейское гостеприимство . гости оценят возможность отдохнуть вдали от шумных улиц , а также освежиться купанием в открытом бассейне с морской водой .

translated line: urochishche familyhotel'lamason borann is located in siemreap. kkhmerskoye and egopeyskoye gosttepriimstvo.

target line: offering a retreat from the busy streets of siem reap , the family - owned la maison borann boutique hotel features a cooling outdoor saltwater pool .




74it [00:34,  1.78it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на территории обустроена детская игровая площадка и работает ресторан .

translated line: obustroena features a children's playground and there is a restaurant on site.

target line: hotel ashwini lodge also includes a children ' s playground . guests can enjoy a meal at the on - site restaurant .




75it [00:34,  2.09it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: на территории апартаментов rose обустроены сад и площадка для барбекю , а также работает снэк - бар .

translated line: ostanovochnyy punkt

target line: at rose apartment you will find a garden , barbecue facilities and a snack bar .




76it [00:35,  1.97it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в зонах общественного пользования работает бесплатный wifi , а на территории этого курортного отеля можно заняться такими видами активного отдыха , как сноркелинг и гребля на каноэ .

translated line: ostanovochnyy punkt public areas include free wifi, while onsite activities like snorkeling and groblya na kanoye can be enjoyed on site.

target line: free wi - fi access is available in the public areas , while activities such as snorkelling and canoeing can be enjoyed on site .




77it [00:35,  1.92it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: за дополнительную плату предоставляется трансфер от / до аэропорта дубровника , который расположен в 23 км от апартаментов .

translated line: ostanovnyy punkt ostanovnyy punkt punkt dubrovnika airport, 23 km away.

target line: dubrovnik airport is 23 km away and shuttle can be arranged at a surcharge .




78it [00:36,  1.80it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: отель « мир невский » расположен в центре санкт - петербурга , в 7 минутах ходьбы от дворцовой площади и эрмитажа .

translated line: ostanovochnyy punkt ostanovochnyy punkt

target line: mir nevsky hotel is located in the heart of saint petersburg , a 7 - minute walk from the palace square and the state hermitage museum .




79it [00:37,  1.67it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние до международного аэропорта имени никоса казандзакиса составляет 70 км .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt 70 km away

target line: nikos kazantzakis airport is located 70 km away .




80it [00:38,  1.50it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: отель типа « постель и завтрак » l ' antico borgo расположен на склоне холма в средневековом городке доссо , в 10 минутах езды от пляжа леванто .

translated line: ostanovochnyy punkt ostanovochnyy punkt l'antiko borgo

target line: located 10 minutes ' drive from levanto ’ s beach , l ' antico borgo b & b is set in the medieval hillside town of dosso .




81it [00:38,  1.88it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

82it [00:38,  2.37it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: собственная ванная комната укомплектована феном .

translated line: sovotel'nyy vannaya room includes a hairdryer.

target line: private bathrooms include a hairdryer .


input line: в некоторых номерах установлены сейф и рабочий стол .

translated line: some units include a safe and work desk.

target line: some rooms have a safe and a work desk .




83it [00:38,  2.49it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

84it [00:38,  3.06it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: из номеров открывается вид на бассейн .

translated line: ostanovochnyy punkt ostanovochnyy punkt

target line: you can enjoy pool view from the room .


input line: в числе удобств каждого номера — телевизор и собственная ванная комната .

translated line: every room includes a tv and a private bathroom.

target line: each room includes a tv . every room is equipped with a private bathroom .




85it [00:39,  2.99it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: апартаменты l ' ora di torbole 1 находятся в 47 км от города сирмионе и в 40 км от горнолыжного курорта мадонна - ди - кампильо .

translated line: syrmione is 47 km from l'ora di torbole 1, 40 km from magonna di-kampil'vo

target line: sirmione is 47 km from apartment l ' ora di torbole 1 , while madonna di campiglio is 40 km away .




86it [00:39,  3.04it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: продовольственные магазины и рестораны находятся в городе . до реки дордонь 10 минут езды .

translated line: prodol'stvennyye shops and restaurantsorany can be found in gorod dordon'

target line: grocery shops and restaurants can be found in the town and the edge of the dordogne river is a 10 - minute drive away .




87it [00:39,  3.20it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: этот отель расположен в центре города юбы , менее чем в 1 минуте езды от торговой улицы юба саттер .

translated line: set in the centre of yuby, less than 1 minute  drive from yuba satter

target line: this downtown yuba city hotel is less than a 1 - minute drive to yuba sutter mall .




88it [00:40,  2.97it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в отеле имеется открытый бассейн , и организуются уроки йоги и танцев .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt

target line: it offers an outdoor pool and activities like yoga and dance classes .




89it [00:40,  3.31it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: вы также можете заказать сеанс массажа или позагорать на террасе у бассейна .

translated line: you can also order a massage session or dance on the terrasa by the pool.

target line: for relaxation , guests can schedule a massage or sunbathe on the poolside terrace .




90it [00:40,  3.25it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: гостей приглашают отдохнуть с книгой или журналом в библиотеке и поиграть в бильярд .

translated line: ostanovochnyy punkt

target line: guests can relax with a book or magazine in the library or play billiards .




91it [00:41,  3.31it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: он расположен в 300 метрах от старого города мосбаха , который окружен лесом оденвальд .

translated line: it is 300 metres from mosbakha, an old town surrounded by forest odenval'd

target line: it is 300 metres from mosbach ' s old town , which is surrounded by the odenwald forest .




92it [00:41,  3.55it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: континентальный завтрак сервируется каждое утро .

translated line: ostanovochnyy punkt pervadtsativnyy every morning

target line: a continental breakfast is served daily .




93it [00:41,  2.96it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: расстояние от отеля fairway до национального музея уганды составляет 2 , 5 км , до международного аэропорта энтеббе — 45 км , а до мечети каддафи — 4 км .

translated line: urochishche fairvay punkt national museum ugandy is 2. 5 km away. entebbe international airport is 45 km away. kaddafi is 4 km from the property.

target line: the uganda national museum is within 2 . 5 km of fairway hotel & spa and entebbe international airport is within 45 km . gaddafi mosque is 4 km away from the property .




94it [00:42,  2.89it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: собственная ванная комната с душем укомплектована феном и бесплатными туалетно - косметическими принадлежностями .

translated line: sovotel'nyy vannaya room comes with showers, hair and free toiletnonono-kosmetichestyy

target line: featuring a shower , private bathroom also comes with a hairdryer and free toiletries .




95it [00:42,  3.21it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

96it [00:42,  3.84it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в некоторых номерах есть док - станция для ipod и балкон .

translated line: in some of the rooms, dok station for ipod and balkon

target line: some rooms offer an ipod dock and balcony .


input line: кроме того , в распоряжении гостей ванная комната с душем .

translated line: vannaya room comes with a shower.

target line: the bathroom comes with a shower .




97it [00:43,  2.86it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: этот современный отель расположен недалеко от костариканской столицы сан - хосе . на территории обустроены открытый бассейн и круглосуточный фитнес - центр , работает казино , несколько ресторанов и баров , а также предоставляется бесплатный wi - fi .

translated line: stantsiya near kotarikanskogo san-khose

target line: this modern hotel near costa rica ’ s capital of san jose features a casino , free wi - fi and a 24 - hour fitness centre . guests enjoy several on - site restaurants and bars and an outdoor swimming pool .




98it [00:43,  3.23it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: из окон открывается красивый вид на город . до города таранто 48 км .

translated line: ostanovochnyy punkt punkt 48 km

target line: the property boasts views of the city is 48 km from taranto .




99it [00:43,  3.33it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: номера - студио отеля типа « постель и завтрак » viva pompei оформлены в строгом стиле , а из их окон открывается вид на исторический центр города .

translated line: nomernyy studio otdel' i-brebok viva pompei

target line: studios at b & b viva pompei are simply furnished and overlook the town ’ s historic centre .




100it [00:43,  3.44it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: в числе прочих удобств — dvd - плеер и телевизор с плоским экраном .

translated line: ostanovochnyy punkt dvd-player and a flat - screen tv.

target line: each comes with a dvd player and a flat - screen tv .




101it [00:44,  3.04it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

102it [00:44,  3.56it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

102it [00:44,  2.29it/s]

input line: гости могут посещать сезонный открытый бассейн .

translated line: ostanovochnyy punkt ostanovochnyy punkt ostanovochnyy punkt

target line: other facilities at pearl one include a seasonal outdoor pool .


input line: за несколько минут от отеля можно дойти до множества разных мест развлечений .

translated line: a variety of activities can be reached within minutes of the hotel.

target line: a variety of entertainment options can be explored in a walking distance of the hotel .







KeyboardInterrupt: ignored

In [None]:
from tqdm import tqdm

references = []
predictions = []

with open('augmented-and-fine-tuned-all_3k_eval_translations.txt','w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp[:500], dev_out[:500])):
        translated_inp = translate(inp_line)
        inp = inp_line.replace('@@ ', '')
        trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
        predictions.append(trans)

        out = out_line.replace('@@ ', '')
        references.append(out)
        
        f_trans.write(f'input line: {inp}' + '\n')
        f_trans.write(f'translated line: {trans}' + '\n')
        f_trans.write(f'target line: {out}' + '\n')
        f_trans.write('\n')

0it [00:00, ?it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

1it [00:00,  3.70it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2it [00:00,  3.80it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

3it [00:00,  3.01it/s]Generate config GenerationConfig {
  "b

KeyboardInterrupt: ignored

In [None]:
translate('Водохранилище Селиховское')

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['Vodokhranilishche Selikhovskoye']

In [None]:
ru_names[:5]

['Явидово', 'Ступнево', 'Стукшино', 'Строевичи', 'Страна Советов']

In [None]:
translate(ru_names[:5])

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['Yavidovo',
 'Stupnevo',
 'Stukshino',
 'Stroyevichi',
 'Ostanovochnyy Punkt Sovetov']

In [None]:
ru_names[-5:]

['Озеро Полевое', 'Урочище Битюг', 'Гнилушка', 'Урочище Деево', 'Аркаим']

In [None]:
translate(ru_names[-5:])

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['Ozero Polevoye',
 'Urochishche Bityug',
 'Gnilushka',
 'Urochishche Deyevo',
 'Arkaim']

In [None]:
translate('на проспекте кирова')

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['On Profsektoye Kirova']

In [None]:
translate('расстояние до города тамбор составляет 44 км')

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



['Razvaliny Tambar 44 km']

# Helpful funtions

save model weights as pickle file (and upload it to your disk)

In [None]:
import pickle

# Get the model weights as a dictionary
weights = model.state_dict()

# Save the weights as a pickle file
with open("model_weights_no_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

load .pickle weights from disk.google

In [None]:

# Load the saved weights
with open("model_weights.pkl", "rb") as f:
    saved_weights = pickle.load(f)

# Set the model weights to saved weights
model.load_state_dict(saved_weights)

# Use the loaded model for inference

mount drive.google

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NMT/

/content/drive/MyDrive/NMT


In [None]:
%ls

'=1.3.0'                        [0m[01;34mcheckpoint-10000[0m/   train.bpe.en   utils.py
 all_3k_eval_translations.txt   data.txt            train.bpe.ru   vocab.py
 bpe_rules.en                   log                 train.en       [01;34mwandb[0m/
 bpe_rules.ru                   [01;34m__pycache__[0m/        train.ru


In [None]:
!cp -r /content/opus-mt-ru-en-finetuned-ru-to-en/checkpoint-10000 "/content/drive/MyDrive/NMT/"

In [None]:
!du -h checkpoint-10000

877M	checkpoint-10000


get model weights and check its memory allocation

In [None]:
import pickle

# Get the model weights as a dictionary
weights = model.state_dict()

# Save the weights as a pickle file
with open("model_weights_no_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

In [None]:
%ls

'=1.3.0'        data.txt       train.bpe.en   train.ru
 bpe_rules.en   log            train.bpe.ru   vocab.py
 bpe_rules.ru   [0m[01;34m__pycache__[0m/   train.en


In [None]:
!du -h model_weights.pkl

du: cannot access 'model_weights.pkl': No such file or directory


In [None]:
model.parameters