In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NMT/

/content/drive/MyDrive/NMT


In [None]:
%ls

'=1.3.0'        log                                 train.bpe.ru   vocab.py
 bpe_rules.en   [0m[01;34mopus-mt-ru-en-finetuned-ru-to-en[0m/   train.en
 bpe_rules.ru   [01;34m__pycache__[0m/                        train.ru
 data.txt       train.bpe.en                        utils.py


In [None]:
%%capture
# The requirement must be quoted: unquoted, the shell treats `>=1.3.0` as an output
# redirection, so pip installs an unconstrained torch and writes its log to a file
# literally named "=1.3.0".
!pip3 install "torch>=1.3.0"
# Portable spelling of "send stdout and stderr to ./log" (`&>` is a bash-only shortcut).
!pip3 install subword-nmt > log 2>&1

In [None]:
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# BPE Tokenizing

In [None]:
# Tokenizing & applying BPE rules

# Keep the word-level tokenizer under its own name: the global `tokenizer` is rebound
# to the Hugging Face tokenizer further down, which would silently break tokenize().
word_tokenizer = WordPunctTokenizer()
tokenizer = word_tokenizer  # original name kept for backward compatibility


def tokenize(x):
    """Lowercase `x`, split it with WordPunctTokenizer and re-join the tokens with single spaces."""
    return ' '.join(word_tokenizer.tokenize(x.lower()))

# split and tokenize the data
# data.txt holds one tab-separated sentence pair per line: the first field is written
# to train.en and the second to train.ru, both tokenized and lowercased.
# The input is opened first (and inside the `with`) so that it is always closed and the
# output files are not truncated when data.txt cannot be opened.
with open('data.txt', 'r') as f_data, open('train.en', 'w') as f_src, open('train.ru', 'w') as f_dst:
    for line in f_data:
        src_line, dst_line = line.strip().split('\t')
        f_src.write(tokenize(src_line) + '\n')
        f_dst.write(tokenize(dst_line) + '\n')


# build and apply bpe vocs
# bpe maps a language code ('en' / 'ru') to the BPE instance built from that language's rules.
bpe = {}
for lang in ['en', 'ru']:
    # 1. learn BPE rules from the tokenized corpus; the `with` guarantees the rules
    #    file is flushed and closed before it is read back in step 2
    with open('./train.' + lang) as f_corpus, open('bpe_rules.' + lang, 'w') as f_rules:
        learn_bpe(f_corpus, f_rules, num_symbols=8000)

    # 2. create instance of BPE class from the freshly written rules
    with open('./bpe_rules.' + lang) as f_rules:
        bpe[lang] = BPE(f_rules)

    # 3. apply BPE tokenization to our data, line by line
    with open('train.' + lang) as f_in, open('train.bpe.' + lang, 'w') as f_out:
        for line in f_in:
            f_out.write(bpe[lang].process_line(line.strip()) + '\n')


# Building vocabularies

# Read the BPE-segmented corpora line by line. Iterating over the file (instead of
# read().split('\n')) avoids the spurious empty last element produced by the trailing
# newline, which previously added one empty sentence pair to the data.
with open('./train.bpe.ru') as f_ru, open('./train.bpe.en') as f_en:
    data_inp = np.array([line.rstrip('\n') for line in f_ru])
    data_out = np.array([line.rstrip('\n') for line in f_en])

# Both files are written in lockstep, so a length mismatch means the corpus is misaligned.
assert len(data_inp) == len(data_out), 'source/target corpora have different line counts'

# Hold out 3000 pairs for evaluation; the fixed seed keeps the split reproducible.
train_inp, dev_inp, train_out, dev_out = train_test_split(data_inp, data_out, test_size=3000,
                                                          random_state=42)
for i in range(3):
    print('inp:', train_inp[i])
    print('out:', train_out[i], end='\n\n')

from vocab import Vocab

inp_voc = Vocab.from_lines(train_inp) # creates an instance of Vocab class from input lines (ru (input) vocab here)
out_voc = Vocab.from_lines(train_out) # en (output) vocab

print(f'Length of input (Russian) BPE vocabulary = {len(inp_voc)}')
print(f'Length of output (English) BPE vocabulary = {len(out_voc)}')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:28<00:00, 281.40it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:26<00:00, 296.81it/s]


inp: –Ω–∞ —Ç–µ—Ä—Ä–∏—Ç–æ—Ä–∏–∏ –æ–±—É—Å—Ç—Ä–æ–µ–Ω–∞ –±–µ—Å–ø–ª–∞—Ç–Ω–∞—è —á–∞—Å—Ç–Ω–∞—è –ø–∞—Ä–∫–æ–≤–∫–∞ .
out: free private parking is available on site .

inp: –∫—Ä–æ–º–µ —Ç–æ–≥–æ , –≤ 5 –º–∏–Ω—É—Ç–∞—Ö —Ö–æ–¥—å–±—ã —Ä–∞–±–æ—Ç–∞—é—Ç –º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω—ã–µ –±–∞—Ä—ã –∏ —Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã .
out: guests can find many bars and restaurants within a 5 - minute walk .

inp: –æ—Ç–µ–ª—å san mi@@ gu@@ el —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω –≤ —Ü–µ–Ω—Ç—Ä–µ –º–æ—Ä@@ –µ–ª–∏@@ –∏ , –≤ 750 –º–µ—Ç—Ä–∞—Ö –æ—Ç –≥–ª–∞–≤–Ω–æ–π –ø–ª–æ—â–∞–¥–∏ –≥–æ—Ä–æ–¥–∞ –∏ –∫–∞—Ñ–µ–¥—Ä–∞–ª—å–Ω–æ–≥–æ —Å–æ–±–æ—Ä–∞ .
out: hotel san miguel is located in central more@@ lia , 750 metres from the city ‚Äô s main square and cathedral .

Length of input (Russian) BPE vocabulary = 8048
Length of output (English) BPE vocabulary = 7801


In [None]:
# Versions are pinned to the ones recorded in this notebook's install log; the code below
# relies on APIs of that generation (e.g. datasets.load_metric). `%pip` installs into the
# running kernel's environment, and -q keeps the install log out of the notebook.
%pip install -q datasets==2.10.1 "transformers[sentencepiece]==4.26.1" sacrebleu==2.3.1 torch sentencepiece wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m469.0/469.0 KB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.3/6.3 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m118.9/118.9 KB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.9

In [None]:
import transformers

from transformers import AutoTokenizer
    
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"
# NOTE: this rebinds the global `tokenizer`, which until this cell was the NLTK WordPunctTokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=False)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-ru-en/snapshots/39fdcea592bdeb244fa87ce823e9f5c70d3a2bc3/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-ru-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 62517,
  "decoder_vocab_size": 62518,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "L

# Data Prep

In [None]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset, load_metric

def parse_ruencorp(train_inp=train_inp, dev_inp=dev_inp, train_out=train_out, dev_out=dev_out, mode='train'):
    """Yield one {"translation": {"ru": ..., "en": ...}} record per sentence pair.

    The corpus arrays are captured as default arguments when the function is defined.
    `mode` selects the split: 'train' pairs train_inp with train_out, 'test' pairs
    dev_inp with dev_out; any other value yields nothing.
    """
    if mode == 'train':
        pairs = zip(train_inp, train_out)
    elif mode == 'test':
        pairs = zip(dev_inp, dev_out)
    else:
        return
    for ru_line, en_line in pairs:
        yield {"translation": {"ru": ru_line, "en": en_line}}

In [None]:
# One DataFrame row per sentence pair, built from the records parse_ruencorp yields.
train_df, test_df = (pd.DataFrame(parse_ruencorp(mode=split)) for split in ('train', 'test'))

# Wrap each frame as a datasets.Dataset and group both splits under their names.
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))
raw_datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [None]:
# String prepended to every source sentence before tokenization (empty here).
prefix = ""
# max_length values passed to the tokenizer (with truncation=True) for source and target text.
max_input_length = 128
max_target_length = 128
# Keys of each example's "translation" dict: `source_lang` is the model input, `target_lang` the label.
source_lang = "ru"
target_lang = "en"
def preprocess_function(examples):
    """Tokenize a batch of translation examples for seq2seq training.

    Args:
        examples: batch with a "translation" column whose items are dicts keyed by
            language code (`source_lang` / `target_lang`).

    Returns:
        The tokenizer output for the source sentences, with the target token ids
        added under "labels".
    """
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the targets in target mode. `text_target=` replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize both splits; with batched=True preprocess_function receives a batch of examples per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/47001 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Use the GPU when one is available and fall back to CPU otherwise, so this cell
# no longer crashes on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-ru-en/snapshots/39fdcea592bdeb244fa87ce823e9f5c70d3a2bc3/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-ru-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 62517,
  "decoder_vocab_size": 62518,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "L

In [None]:
import pickle

# Get the model weights as a dictionary. Tensors are moved to CPU first: a pickle
# holding CUDA tensors cannot be loaded with pickle.load on a machine without a GPU.
weights = {name: tensor.cpu() for name, tensor in model.state_dict().items()}

# Save the weights as a pickle file
with open("model_weights_no_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

In [None]:
%ls

'=1.3.0'        data.txt       train.bpe.en   train.ru
 bpe_rules.en   log            train.bpe.ru   vocab.py
 bpe_rules.ru   [0m[01;34m__pycache__[0m/   train.en


In [None]:
# The weights are saved as model_weights_no_fine_tuning.pkl; "model_weights.pkl" is never written.
!du -h model_weights_no_fine_tuning.pkl

du: cannot access 'model_weights.pkl': No such file or directory


In [None]:
# NOTE(review): the method is referenced, not called, so this cell displays the bound
# method object rather than the parameters themselves — confirm that is intended.
model.parameters

In [None]:
batch_size = 16
model_name = model_checkpoint.rsplit("/", 1)[-1]
# Output directory name, e.g. "<checkpoint>-finetuned-ru-to-en".
output_dir = f"{model_name}-finetuned-{source_lang}-to-{target_lang}"

args = Seq2SeqTrainingArguments(
    output_dir,
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing cadence, all counted in optimisation steps
    logging_strategy="steps",
    logging_steps=150,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    # evaluation and reporting
    predict_with_generate=True,
    report_to="wandb",
)

PyTorch: setting up devices


In [None]:
# Collator built from the tokenizer and model; handed to Seq2SeqTrainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric
# SacreBLEU metric object; compute_metrics calls metric.compute on the decoded predictions and references.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Returns a `(preds, labels)` pair in which every label is wrapped in its own
    one-element list, while predictions stay plain strings.
    """
    return (
        [hypothesis.strip() for hypothesis in preds],
        [[reference.strip()] for reference in labels],
    )
    
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: `(preds, labels)` pair of token-id arrays; `preds` may itself be
            a tuple, in which case its first element is used.

    Returns:
        dict with "bleu" (SacreBLEU score) and "gen_len" (mean number of non-pad
        tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 cannot be decoded. Predictions can contain it as padding as well as labels,
    # so map it to the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

  metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
# Trainer wiring: model and training arguments, the tokenized train/test splits,
# the seq2seq collator, and the BLEU-based metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Inference & BLEU computing before fine-tuning

## Translations & BLEU before fine-tuning

In [None]:
def translate(src_text):
    """Translate `src_text` with the global `model` and `tokenizer`.

    Args:
        src_text: text passed straight to the tokenizer (the evaluation loops pass
            one sentence string per call).

    Returns:
        list of decoded strings, one per generated sequence, with special tokens removed.
    """
    # Move the inputs to whatever device the model lives on instead of assuming CUDA.
    batch = tokenizer(src_text, return_tensors="pt", padding=True).to(model.device)
    translated = model.generate(**batch)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [None]:
# test on 3000/700 samples 
#
#

# from tqdm import tqdm

# references = []
# predictions = []

# for inp_line, out_line in tqdm(zip(dev_inp[::700], dev_out[::700])):
#     translated_inp = translate(inp_line)
#     inp = inp_line.replace('@@ ', '')
#     #trans = translated_inp[0].replace('@@ ', '') # it should be just this, and it should work fine
#     trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '').lower()
#     predictions.append(trans)

#     out = out_line.replace('@@ ', '')
#     references.append(out)
    
#     print(f'input line: {inp}')
#     print(f'translated line: {trans}')
#     print(f'target line: {out}')
#     print()

In [None]:
from tqdm import tqdm

references, predictions = [], []

# Translate every dev pair one sentence at a time, collecting the hypotheses and
# references (scored with BLEU afterwards) and writing a readable report to disk.
with open('all_3k_eval_translations.txt', 'w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp, dev_out)):
        # The BPE-segmented source line is passed to the model as-is.
        translated_inp = translate(inp_line)

        # Undo the BPE segmentation of the source and the reference.
        inp = inp_line.replace('@@ ', '')
        out = out_line.replace('@@ ', '')

        # Remove BPE markers (and any remaining '@') from the hypothesis, then lowercase it.
        trans = translated_inp[0]
        for marker in ('@@ ', '@@', '@'):
            trans = trans.replace(marker, '')
        trans = trans.lower()

        predictions.append(trans)
        references.append(out)

        f_trans.write(
            f'input line: {inp}\n'
            f'translated line: {trans}\n'
            f'target line: {out}\n'
            '\n'
        )

[1;30;43m–í—ã—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –±—ã–ª–∏ –æ–±—Ä–µ–∑–∞–Ω—ã –¥–æ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –ø–æ—Å–ª–µ–¥–Ω–∏—Ö —Å—Ç—Ä–æ–∫ (5000).[0m
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2688it [19:13,  2.55it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2689it [19:14,  2.05it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2690it [19:14,  2.42it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,

In [None]:
!head -n 100 all_3k_eval_translations.txt

input line: –≤ —Ä–∞—Å–ø–æ—Ä—è–∂–µ–Ω–∏–∏ –≥–æ—Å—Ç–µ–π –æ–±—â–∞—è –∫—É—Ö–Ω—è –∏ –æ–±—â–∞—è –≥–æ—Å—Ç–∏–Ω–∞—è .
translated line: a common kitchen and a common living room are available to guests.
target line: a shared equipped kitchen and a common living room are provided to guests .

input line: –Ω–∞ —Ç–µ—Ä—Ä–∏—Ç–æ—Ä–∏–∏ –≤–∏–ª–ª—ã shengsi huajing –Ω–∞—Ö–æ–¥–∏—Ç—Å—è —Å–∞–¥ –∏ —Ç–µ—Ä—Ä–∞—Å–∞ .
translated line: in villa shengsi huajing is the garden and terrace.
target line: at shengsi huajing villa you will find a garden and a terrace .

input line: —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ –æ—Ç –æ—Ç–µ–ª—è libu≈°e –¥–æ –±–ª–∏–∂–∞–π—à–µ–π —Å—Ç–∞–Ω—Ü–∏–∏ –º–µ—Ç—Ä–æ kobylisy ( –ª–∏–Ω–∏—è —Å ), –æ—Ç –∫–æ—Ç–æ—Ä–æ–π –º–æ–∂–Ω–æ –¥–æ–±—Ä–∞—Ç—å—Å—è –¥–æ —Ü–µ–Ω—Ç—Ä–∞–ª—å–Ω–æ–≥–æ –∂–µ–ª–µ–∑–Ω–æ–¥–æ—Ä–æ–∂–Ω–æ–≥–æ –≤–æ–∫–∑–∞–ª–∞ –ø—Ä–∞–≥–∏ –∏ —Ü–µ–Ω—Ç—Ä–∞ –≥–æ—Ä–æ–¥–∞ , —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç 500 –º–µ—Ç—Ä–æ–≤ .
translated line: the distance from the hotel libu≈°e to the nearest subway station kobylisy (line c ) from which it

In [None]:
!du -h all_3k_eval_translations.txt

1.2M	all_3k_eval_translations.txt


In [None]:
import sacrebleu

# Corpus-level BLEU over the `predictions` / `references` lists filled by the evaluation loop.
bleu_score = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU score: ", bleu_score.score)

BLEU score:  11.746981346412316


# Fine-tuning 

In [None]:
# Explicit %pip magic: does not depend on IPython automagic and targets the kernel's environment.
%pip install --upgrade wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import wandb

In [None]:
# Start a Weights & Biases run; the training arguments set report_to="wandb".
wandb.init()

In [None]:
# Fine-tune the model. NOTE: the recorded output shows evaluations up to step 10000
# of 14690 and ends with a KeyboardInterrupt, i.e. the saved run did not finish.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 47001
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14690
  Number of trainable parameters = 76147712


Step,Training Loss,Validation Loss,Bleu,Gen Len
1000,1.3799,1.22665,34.0002,27.278
2000,1.2204,1.101541,36.1139,27.1727
3000,1.1304,1.032956,37.5792,27.3607
4000,1.0308,0.995135,37.9636,27.072
5000,1.0127,0.96472,39.1453,27.1093
6000,0.9413,0.940124,39.4129,26.8637
7000,0.9195,0.927302,39.6679,26.833
8000,0.9216,0.908887,40.3452,26.8543
9000,0.8542,0.89894,40.7226,26.9917
10000,0.864,0.890871,41.0983,27.0743


[1;30;43m–í—ã—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –±—ã–ª–∏ –æ–±—Ä–µ–∑–∞–Ω—ã –¥–æ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –ø–æ—Å–ª–µ–¥–Ω–∏—Ö —Å—Ç—Ä–æ–∫ (5000).[0m
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_toke

KeyboardInterrupt: ignored

In [None]:
# NOTE(review): the method is referenced, not called, so this only displays the bound
# method object (see the recorded output) — no metrics are shown; confirm what was intended.
trainer.log_metrics

<bound method log_metrics of <transformers.trainer_seq2seq.Seq2SeqTrainer object at 0x7f8537bb0eb0>>

## Saving fine-tuned model weights to Google Drive

In [None]:
# Save the current model's state_dict to 'fine-tuned-model.pt' in the working directory.
torch.save(model.state_dict(), 'fine-tuned-model.pt')

In [None]:
import pickle

# Get the model weights as a dictionary. Tensors are moved to CPU first: a pickle
# holding CUDA tensors cannot be loaded with pickle.load on a machine without a GPU.
weights = {name: tensor.cpu() for name, tensor in model.state_dict().items()}

# Save the weights as a pickle file
with open("transformer_weights_after_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

## Fine-tuned transformer translations

In [None]:
# test on 3000/700 samples 
#
#

from tqdm import tqdm

references, predictions = [], []

# Spot-check: translate every 500th dev pair and print source, hypothesis and reference.
for inp_line, out_line in tqdm(zip(dev_inp[::500], dev_out[::500])):
    # The BPE-segmented source line is passed to the model as-is.
    translated_inp = translate(inp_line)

    # Undo the BPE segmentation of the source and the reference.
    inp = inp_line.replace('@@ ', '')
    out = out_line.replace('@@ ', '')

    # Removing '@@ ' alone should be enough and should work fine; the extra
    # replacements also drop stray '@@' / '@' characters before lowercasing.
    trans = translated_inp[0]
    for marker in ('@@ ', '@@', '@'):
        trans = trans.replace(marker, '')
    trans = trans.lower()

    predictions.append(trans)
    references.append(out)

    print(
        f'input line: {inp}',
        f'translated line: {trans}',
        f'target line: {out}',
        '',
        sep='\n',
    )

0it [00:00, ?it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

1it [00:00,  6.17it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2it [00:00,  5.93it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: –≤ —Ä–∞—Å–ø–æ—Ä—è–∂–µ–Ω–∏–∏ –≥–æ—Å—Ç–µ–π –æ–±—â–∞—è –∫—É—Ö–Ω—è –∏ –æ–±—â–∞—è –≥–æ—Å—Ç–∏–Ω–∞—è .
translated line: a common kitchen and a common living room are available to guests.
target line: a shared equipped kitchen and a common living room are provided to guests .

input line: –∫—Ä–æ–º–µ —Ç–æ–≥–æ , –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª—è–µ—Ç—Å—è –ø—Ä–æ–∫–∞—Ç –≤–µ–ª–æ—Å–∏–ø–µ–¥–æ–≤ , —É—Å–ª—É–≥–∏ —Ç—Ä–∞–Ω—Å—Ñ–µ—Ä–∞ –∏ –±–µ—Å–ø–ª–∞—Ç–Ω–∞—è –ø–∞—Ä–∫–æ–≤–∫–∞ .
translated line: in addition, bike rentals, transfer services and free parking are provided.
target line: bicycle rental and shuttle services are also available . the property offers free parking .



3it [00:00,  5.11it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ –¥–æ –≥–æ—Ä–æ–¥–∞ –∫–∏—Å—Å–∏–º–º–∏ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç 26 –∫–º .
translated line: the distance to the town of kissimmmi is 26 km.
target line: the unit is 26 km from kissimmee .



4it [00:00,  3.62it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}



input line: –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –≤ –ø–µ–Ω—Ç—Ö–∞—É—Å–µ —Å –æ–±—â–∏–º –æ—Ç–∫—Ä—ã—Ç—ã–º –±–∞—Å—Å–µ–π–Ω–æ–º , —Å–∞–¥–æ–º , –∫–æ–Ω–¥–∏—Ü–∏–æ–Ω–µ—Ä–æ–º –∏ —Ç–µ—Ä—Ä–∞—Å–æ–π –¥–ª—è –∑–∞–≥–∞—Ä–∞ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω—ã –≤ 5 –º–∏–Ω—É—Ç–∞—Ö —Ö–æ–¥—å–±—ã –æ—Ç –ø–ª—è–∂–∞ –Ω–∞ –∫—É—Ä–æ—Ä—Ç–µ –∫–∞–±–æ - —Ä–æ–π .
translated line: apartments in penthouse with a shared open pool, garden, air conditioning and tanning terrace are located 5 minutes from the beach at the ka resortbo roy.
target line: situated 5 minutes ' walk from the beach in cabo roig , this air - conditioned penthouse apartment features a communal outdoor pool , garden and sun terrace .



5it [00:01,  3.35it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

6it [00:01,  4.11it/s]

input line: –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã moscow point - loft red square –Ω–∞—Ö–æ–¥—è—Ç—Å—è –≤ –º–æ—Å–∫–≤–µ , –≤ 200 –º–µ—Ç—Ä–∞—Ö –æ—Ç –±–æ–ª—å—à–æ–≥–æ —Ç–µ–∞—Ç—Ä–∞ .
translated line: apartments moscow point - loft red square are located in moscow, 200 metres from the big theater.
target line: moscow point - loft red square offers accommodation in moscow . the apartment is 200 metres from bolshoi theatre .

input line: –≤ –≤–∞—à–µ–º —Ä–∞—Å–ø–æ—Ä—è–∂–µ–Ω–∏–∏ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–∞—è –≤–∞–Ω–Ω–∞—è –∫–æ–º–Ω–∞—Ç–∞ —Å –¥—É—à–µ–º –∏ –ø–æ–ª–æ—Ç–µ–Ω—Ü–∞–º–∏ .
translated line: you have your own bathroom with showers and towels.
target line: featuring a shower , private bathrooms also come with towels .






In [None]:
from tqdm import tqdm

references, predictions = [], []

# Translate every dev pair one sentence at a time with the fine-tuned model,
# collecting hypotheses and references and writing a readable report to disk.
with open('fine-tuned-all_3k_eval_translations.txt', 'w') as f_trans:
    for inp_line, out_line in tqdm(zip(dev_inp, dev_out)):
        # The BPE-segmented source line is passed to the model as-is.
        translated_inp = translate(inp_line)

        # Undo the BPE segmentation of the source and the reference.
        inp = inp_line.replace('@@ ', '')
        out = out_line.replace('@@ ', '')

        # Remove BPE markers (and any remaining '@') from the hypothesis, then lowercase it.
        trans = translated_inp[0]
        for marker in ('@@ ', '@@', '@'):
            trans = trans.replace(marker, '')
        trans = trans.lower()

        predictions.append(trans)
        references.append(out)

        f_trans.write(
            f'input line: {inp}\n'
            f'translated line: {trans}\n'
            f'target line: {out}\n'
            '\n'
        )

[1;30;43m–í—ã—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –±—ã–ª–∏ –æ–±—Ä–µ–∑–∞–Ω—ã –¥–æ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –ø–æ—Å–ª–µ–¥–Ω–∏—Ö —Å—Ç—Ä–æ–∫ (5000).[0m
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2688it [17:18,  2.73it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2689it [17:19,  1.77it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 62517,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 62517,
  "transformers_version": "4.26.1"
}

2690it [17:20,  2.02it/s]Generate config GenerationConfig {
  "bad_words_ids": [
    [
      62517
    ]
  ],
  "bos_token_id": 0,

# Helpful functions

Save the model weights as a pickle file (and upload it to your Google Drive)

In [None]:
import pickle

# Get the model weights as a dictionary. Tensors are moved to CPU first: a pickle
# holding CUDA tensors cannot be loaded with pickle.load on a machine without a GPU.
weights = {name: tensor.cpu() for name, tensor in model.state_dict().items()}

# Save the weights as a pickle file
# NOTE(review): this reuses the file name of the pre-fine-tuning snapshot; running this
# cell after training overwrites that baseline file — confirm the intended name.
with open("model_weights_no_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

Load the pickled weights from Google Drive

In [None]:

import pickle

# Load the saved weights
# The file name now matches the one the save cell writes; "model_weights.pkl" is never
# created by this notebook, so opening it raised FileNotFoundError.
# NOTE(security): pickle.load can execute arbitrary code — only load files you created yourself.
with open("model_weights_no_fine_tuning.pkl", "rb") as f:
    saved_weights = pickle.load(f)

# Set the model weights to saved weights
model.load_state_dict(saved_weights)

# Use the loaded model for inference

Writing translations to a file (in a standardized way)