In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/NMT/

/content/drive/MyDrive/NMT


In [20]:
%ls

'=1.3.0'   data.txt   log   vocab.py


In [9]:
%%capture
!pip3 install torch>=1.3.0
!pip3 install subword-nmt &> log

In [10]:
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

# BPE Tokenizing

In [22]:
# Tokenizing & applying BPE rules

tokenizer = WordPunctTokenizer()
def tokenize(x):
    return ' '.join(tokenizer.tokenize(x.lower()))

# split and tokenize the data
with open('train.en', 'w') as f_src,  open('train.ru', 'w') as f_dst:
    for line in open('data.txt', 'r'):
        src_line, dst_line = line.strip().split('\t')
        f_src.write(tokenize(src_line) + '\n')
        f_dst.write(tokenize(dst_line) + '\n')


# build and apply bpe vocs
bpe = {}
for lang in ['en', 'ru']:
    learn_bpe(open('./train.' + lang), open('bpe_rules.' + lang, 'w'), num_symbols=8000)  # 1. learn_bpe rules
    bpe[lang] = BPE(open('./bpe_rules.' + lang))  # 2. create instance of BPE class
    
    with open('train.bpe.' + lang, 'w') as f_out:
        for line in open('train.' + lang):
            f_out.write(bpe[lang].process_line(line.strip()) + '\n')  # 3. apply BPE tokenization to our data        


# Building vocabularies

data_inp = np.array(open('./train.bpe.ru').read().split('\n'))
data_out = np.array(open('./train.bpe.en').read().split('\n'))

train_inp, dev_inp, train_out, dev_out = train_test_split(data_inp, data_out, test_size=3000,
                                                          random_state=42)
for i in range(3):
    print('inp:', train_inp[i])
    print('out:', train_out[i], end='\n\n')            

from vocab import Vocab

inp_voc = Vocab.from_lines(train_inp) # creates an instance of Vocab class from input lines (ru (input) vocab here)
out_voc = Vocab.from_lines(train_out) # en (output) vocab

print(f'Length of input (Russian) BPE vocabulary = {len(inp_voc)}')
print(f'Length of output (English) BPE vocabulary = {len(out_voc)}')

100%|██████████| 8000/8000 [00:18<00:00, 421.59it/s]
100%|██████████| 8000/8000 [00:20<00:00, 394.42it/s]


inp: на территории обустроена бесплатная частная парковка .
out: free private parking is available on site .

inp: кроме того , в 5 минутах ходьбы работают многочисленные бары и рестораны .
out: guests can find many bars and restaurants within a 5 - minute walk .

inp: отель san mi@@ gu@@ el расположен в центре мор@@ ели@@ и , в 750 метрах от главной площади города и кафедрального собора .
out: hotel san miguel is located in central more@@ lia , 750 metres from the city ’ s main square and cathedral .

Length of input (Russian) BPE vocabulary = 8048
Length of output (English) BPE vocabulary = 7801


In [23]:
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.1

In [None]:
pip install wandb --upgrade

In [None]:
import wandb

In [24]:
import transformers

from transformers import AutoTokenizer
    
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



# Data Prep

In [35]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset, load_metric

def parse_ruencorp(train_inp=train_inp, dev_inp=dev_inp, train_out=train_out, dev_out=dev_out, mode='train'):
    if mode == 'train':
        for src_line, dst_line in zip(train_inp, train_out):
            yield {"translation": {"ru": src_line, "en": dst_line}}
    elif mode == 'test':
        for src_line, dst_line in zip(dev_inp, dev_out):
            yield {"translation": {"ru": src_line, "en": dst_line}}

In [36]:
train_df = pd.DataFrame(parse_ruencorp(mode='train'))
test_df = pd.DataFrame(parse_ruencorp(mode='test'))

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

raw_datasets = DatasetDict({'train': train_dataset, 
                            'test': test_dataset})

In [37]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [38]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "ru"
target_lang = "en"
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [39]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/47001 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [44]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 47001
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [28]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to('cpu') # 'cuda' 

In [31]:
import pickle

# Get the model weights as a dictionary
weights = model.state_dict()

# Save the weights as a pickle file
with open("model_weights_no_fine_tuning.pkl", "wb") as f:
    pickle.dump(weights, f)

In [41]:
%ls

'=1.3.0'        data.txt       train.bpe.en   train.ru
 bpe_rules.en   log            train.bpe.ru   vocab.py
 bpe_rules.ru   [0m[01;34m__pycache__[0m/   train.en


In [42]:
!du -h model_weights.pkl

du: cannot access 'model_weights.pkl': No such file or directory


In [None]:
model.parameters

In [None]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    logging_strategy="steps",
    logging_steps=150,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    predict_with_generate=True,
    report_to="wandb",
    
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Inference & BLEU computing without fine-tuning

In [51]:
type(dev_inp)

numpy.ndarray

In [56]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

def compute_blue_transformer(model, inp_lines, out_lines, bpe_sep='@@ '):
    inp = [line.replace(bpe_sep, '') for line in inp_lines]
    out = [line.replace(bpe_sep, '') for line in out_lines]
    translations = []
    for src in tqdm(inp):
        input_ids = tokenizer.encode(src, return_tensors="pt").to('cpu')
        outputs = model.generate(input_ids, num_beams=2)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        decoded = tokenize(decoded)
        translations.append(decoded)
        
    return corpus_bleu(
        [[ref.split()] for ref in out],
        [trans.split() for trans in translations],
        smoothing_function=lambda precisions, **kw: [p + 1.0 / p.denominator for p in precisions]
    ) * 100

for inp_line in dev_inp[::700]:
    inp = inp_line.replace('@@ ', '')
    input_ids = tokenizer.encode(inp, return_tensors="pt").to('cpu')
    outputs = model.generate(input_ids)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    decoded = tokenize(decoded).replace('▁', '')
    print(inp)
    print(decoded)
    print()

в распоряжении гостей общая кухня и общая гостиная .
a  comm on k it ch en and a  comm on l iv ing room a re a va il ab le to g u est s .

в окружающей местности можно заняться разными видами активного отдыха , отправиться на дайвинг , в пеший поход или бесплатно покататься на каяке .
in the s ur ro un d ing a re a ,  y ou c an  en g age in v ar io us for ms of  le is ur e  act iv ity , d iv ing , w al k ing  or k ay ak f re e of c har ge .

за 5 минут можно доехать до городка менаджо .
 y ou c an g et to  men a j o in f ive  min ut es .

в числе удобств комплекса бунгало klong jark — экскурсионное бюро . для гостей организуют такие мероприятия , как дайвинг , сноркелинг и рыбная ловля .
 am ong the f ac il it ies of the b ung al ow k l ong j ar k  com p lex  is a g u ide d to ur h ou se . the g u est s a re  gi ven  su ch e ven ts  as d iv ing , s n or k el ing and f ish ing .

в каждом из них предоставляется бесплатный wifi .
e a ch of the m h as a f re e w i fi .



In [58]:
def translate(src_text):
    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True).to('cpu'))
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated] 

In [67]:
references = []
predictions = []

for inp_line, out_line in zip(dev_inp[::100], dev_out[::100]):
    translated_inp = translate(inp_line)
    inp = inp_line.replace('@@ ', '')
    trans = translated_inp[0].replace('@@ ', '').replace('@@', '').replace('@', '')
    predictions.append(trans)

    out = out_line.replace('@@ ', '')
    references.append(out)
    
    print(f'input line: {inp}')
    print(f'translated line: {trans}')
    print(f'target line: {out}')
    print()



input line: в распоряжении гостей общая кухня и общая гостиная .
translated line: A common kitchen and a common living room are available to guests.
target line: a shared equipped kitchen and a common living room are provided to guests .

input line: гости могут посещать сезонный открытый бассейн .
translated line: The guests can visit the open-season pool.
target line: other facilities at pearl one include a seasonal outdoor pool .

input line: wifi и парковка предоставляются бесплатно .
translated line: Wifi and parking are free of charge.
target line: wi - fi and parking are free .

input line: в окрестностях можно заняться различными видами деятельности , включая пешие прогулки , велоспорт и походы .
translated line: In the vicinity, you can engage in various types of activitytelres, including walking, cycling and hiking.
target line: several activities can be enjoyed in the area , such as hiking , cycling and walking .

input line: в номерах имеется телевизор с плоским экраном и с

KeyboardInterrupt: ignored

In [64]:
references

['a shared equipped kitchen and a common living room are provided to guests .',
 'activities such as diving and hiking can be enjoyed in the surrounding area , and there is a kayak guests can use for free .',
 'menaggio is a 5 - minute drive away .',
 'klong jark bungalow has a tour desk and arrangements can be made for diving , snorkelling and fishing .',
 'all rooms have free wi - fi .']

In [65]:
predictions

['A common kitchen and a common living room are available to guests.',
 'In the surrounding area, you can take oneother forms of active rest, diving, hiking, or free kayake.',
 'In 5 minutes, you can reach the town of manjo.',
 'Among the facilities of the bungalow complex, klong jark — guided tour service. Hosts are organized such events as diving, snorkeling and fishing.',
 'Each of them has a free wifi.']

In [66]:
import sacrebleu

bleu_score = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU score: ", bleu_score.score)

BLEU score:  16.31869662242887


In [None]:
references = [example["translation"] for example in test_dataset]

In [50]:
import sacrebleu

# Generate translations for the test dataset
predictions = []
for example in raw_datasets['test']:
    input_ids = tokenizer(example["translation"], return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    predictions.append(tokenizer.decode(outputs[0]))

# Compute BLEU scores for the generated translations
references = [example["translation"] for example in test_dataset]
bleu_score = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU score: ", bleu_score.score)


ValueError: ignored

In [46]:
tokenized_datasets['test']

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})