In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
! nvidia-smi

Thu Apr 29 08:14:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# File Preprocessing




In [None]:
# Install the Python packages this notebook imports.
!pip install 'torch>=1.6.0' editdistance matplotlib sacrebleu sacremoses sentencepiece tqdm wandb
# Upgrade Jupyter and ipywidgets.
!pip install --upgrade jupyter ipywidgets
# Clone fairseq, check out commit 9a1c497, and install it from the local clone.
!git clone https://github.com/pytorch/fairseq.git
!cd fairseq && git checkout 9a1c497
!pip install --upgrade ./fairseq/

Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |████████████████████████████████| 61kB 6.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 17.4MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 33.0MB/s 
Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/5c/ee/d755f9e5466df64c8416a2c6a860fb3aaa43ed6ea8e8e8e81460fda5788b/wandb-0.10.28-py2.py3-none-any.whl (2.1MB)
[K     |████████████████████

# Python Preprocessing


In [None]:
import sys
import pdb
import pprint
import logging
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import numpy as np
import tqdm.auto as tqdm
from pathlib import Path
from argparse import Namespace
from fairseq import utils
import re
import sentencepiece as spm

import matplotlib.pyplot as plt

from fairseq.models import (
    FairseqEncoder, 
    FairseqIncrementalDecoder,
    FairseqEncoderDecoderModel
)
from fairseq.models.transformer import (
    TransformerEncoder, 
    TransformerDecoder,
)
from fairseq.tasks.translation import TranslationConfig, TranslationTask


from fairseq.data import iterators
from torch.cuda.amp import GradScaler, autocast


import shutil
import sacrebleu

# One seed shared by every random number generator used in this notebook.
seed = 1126

# Turn off the cuDNN benchmark mode and request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed the Python, NumPy and PyTorch generators with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# When CUDA is available, seed the GPU generators as well and run on the GPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    device = "cuda"
else:
    device = "cpu"

# Tool Functions

In [None]:
def strQ2B(ustring):
    """Convert full-width characters to their half-width equivalents.

    Every character of every element of ``ustring`` is mapped individually and
    the results are concatenated into a single string.
    """
    # Reference: https://ithelp.ithome.com.tw/articles/10233122
    def to_half_width(ch):
        code = ord(ch)
        if code == 12288:  # full-width space maps directly to the ASCII space
            return chr(32)
        if 65281 <= code <= 65374:  # other full-width characters sit at a fixed offset
            return chr(code - 65248)
        return ch

    return ''.join(to_half_width(ch) for piece in ustring for ch in piece)
                
def clean_s(s, lang):
    """Normalise one sentence of language ``lang`` ('en' or 'zh').

    Parenthesised text is removed, a few symbols are dropped or unified, the
    listed punctuation marks are surrounded by spaces, and finally runs of
    whitespace are collapsed. For any other ``lang`` only the whitespace
    collapsing is applied.
    """
    parenthesised = r"\([^()]*\)"
    if lang == 'en':
        s = re.sub(parenthesised, "", s).replace('-', '')  # drop (text) and '-'
        s = re.sub('([.,;!?()\"])', r' \1 ', s)  # keep punctuation as separate tokens
    elif lang == 'zh':
        s = re.sub(parenthesised, "", strQ2B(s))  # full-width -> half-width, then drop (text)
        for old, new in ((' ', ''), ('—', ''), ('“', '"'), ('”', '"'), ('_', '')):
            s = s.replace(old, new)
        s = re.sub('([。,;!?()\"~「」])', r' \1 ', s)  # keep punctuation as separate tokens
    return ' '.join(s.split())

def len_s(s, lang):
    """Return the sentence length: characters for 'zh', whitespace-separated tokens otherwise."""
    return len(s) if lang == 'zh' else len(s.split())

def clean_corpus(prefix, l1, l2, ratio=9, max_len=1000, min_len=1):
    """Clean a parallel corpus and write the surviving sentence pairs.

    Reads ``{prefix}.{l1}`` and ``{prefix}.{l2}`` line by line, normalises each
    sentence with ``clean_s`` and writes the pairs that pass the filters to
    ``{prefix}.clean.{l1}`` and ``{prefix}.clean.{l2}``. If both output files
    already exist, nothing is done.

    Args:
        prefix: path prefix shared by the input and output files.
        l1, l2: language codes, used as file suffixes and passed to
            ``clean_s`` / ``len_s``.
        ratio: if > 0, drop pairs where one side is more than ``ratio`` times
            longer than the other.
        max_len: if > 0, drop pairs where either side is longer than this.
        min_len: if > 0, drop pairs where either side is shorter than this.
    """
    if Path(f'{prefix}.clean.{l1}').exists() and Path(f'{prefix}.clean.{l2}').exists():
        print(f'{prefix}.clean.{l1} & {l2} exists. skipping clean.')
        return
    # The corpus is UTF-8 text; do not depend on the locale's default encoding.
    with open(f'{prefix}.{l1}', 'r', encoding='utf-8') as l1_in_f, \
         open(f'{prefix}.{l2}', 'r', encoding='utf-8') as l2_in_f, \
         open(f'{prefix}.clean.{l1}', 'w', encoding='utf-8') as l1_out_f, \
         open(f'{prefix}.clean.{l2}', 'w', encoding='utf-8') as l2_out_f:
        for s1 in l1_in_f:
            s1 = clean_s(s1.strip(), l1)
            s2 = clean_s(l2_in_f.readline().strip(), l2)
            s1_len = len_s(s1, l1)
            s2_len = len_s(s2, l2)
            if min_len > 0 and (s1_len < min_len or s2_len < min_len):
                continue  # remove short sentences
            if max_len > 0 and (s1_len > max_len or s2_len > max_len):
                continue  # remove long sentences
            # Compare by multiplication rather than division so an empty
            # sentence (possible when min_len <= 0) cannot raise ZeroDivisionError.
            if ratio > 0 and (s1_len > ratio * s2_len or s2_len > ratio * s1_len):
                continue  # remove pairs with a lopsided length ratio
            print(s1, file=l1_out_f)
            print(s2, file=l2_out_f)

def load_data_iterator(task, split, epoch=1, max_tokens=4000, num_workers=1, cached=True):
    """Return the batch iterator that ``task`` builds for dataset ``split``.

    Batches are limited by ``max_tokens`` (no sentence-count limit), inputs
    that do not fit the position limits are ignored, and the module-level
    ``seed`` is passed through to the iterator.
    """
    dataset = task.dataset(split)
    max_positions = utils.resolve_max_positions(task.max_positions(), max_tokens)
    # With cached=True the iterator cache stays enabled, which is faster, but
    # then changing max_tokens after the first call of this function has no effect.
    return task.get_batch_iterator(
        dataset=dataset,
        max_tokens=max_tokens,
        max_sentences=None,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        seed=seed,
        num_workers=num_workers,
        epoch=epoch,
        disable_iterator_cache=not cached,
    )

def build_model(args, task):
    """Build the Seq2Seq model described by ``args`` for ``task`` and initialise its weights."""
    from fairseq.modules import MultiheadAttention

    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Token embeddings; the padding index comes from each dictionary.
    encoder_embed_tokens = nn.Embedding(len(src_dict), args.encoder_embed_dim, src_dict.pad())
    decoder_embed_tokens = nn.Embedding(len(tgt_dict), args.decoder_embed_dim, tgt_dict.pad())

    # Transformer encoder and decoder wrapped into a sequence-to-sequence model.
    encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)
    decoder = TransformerDecoder(args, tgt_dict, decoder_embed_tokens)
    model = Seq2Seq(args, encoder, decoder)

    def small_normal_(weight):
        # Draw the weights from N(0, 0.02^2), in place.
        weight.data.normal_(mean=0.0, std=0.02)

    # Initialisation matters for seq2seq models, so each module type is handled explicitly.
    def init_params(module):
        if isinstance(module, nn.Linear):
            small_normal_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
        if isinstance(module, nn.Embedding):
            small_normal_(module.weight)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, MultiheadAttention):
            small_normal_(module.q_proj.weight)
            small_normal_(module.k_proj.weight)
            small_normal_(module.v_proj.weight)
        if isinstance(module, nn.RNNBase):
            for name, param in module.named_parameters():
                if "weight" in name or "bias" in name:
                    param.data.uniform_(-0.1, 0.1)

    # Apply the initialisation to every sub-module.
    model.apply(init_params)
    return model

def add_transformer_args(args):
    """Set the Transformer-specific hyper-parameters on ``args`` in place.

    The explicitly chosen values are assigned first; ``base_architecture`` from
    fairseq is then applied to the same namespace to add the remaining
    Transformer parameters that were not set here.

    Args:
        args: argument namespace to update. It is mutated; nothing is returned.
    """
    args.encoder_attention_heads = 4
    args.encoder_normalize_before = True

    args.decoder_attention_heads = 4
    args.decoder_normalize_before = True

    args.activation_fn = "relu"
    args.max_source_positions = 1024
    args.max_target_positions = 1024

    # Fill in the Transformer parameters that were not set above.
    # Operate on the namespace that was passed in, not on the global
    # ``arch_args``, so the function works for any caller-supplied namespace.
    from fairseq.models.transformer import base_architecture
    base_architecture(args)

def train_one_epoch(epoch_itr, model, task, criterion, optimizer, accum_steps=1):
    """Train ``model`` for one epoch with gradient accumulation and mixed precision.

    Batches from ``epoch_itr`` are grouped ``accum_steps`` at a time; gradients
    of each group are accumulated, divided by the group's total ``ntokens``,
    clipped to ``config.clip_norm`` and then applied with one optimizer step.

    Reads the module-level names ``config``, ``logger``, ``device`` and, when
    ``config.use_wandb`` is set, ``wandb``. ``task`` is not used in the body.

    Returns:
        dict with key "loss": one per-update loss value (accumulated loss
        divided by the accumulated sample size) for every optimizer step.
    """
    itr = epoch_itr.next_epoch_itr(shuffle=True)
    itr = iterators.GroupedIterator(itr, accum_steps) # gradient accumulation: update once every accum_steps samples
    
    stats = {"loss": []}
    scaler = GradScaler() # automatic mixed precision (amp) training
    
    model.train()
    progress = tqdm.tqdm(itr, desc=f"train epoch {epoch_itr.epoch}", leave=False)
    for samples in progress:
        model.zero_grad()
        accum_loss = 0
        sample_size = 0
        # gradient accumulation: update once every accum_steps samples
        for i, sample in enumerate(samples):
            if i == 1:
                # emptying the CUDA cache after the first step can reduce the chance of OOM
                torch.cuda.empty_cache()

            sample = utils.move_to_cuda(sample, device=device)
            target = sample["target"]
            sample_size_i = sample["ntokens"]
            sample_size += sample_size_i
            
            # mixed precision training
            # NOTE(review): the backward call below also runs inside the autocast
            # context — confirm against the PyTorch AMP guidance whether that is intended.
            with autocast():
                net_output = model.forward(**sample["net_input"])
                lprobs = F.log_softmax(net_output[0], -1)            
                loss = criterion(lprobs.view(-1, lprobs.size(-1)), target.view(-1))
                
                # logging
                accum_loss += loss.item()
                # back-prop
                scaler.scale(loss).backward()                
        
        # Undo the loss scaling before the gradients are rescaled and clipped.
        scaler.unscale_(optimizer)
        optimizer.multiply_grads(1 / (sample_size or 1.0)) # (sample_size or 1.0) handles the case of a zero gradient
        gnorm = nn.utils.clip_grad_norm_(model.parameters(), config.clip_norm) # gradient clipping to guard against exploding gradients
        
        scaler.step(optimizer)
        scaler.update()
        
        # logging
        # NOTE(review): this divides by sample_size without the `or 1.0` guard used above.
        loss_print = accum_loss/sample_size
        stats["loss"].append(loss_print)
        progress.set_postfix(loss=loss_print)
        if config.use_wandb:
            wandb.log({
                "train/loss": loss_print,
                "train/grad_norm": gnorm.item(),
                "train/lr": optimizer.rate(),
                "train/sample_size": sample_size,
            })
        
    loss_print = np.mean(stats["loss"])
    logger.info(f"training loss: {loss_print:.4f}")
    return stats

def decode(toks, dictionary):
    """Turn a tensor of token ids into a human-readable sentence.

    The post-processing mode is taken from ``config.post_process``. An empty
    result is replaced by the literal "<unk>".
    """
    sentence = dictionary.string(toks.int().cpu(), config.post_process)
    if sentence:
        return sentence
    return "<unk>"

def inference_step(sample, model):
    """Generate translations for one batch.

    Uses the module-level ``sequence_generator`` and ``task``. Returns three
    parallel lists of decoded strings: sources, hypotheses and references,
    which are used later to compute BLEU.
    """
    gen_out = sequence_generator.generate([model], sample)
    srcs, hyps, refs = [], [], []
    for i, beams in enumerate(gen_out):
        # Source sentence with padding stripped.
        src_tokens = utils.strip_pad(
            sample["net_input"]["src_tokens"][i], task.source_dictionary.pad()
        )
        srcs.append(decode(src_tokens, task.source_dictionary))

        # Index 0 is the best-scoring hypothesis in the beam.
        hyps.append(decode(beams[0]["tokens"], task.target_dictionary))

        # Reference translation with padding stripped.
        ref_tokens = utils.strip_pad(sample["target"][i], task.target_dictionary.pad())
        refs.append(decode(ref_tokens, task.target_dictionary))
    return srcs, hyps, refs

def validate(model, task, criterion, log_to_wandb=True):
    """Evaluate ``model`` on the "valid" split.

    For each batch the per-token loss is computed and translations are
    generated; BLEU is then computed over the whole split with sacrebleu.

    Returns:
        dict with keys "loss" (mean of the per-batch losses), "bleu"
        (sacrebleu result), "srcs", "hyps" and "refs" (decoded strings).
    """
    logger.info('begin validation')
    epoch_itr = load_data_iterator(task, "valid", 1, config.max_tokens, config.num_workers)
    itr = epoch_itr.next_epoch_itr(shuffle=False)

    batch_losses = []
    srcs, hyps, refs = [], [], []

    model.eval()
    progress = tqdm.tqdm(itr, desc="validation", leave=False)
    with torch.no_grad():
        for sample in progress:
            # validation loss, normalised by the number of tokens in the batch
            sample = utils.move_to_cuda(sample, device=device)
            net_output = model.forward(**sample["net_input"])

            lprobs = F.log_softmax(net_output[0], -1)
            flat_lprobs = lprobs.view(-1, lprobs.size(-1))
            flat_target = sample["target"].view(-1)
            loss = criterion(flat_lprobs, flat_target) / sample["ntokens"]
            progress.set_postfix(valid_loss=loss.item())
            batch_losses.append(loss)

            # run inference on the same batch
            batch_srcs, batch_hyps, batch_refs = inference_step(sample, model)
            srcs.extend(batch_srcs)
            hyps.extend(batch_hyps)
            refs.extend(batch_refs)

    # sacrebleu tokenizer: 'zh' for Chinese targets, '13a' otherwise
    tok = 'zh' if task.cfg.target_lang == 'zh' else '13a'
    stats = {
        "loss": torch.stack(batch_losses).mean().item(),
        "bleu": sacrebleu.corpus_bleu(hyps, [refs], tokenize=tok),  # BLEU score
        "srcs": srcs,
        "hyps": hyps,
        "refs": refs,
    }

    if config.use_wandb and log_to_wandb:
        wandb.log({
            "valid/loss": stats["loss"],
            "valid/bleu": stats["bleu"].score,
        }, commit=False)

    # log one randomly chosen example
    showid = np.random.randint(len(hyps))
    logger.info(f"example source: {srcs[showid]}")
    logger.info(f"example hypothesis: {hyps[showid]}")
    logger.info(f"example reference: {refs[showid]}")

    # show bleu results
    logger.info(f"validation loss:\t{stats['loss']:.4f}")
    logger.info(stats["bleu"].format())
    return stats

def validate_and_save(model, task, criterion, optimizer, epoch, save=True):
    """Validate the model and, if ``save`` is true, write checkpoints and samples.

    When saving, the following files are written under ``config.savedir``:
    ``checkpoint{epoch}.pt`` (also copied to ``checkpoint_last.pt``),
    ``samples{epoch}.{src}-{tgt}.txt`` with tab-separated source/hypothesis
    pairs, and ``checkpoint_best.pt`` whenever the BLEU score beats the best
    one seen so far (tracked on the function attribute ``best_bleu``). The
    checkpoint from ``config.keep_last_epochs`` epochs ago is deleted.

    Returns:
        The stats dict returned by ``validate``.
    """
    stats = validate(model, task, criterion)
    bleu = stats['bleu']
    loss = stats['loss']
    if save:
        # save epoch checkpoints
        savedir = Path(config.savedir).absolute()
        savedir.mkdir(parents=True, exist_ok=True)

        check = {
            "model": model.state_dict(),
            "stats": {"bleu": bleu.score, "loss": loss},
            "optim": {"step": optimizer._step}
        }
        torch.save(check, savedir/f"checkpoint{epoch}.pt")
        shutil.copy(savedir/f"checkpoint{epoch}.pt", savedir/f"checkpoint_last.pt")
        logger.info(f"saved epoch checkpoint: {savedir}/checkpoint{epoch}.pt")

        # save epoch samples
        # The sentences contain non-ASCII text, so write UTF-8 explicitly
        # instead of relying on the locale's default encoding.
        samples_path = savedir/f"samples{epoch}.{config.source_lang}-{config.target_lang}.txt"
        with open(samples_path, "w", encoding="utf-8") as f:
            for s, h in zip(stats["srcs"], stats["hyps"]):
                f.write(f"{s}\t{h}\n")

        # get best valid bleu
        if getattr(validate_and_save, "best_bleu", 0) < bleu.score:
            validate_and_save.best_bleu = bleu.score
            torch.save(check, savedir/f"checkpoint_best.pt")

        # keep only the most recent epoch checkpoints
        del_file = savedir / f"checkpoint{epoch - config.keep_last_epochs}.pt"
        if del_file.exists():
            del_file.unlink()

    return stats

def try_load_checkpoint(model, optimizer=None, name=None):
    """Load a checkpoint from ``config.savedir`` into ``model`` if it exists.

    Args:
        model: module whose ``load_state_dict`` receives the saved weights.
        optimizer: optional optimizer wrapper; when given, its ``_step``
            counter is restored from the checkpoint.
        name: checkpoint file name; defaults to "checkpoint_last.pt".

    If the file does not exist, only a log message is emitted.
    """
    name = name if name else "checkpoint_last.pt"
    checkpath = Path(config.savedir)/name
    if checkpath.exists():
        # Deserialize onto the CPU so a checkpoint saved on a GPU can also be
        # loaded on a CPU-only runtime; load_state_dict then copies the values
        # into the model's existing parameters.
        check = torch.load(checkpath, map_location="cpu")
        model.load_state_dict(check["model"])
        stats = check["stats"]
        step = "unknown"
        if optimizer is not None:
            optimizer._step = step = check["optim"]["step"]
        logger.info(f"loaded checkpoint {checkpath}: step={step} loss={stats['loss']} bleu={stats['bleu']}")
    else:
        logger.info(f"no checkpoints found at {checkpath}!")

def generate_prediction(model, task, split="test", outfile="./prediction.txt"):
    """Translate every sample of ``split`` and write one hypothesis per line.

    The hypotheses are written to ``outfile`` in ascending order of the sample
    ids, i.e. in the order the data had at preprocessing time.
    """
    task.load_dataset(split=split, epoch=1)
    itr = load_data_iterator(task, split, 1, config.max_tokens, config.num_workers).next_epoch_itr(shuffle=False)

    idxs = []
    hyps = []

    model.eval()
    progress = tqdm.tqdm(itr, desc=f"prediction")
    with torch.no_grad():
        for sample in progress:
            sample = utils.move_to_cuda(sample, device=device)

            # run inference; only the hypotheses are needed here
            _, h, _ = inference_step(sample, model)

            hyps.extend(h)
            # Store plain Python ints so the sort below does not compare
            # (possibly GPU-resident) 0-d tensors one pair at a time.
            idxs.extend(sample['id'].tolist())

    # restore the order the samples had at preprocessing time
    hyps = [h for _, h in sorted(zip(idxs, hyps), key=lambda pair: pair[0])]

    # The hypotheses contain non-ASCII text; write UTF-8 explicitly.
    with open(outfile, "w", encoding="utf-8") as f:
        for h in hyps:
            f.write(h+"\n")

# Data Preprocessing

In [None]:
# Where the raw corpus is stored and what the dataset is called.
data_dir = './DATA/rawdata'
dataset_name = 'ted2020'
# Download URLs, in the same order as file_names below.
urls = (
    '"https://onedrive.live.com/download?cid=3E549F3B24B238B4&resid=3E549F3B24B238B4%214989&authkey=AGgQ-DaR8eFSl1A"', 
    '"https://onedrive.live.com/download?cid=3E549F3B24B238B4&resid=3E549F3B24B238B4%214987&authkey=AA4qP_azsicwZZM"',
# # If the above links die, use the following instead. 
#     "https://www.csie.ntu.edu.tw/~r09922057/ML2021-hw5/ted2020.tgz",
#     "https://www.csie.ntu.edu.tw/~r09922057/ML2021-hw5/test.tgz",
# # If the above links die, use the following instead. 
#     "https://mega.nz/#!vEcTCISJ!3Rw0eHTZWPpdHBTbQEqBDikDEdFPr7fI8WxaXK9yZ9U",
#     "https://mega.nz/#!zNcnGIoJ!oPJX9AvVVs11jc0SaK6vxP_lFUNTkEcK2WbxJpvjU5Y",
)
file_names = (
    'ted2020.tgz', # train & dev
    'test.tgz', # test
)
# Absolute directory that holds every file of this dataset.
prefix = Path(data_dir).absolute() / dataset_name

prefix.mkdir(parents=True, exist_ok=True)
# Download each archive unless it is already present; the archive is extracted
# into prefix on every run.
for u, f in zip(urls, file_names):
    path = prefix/f
    if not path.exists():
        if 'mega' in u:
            !megadl {u} --path {path}
        else:
            !wget {u} -O {path}
    if path.suffix == ".tgz":
        !tar -xvf {path} -C {prefix}
    elif path.suffix == ".zip":
        !unzip -o {path} -d {prefix}
# Rename the extracted files to the <split>.raw.<lang> naming used below.
!mv {prefix/'raw.en'} {prefix/'train_dev.raw.en'}
!mv {prefix/'raw.zh'} {prefix/'train_dev.raw.zh'}
!mv {prefix/'test.en'} {prefix/'test.raw.en'}
!mv {prefix/'test.zh'} {prefix/'test.raw.zh'}

# Translation direction: English -> Chinese.
src_lang = 'en'
tgt_lang = 'zh'

data_prefix = f'{prefix}/train_dev.raw'
test_prefix = f'{prefix}/test.raw'

# Clean both corpora (for zh this includes full-width -> half-width conversion).
# The test set is cleaned with the ratio/length filters switched off.
clean_corpus(data_prefix, src_lang, tgt_lang)
clean_corpus(test_prefix, src_lang, tgt_lang, ratio=-1, min_len=-1, max_len=-1)

# Split the cleaned train_dev corpus into train and valid sets.
valid_ratio = 0.01 # 3000~4000 sentences are enough for validation
train_ratio = 1 - valid_ratio
if (prefix/f'train.clean.{src_lang}').exists() \
and (prefix/f'train.clean.{tgt_lang}').exists() \
and (prefix/f'valid.clean.{src_lang}').exists() \
and (prefix/f'valid.clean.{tgt_lang}').exists():
    print(f'train/valid splits exists. skipping split.')
else:
    # Count the sentence pairs; the file is closed as soon as it has been read.
    with open(f'{data_prefix}.clean.{src_lang}', 'r', encoding='utf-8') as src_f:
        line_num = sum(1 for line in src_f)
    # A shuffled permutation of the line indices decides, per line, which split
    # it goes to. The same labels are reused for both languages so the two
    # sides of every sentence pair stay aligned.
    labels = list(range(line_num))
    random.shuffle(labels)
    for lang in [src_lang, tgt_lang]:
        # Write next to the other dataset files (under prefix), which is where
        # the existence check above and the later steps look for the splits.
        with open(f'{data_prefix}.clean.{lang}', 'r', encoding='utf-8') as in_f, \
             open(prefix/f'train.clean.{lang}', 'w', encoding='utf-8') as train_f, \
             open(prefix/f'valid.clean.{lang}', 'w', encoding='utf-8') as valid_f:
            for count, line in enumerate(in_f):
                if labels[count]/line_num < train_ratio:
                    train_f.write(line)
                else:
                    valid_f.write(line)

# SentencePiece: train a shared subword model (unless one already exists),
# then encode the train/valid/test files of both languages with it.
vocab_size = 8000
if not (prefix/f'spm{vocab_size}.model').exists():
    spm.SentencePieceTrainer.train(
        # train and valid sentences of both languages, source language first
        input=','.join(
            f'{prefix}/{name}.clean.{lang}'
            for lang in (src_lang, tgt_lang)
            for name in ('train', 'valid')
        ),
        model_prefix=prefix/f'spm{vocab_size}',
        vocab_size=vocab_size,
        character_coverage=1,
        model_type='unigram', # 'bpe' works as well
        input_sentence_size=1e6,
        shuffle_input_sentence=True,
        normalization_rule_name='nmt_nfkc_cf',
    )
else:
    print(f'{prefix}/spm{vocab_size}.model exists. skipping spm_train.')

spm_model = spm.SentencePieceProcessor(model_file=str(prefix/f'spm{vocab_size}.model'))

# Input file stem (without the language suffix) for every split.
in_tag = {
    'train': 'train.clean',
    'valid': 'valid.clean',
    'test': 'test.raw.clean',
}
for split in ['train', 'valid', 'test']:
    for lang in [src_lang, tgt_lang]:
        out_path = prefix/f'{split}.{lang}'
        if out_path.exists():
            print(f"{out_path} exists. skipping spm_encode.")
            continue
        # One line of space-separated subword pieces per input sentence.
        with open(out_path, 'w') as out_f:
            with open(prefix/f'{in_tag[split]}.{lang}', 'r') as in_f:
                for line in in_f:
                    tok = spm_model.encode(line.strip(), out_type=str)
                    out_f.write(' '.join(tok) + '\n')

# Binarize data
# Run fairseq's preprocess CLI on the subword-encoded train/valid/test files,
# using a dictionary joined across both languages. Skipped when the output
# directory already exists.
binpath = Path('./DATA/data-bin', dataset_name)
if binpath.exists():
    print(binpath, "exists, will not overwrite!")
else:
    !python -m fairseq_cli.preprocess \
        --source-lang {src_lang}\
        --target-lang {tgt_lang}\
        --trainpref {prefix/'train'}\
        --validpref {prefix/'valid'}\
        --testpref {prefix/'test'}\
        --destdir {binpath}\
        --joined-dictionary\
        --workers 2

raw.en
raw.zh
test.en
test.zh
2021-04-30 15:23:56 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='DATA/data-bin/ted2020', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=True, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcdict=None, suppress_crashes=False, target_lang='zh', task='translation', tensorboard_logdir=None, testpref='/content/DATA/rawdata/ted2020/test', tgtdict=None, threshold_loss

In [None]:
# Experiment configuration: paths, batching, learning-rate schedule, decoding and logging.
config = Namespace(
    datadir = "./DATA/data-bin/ted2020",
    savedir = "./drive/MyDrive",
    source_lang = "en",
    target_lang = "zh",
    
    # cpu threads when fetching & processing data.
    num_workers=2,  
    # batch size in terms of tokens. gradient accumulation increases the effective batchsize.
    max_tokens=8192,
    accum_steps=2,
    
    # the lr is calculated from the Noam lr scheduler. you can tune the maximum lr by this factor.
    lr_factor=2.,
    lr_warmup=4000,
    
    # clipping gradient norm helps alleviate gradient exploding
    clip_norm=1.0,
    
    # maximum epochs for training
    max_epoch=40,
    start_epoch=1,
    
    # beam size for beam search
    beam=5, 
    # generate sequences of maximum length ax + b, where x is the source length
    max_len_a=1.2, 
    max_len_b=10,
    # when decoding, post process sentence by removing sentencepiece symbols.
    post_process = "sentencepiece",
    
    # checkpoints
    keep_last_epochs=5,
    resume=None, # e.g. "checkpoint_last.pt": checkpoint name (under config.savedir) to resume from
    
    # logging
    use_wandb=False,
)
# Model architecture hyper-parameters (encoder/decoder sizes, embedding sharing, dropout).
arch_args = Namespace(
    encoder_embed_dim=512,
    encoder_ffn_embed_dim=2048,
    encoder_layers=6,
    decoder_embed_dim=512,
    decoder_ffn_embed_dim=2048,
    decoder_layers=6,
    share_decoder_input_output_embed=True,
    dropout=0.1,
)

In [None]:
# Project name, used for the logger and for the wandb project.
proj = "hw5.seq2seq"

# Send log records to stdout so they show up in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level="INFO", # "DEBUG" "WARNING" "ERROR"
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(proj)

# wandb is imported and initialised only when it is enabled in the config.
if config.use_wandb:
    import wandb
    wandb.init(project=proj, name=Path(config.savedir).stem, config=config)

# Model

In [None]:
class Seq2Seq(FairseqEncoderDecoderModel):
    """Encoder-decoder model that also keeps the arguments it was built with."""

    def __init__(self, args, encoder, decoder):
        super().__init__(encoder, decoder)
        self.args = args

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        return_all_hiddens: bool = True,
    ):
        """
        Run the encoder on the source tokens, then the decoder on the previous
        output tokens and the encoder output.

        Returns the decoder's ``(logits, extra)`` pair.
        """
        encoded = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens,
        )
        decoder_logits, decoder_extra = self.decoder(
            prev_output_tokens,
            encoder_out=encoded,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens,
        )
        return decoder_logits, decoder_extra

## Loss: Label Smoothing Regularization
* 讓模型學習輸出較不集中的分佈，防止模型過度自信
* 有時候Ground Truth並非唯一答案，所以在算loss時，我們會保留一部份機率給正確答案以外的label
* 可以有效防止過度擬合

code [source](https://fairseq.readthedocs.io/en/latest/_modules/fairseq/criterions/label_smoothed_cross_entropy.html)

In [None]:
class LabelSmoothedCrossEntropyCriterion(nn.Module):
    """Cross-entropy on log-probabilities with label smoothing.

    Args:
        smoothing: share of the probability mass spread over all labels.
        ignore_index: target value whose positions contribute zero loss
            (no masking when ``None``).
        reduce: sum the loss over all positions when true.
    """

    def __init__(self, smoothing, ignore_index=None, reduce=True):
        super().__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index
        self.reduce = reduce

    def forward(self, lprobs, target):
        # Give the target a trailing axis so it can index lprobs with gather.
        if lprobs.dim() - target.dim() == 1:
            target = target.unsqueeze(-1)
        # Negative log-likelihood of the target label: the cross-entropy for a
        # one-hot target, same as F.nll_loss.
        nll_loss = lprobs.gather(dim=-1, index=target).neg()
        # Smoothing term: part of the target's probability is given to the other
        # labels, so the cross-entropy adds up the log-probs of all labels.
        smooth_loss = lprobs.sum(dim=-1, keepdim=True).neg()
        if self.ignore_index is None:
            nll_loss = nll_loss.squeeze(-1)
            smooth_loss = smooth_loss.squeeze(-1)
        else:
            # Positions holding ignore_index contribute nothing to either term.
            pad_mask = target.eq(self.ignore_index)
            nll_loss = nll_loss.masked_fill(pad_mask, 0.0)
            smooth_loss = smooth_loss.masked_fill(pad_mask, 0.0)
        if self.reduce:
            nll_loss = nll_loss.sum()
            smooth_loss = smooth_loss.sum()
        # Combine the target term with the share distributed over all labels.
        eps_i = self.smoothing / lprobs.size(-1)
        return (1.0 - self.smoothing) * nll_loss + eps_i * smooth_loss

class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every update.

    The rate for step ``s`` is
    ``factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)``,
    and ``0`` while ``s`` is still zero.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self.optimizer = optimizer
        self._step = 0
        self._rate = 0

    @property
    def param_groups(self):
        """Parameter groups of the wrapped optimizer."""
        return self.optimizer.param_groups

    def multiply_grads(self, c):
        """Scale every existing gradient in place by the constant *c*."""
        for group in self.param_groups:
            for param in group['params']:
                if param.grad is None:
                    continue
                param.grad.data.mul_(c)

    def step(self):
        """Advance the step counter, refresh the learning rate, then update."""
        self._step += 1
        new_rate = self.rate()
        for group in self.param_groups:
            group['lr'] = new_rate
        self._rate = new_rate
        self.optimizer.step()

    def rate(self, step = None):
        """Learning rate at *step* (defaults to the current internal step)."""
        current = self._step if step is None else step
        if not current:
            return 0
        decay = current ** (-0.5)
        ramp = current * self.warmup ** (-1.5)
        return self.factor * (self.model_size ** (-0.5) * min(decay, ramp))

# Main

In [None]:
# Switch: when True, copy the checkpoints folder saved on the mounted Google Drive
# into the current working directory and list what is now available.
toload = True
if toload:
    ! cp -r /content/drive/MyDrive/ML/5/checkpoints .
    ! ls checkpoints

rnn  rnn-back  rnn-final


In [None]:
# --- Task: where the binarized data lives and which direction to translate ---
task_cfg = TranslationConfig(
    data=config.datadir,
    source_lang=config.source_lang,
    target_lang=config.target_lang,
    train_subset="train",
    dataset_impl="mmap",
    required_seq_len_multiple=8,
    upsample_primary=1,
)
task = TranslationTask.setup_task(task_cfg)

# --- Data: combine=True is for the case where back-translation data is present ---
logger.info("loading data for epoch 1")
task.load_dataset(split="train", epoch=1, combine=True)
task.load_dataset(split="valid", epoch=1)

# --- Model ---
add_transformer_args(arch_args)
if config.use_wandb:
    wandb.config.update(vars(arch_args))
model = build_model(arch_args, task).to(device)
logger.info(model)

# --- Loss: label smoothing, padding positions are ignored ---
criterion = LabelSmoothedCrossEntropyCriterion(
    smoothing=0.1,
    ignore_index=task.target_dictionary.pad(),
).to(device)

# --- Optimizer: AdamW starts at lr=0; NoamOpt sets the rate on every step ---
base_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.0001,
)
optimizer = NoamOpt(
    model_size=arch_args.encoder_embed_dim,
    factor=config.lr_factor,
    warmup=config.lr_warmup,
    optimizer=base_optimizer,
)

# --- Decoding ---
sequence_generator = task.build_generator([model], config)


2021-04-29 08:24:03 | INFO | fairseq.tasks.translation | [en] dictionary: 7984 types
2021-04-29 08:24:04 | INFO | fairseq.tasks.translation | [zh] dictionary: 7984 types
2021-04-29 08:24:04 | INFO | hw5.seq2seq | loading data for epoch 1
2021-04-29 08:24:04 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020/train.en-zh.en
2021-04-29 08:24:04 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020/train.en-zh.zh
2021-04-29 08:24:04 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 train en-zh 390041 examples
2021-04-29 08:24:04 | INFO | fairseq.data.data_utils | loaded 3,939 examples from: ./DATA/data-bin/ted2020/valid.en-zh.en
2021-04-29 08:24:04 | INFO | fairseq.data.data_utils | loaded 3,939 examples from: ./DATA/data-bin/ted2020/valid.en-zh.zh
2021-04-29 08:24:04 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 valid en-zh 3939 examples
2021-04-29 08:24:12 | INFO | hw5.seq2seq | Seq2Seq(
  

In [None]:
# Log which classes make up this run, one line per component.
for _label, _component in (
    ("task", task),
    ("encoder", model.encoder),
    ("decoder", model.decoder),
    ("criterion", criterion),
    ("optimizer", optimizer),
):
    logger.info("{}: {}".format(_label, _component.__class__.__name__))

# Parameter counts: all parameters versus those that receive gradients.
_total_params = sum(p.numel() for p in model.parameters())
_trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"num. model params: {_total_params:,} (num. trained: {_trained_params:,})")

logger.info(f"max tokens per batch = {config.max_tokens}, accumulate steps = {config.accum_steps}")

2021-04-27 16:38:16 | INFO | hw5.seq2seq | task: TranslationTask
2021-04-27 16:38:16 | INFO | hw5.seq2seq | encoder: TransformerEncoder
2021-04-27 16:38:16 | INFO | hw5.seq2seq | decoder: TransformerDecoder
2021-04-27 16:38:16 | INFO | hw5.seq2seq | criterion: LabelSmoothedCrossEntropyCriterion
2021-04-27 16:38:16 | INFO | hw5.seq2seq | optimizer: NoamOpt
2021-04-27 16:38:16 | INFO | hw5.seq2seq | num. model params: 52,316,160 (num. trained: 52,316,160)
2021-04-27 16:38:16 | INFO | hw5.seq2seq | max tokens per batch = 8192, accumulate steps = 2


In [None]:
# Build the iterator over the "train" split starting at config.start_epoch, then
# try to resume from the checkpoint named by config.resume.
epoch_itr = load_data_iterator(task, "train", config.start_epoch, config.max_tokens, config.num_workers)
try_load_checkpoint(model, optimizer, name=config.resume)
# Keep going until the next epoch index would exceed config.max_epoch.
while epoch_itr.next_epoch_idx <= config.max_epoch:
    # train for one epoch
    train_one_epoch(epoch_itr, model, task, criterion, optimizer, config.accum_steps)
    # validate after every epoch; the helper's name indicates it also saves a checkpoint
    stats = validate_and_save(model, task, criterion, optimizer, epoch=epoch_itr.epoch)
    logger.info("end of epoch {}".format(epoch_itr.epoch))    
    # a fresh iterator is created for each following epoch
    epoch_itr = load_data_iterator(task, "train", epoch_itr.next_epoch_idx, config.max_tokens, config.num_workers)

NameError: ignored

# Submission 繳交檔案

In [None]:
# 把幾個 checkpoint 平均起來可以達到 ensemble 的效果
checkdir=config.savedir
!python ./fairseq/scripts/average_checkpoints.py \
--inputs {checkdir} \
--num-epoch-checkpoints 5 \
--output {checkdir}/avg_last_5_checkpoint.pt

try_load_checkpoint(model, name="avg_last_5_checkpoint.pt")
validate(model, task, criterion, log_to_wandb=False)
generate_prediction(model, task)

Namespace(checkpoint_upper_bound=None, inputs=['./drive/MyDrive'], num_epoch_checkpoints=5, num_update_checkpoints=None, output='./drive/MyDrive/avg_last_5_checkpoint.pt')
Traceback (most recent call last):
  File "./fairseq/scripts/average_checkpoints.py", line 158, in <module>
    main()
  File "./fairseq/scripts/average_checkpoints.py", line 147, in main
    upper_bound=args.checkpoint_upper_bound,
  File "./fairseq/scripts/average_checkpoints.py", line 94, in last_n_checkpoints
    "Found {} checkpoint files but need at least {}", len(entries), n
Exception: ('Found {} checkpoint files but need at least {}', 1, 5)
2021-04-30 15:36:18 | INFO | hw5.seq2seq | loaded checkpoint drive/MyDrive/avg_last_5_checkpoint.pt: step=unknown loss=2.9752097129821777 bleu=29.37687676994526
2021-04-30 15:36:18 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-30 15:36:53 | INFO | hw5.seq2seq | example source: this is better than people , again .
2021-04-30 15:36:53 | INFO | hw5.seq2seq | example hypothesis: 這比人還好 , 再說一次 ,
2021-04-30 15:36:53 | INFO | hw5.seq2seq | example reference: 這再次證明它比人類優秀 。
2021-04-30 15:36:53 | INFO | hw5.seq2seq | validation loss:	2.9429
2021-04-30 15:36:53 | INFO | hw5.seq2seq | BLEU = 29.88 60.9/36.9/23.7/16.1 (BP = 0.983 ratio = 0.983 hyp_len = 108857 ref_len = 110726)
2021-04-30 15:36:53 | INFO | fairseq.data.data_utils | loaded 4,000 examples from: ./DATA/data-bin/ted2020/test.en-zh.en
2021-04-30 15:36:53 | INFO | fairseq.data.data_utils | loaded 4,000 examples from: ./DATA/data-bin/ted2020/test.en-zh.zh
2021-04-30 15:36:53 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 test en-zh 4000 examples


HBox(children=(FloatProgress(value=0.0, description='prediction', max=17.0, style=ProgressStyle(description_wi…




In [None]:
# Switch: when True, copy the local checkpoints folder to the mounted Google Drive.
tosave = True
if tosave:
    ! cp -r ./checkpoints /content/drive/MyDrive/ML/5/

# Back-translation

## 訓練一個反向的翻譯模型

1. 將實驗的參數設定表中(config)的source_lang與target_lang互相交換
2. 將實驗的參數設定表中(config)的savedir更改(ex. "./checkpoints/rnn-back")
3. 訓練一個反向模型

In [None]:
# Show the settings before and after switching to the reverse (zh -> en) direction.
print(config)

# Rebind `config` to a new Namespace holding the same values, then override
# the translation direction and the checkpoint directory on that copy.
config = Namespace(**vars(config))
config.savedir = './checkpoints/rnn-back'
config.source_lang = 'zh'
config.target_lang = 'en'

print(config)

Namespace(accum_steps=2, beam=5, clip_norm=1.0, datadir='./DATA/data-bin/ted2020', keep_last_epochs=5, lr_factor=2.0, lr_warmup=4000, max_epoch=40, max_len_a=1.2, max_len_b=10, max_tokens=8192, num_workers=2, post_process='sentencepiece', resume=None, savedir='./checkpoints/rnn', source_lang='en', start_epoch=1, target_lang='zh', use_wandb=False)
Namespace(accum_steps=2, beam=5, clip_norm=1.0, datadir='./DATA/data-bin/ted2020', keep_last_epochs=5, lr_factor=2.0, lr_warmup=4000, max_epoch=40, max_len_a=1.2, max_len_b=10, max_tokens=8192, num_workers=2, post_process='sentencepiece', resume=None, savedir='./checkpoints/rnn-back', source_lang='zh', start_epoch=1, target_lang='en', use_wandb=False)


In [None]:
# --- Task: same data directory, direction taken from the current config ---
task_cfg = TranslationConfig(
    data=config.datadir,
    source_lang=config.source_lang,
    target_lang=config.target_lang,
    train_subset="train",
    dataset_impl="mmap",
    required_seq_len_multiple=8,
    upsample_primary=1,
)
task = TranslationTask.setup_task(task_cfg)

# --- Data: combine=True is for the case where back-translation data is present ---
logger.info("loading data for epoch 1")
task.load_dataset(split="train", epoch=1, combine=True)
task.load_dataset(split="valid", epoch=1)

# --- Model ---
add_transformer_args(arch_args)
if config.use_wandb:
    wandb.config.update(vars(arch_args))
model = build_model(arch_args, task).to(device)

# --- Loss: label smoothing, padding positions are ignored ---
criterion = LabelSmoothedCrossEntropyCriterion(
    smoothing=0.1,
    ignore_index=task.target_dictionary.pad(),
).to(device)

# --- Decoding ---
sequence_generator = task.build_generator([model], config)

# --- Optimizer: AdamW starts at lr=0; NoamOpt sets the rate on every step ---
base_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.0001,
)
optimizer = NoamOpt(
    model_size=arch_args.encoder_embed_dim,
    factor=config.lr_factor,
    warmup=config.lr_warmup,
    optimizer=base_optimizer,
)


2021-04-30 15:32:12 | INFO | fairseq.tasks.translation | [en] dictionary: 7984 types
2021-04-30 15:32:12 | INFO | fairseq.tasks.translation | [zh] dictionary: 7984 types
2021-04-30 15:32:12 | INFO | hw5.seq2seq | loading data for epoch 1
2021-04-30 15:32:12 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020/train.en-zh.en
2021-04-30 15:32:12 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020/train.en-zh.zh
2021-04-30 15:32:12 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 train en-zh 390041 examples
2021-04-30 15:32:12 | INFO | fairseq.data.data_utils | loaded 3,939 examples from: ./DATA/data-bin/ted2020/valid.en-zh.en
2021-04-30 15:32:12 | INFO | fairseq.data.data_utils | loaded 3,939 examples from: ./DATA/data-bin/ted2020/valid.en-zh.zh
2021-04-30 15:32:12 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 valid en-zh 3939 examples


In [None]:
# Train the reverse-direction model: build the iterator over the "train" split
# starting at config.start_epoch, then try to resume from config.resume.
epoch_itr = load_data_iterator(task, "train", config.start_epoch, config.max_tokens, config.num_workers)
try_load_checkpoint(model, optimizer, name=config.resume)
# Keep going until the next epoch index would exceed config.max_epoch.
while epoch_itr.next_epoch_idx <= config.max_epoch:
    # train for one epoch
    train_one_epoch(epoch_itr, model, task, criterion, optimizer, config.accum_steps)
    # validate after every epoch; the helper's name indicates it also saves a checkpoint
    stats = validate_and_save(model, task, criterion, optimizer, epoch=epoch_itr.epoch)
    logger.info("end of epoch {}".format(epoch_itr.epoch))    
    # a fresh iterator is created for each following epoch
    epoch_itr = load_data_iterator(task, "train", epoch_itr.next_epoch_idx, config.max_tokens, config.num_workers)

2021-04-29 08:25:09 | INFO | hw5.seq2seq | loaded checkpoint checkpoints/rnn-back/checkpoint_last.pt: step=23917 loss=2.6079976558685303 bleu=20.369404143481926


HBox(children=(FloatProgress(value=0.0, description='train epoch 1', max=809.0, style=ProgressStyle(descriptio…

2021-04-29 08:39:34 | INFO | hw5.seq2seq | training loss: 1.9948
2021-04-29 08:39:34 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=27.0, style=ProgressStyle(description_wi…

2021-04-29 08:40:20 | INFO | hw5.seq2seq | example source: 我覺得有點可笑 , 因為我站在這舞台上把我的時間花在告訴大家一個百年前的老故事 , 關於一個軟綿綿童玩的誕生 。 但我認為泰迪熊發明的故事裡頭還有一個更重要的故事 , 一個我們的想法能讓自然大幅改變的故事 , 同樣地 , 如今在地球上我們述說的故事也大幅改變了自然 。
2021-04-29 08:40:20 | INFO | hw5.seq2seq | example hypothesis: and i'm a bit ridiculous , because i'm standing here on this stage and spending my time telling you a hundred years ago's old story of a sponge kid , but i think teddy bear has a much more important story , a much more important story , an idea , that changes the nature of the story , and , again , as we tell it on earth , stories are changing nature dramatically .
2021-04-29 08:40:20 | INFO | hw5.seq2seq | example reference: and i do feel a little ridiculous that i'm up here on this stage and i'm choosing to use my time to tell you about a 100yearold story about the invention of a squishy kid's toy , but i'd argue that the invention of the teddy bear , inside that story is a more important story , a story about how dramatically our 

HBox(children=(FloatProgress(value=0.0, description='train epoch 2', max=809.0, style=ProgressStyle(descriptio…

KeyboardInterrupt: ignored

In [None]:
# Back up the reverse-direction model's checkpoint directory to the mounted Google Drive.
!cp -r checkpoints/rnn-back /content/drive/MyDrive/ML/5/

## 利用反向模型生成額外資料

### 下載 monolingual data

In [None]:
mono_dataset_name = 'mono'
mono_prefix = Path(data_dir).absolute() / mono_dataset_name
mono_prefix.mkdir(parents=True, exist_ok=True)

urls = (
    '"https://onedrive.live.com/download?cid=3E549F3B24B238B4&resid=3E549F3B24B238B4%214986&authkey=AANUKbGfZx0kM80"',
# # If the above links die, use the following instead. 
#     "https://www.csie.ntu.edu.tw/~r09922057/ML2021-hw5/ted_zh_corpus.deduped.gz",
# # If the above links die, use the following instead. 
#     "https://mega.nz/#!vMNnDShR!4eHDxzlpzIpdpeQTD-htatU_C7QwcBTwGDaSeBqH534",
)
file_names = (
    'ted_zh_corpus.deduped.gz',
)


for u, f in zip(urls, file_names):
    path = mono_prefix/f
    if not path.exists():
        if 'mega' in u:
            !megadl {u} --path {path}
        else:
            !wget {u} -O {path}
    else:
        print(f'{f} is exist, skip downloading')
    if path.suffix == ".tgz":
        !tar -xvf {path} -C {prefix}
    elif path.suffix == ".zip":
        !unzip -o {path} -d {prefix}
    elif path.suffix == ".gz":
        !gzip -fkd {path}

--2021-04-29 08:53:59--  https://onedrive.live.com/download?cid=3E549F3B24B238B4&resid=3E549F3B24B238B4%214986&authkey=AANUKbGfZx0kM80
Resolving onedrive.live.com (onedrive.live.com)... 13.107.42.13
Connecting to onedrive.live.com (onedrive.live.com)|13.107.42.13|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://zla8og.dm.files.1drv.com/y4mR3OR6lyPSTlGW3C2T1Js3dn8jkGI4LkSuUa5vYSGCoxVVe7MwT7bvRsL3djJkIximQmD5raVYIVgP5rNVDOCp_53iMxO6IQPeI9wzL3MLazKe-mBdgxMrgco_0pwO3GWOqiSuV3GZabw-iFDdb-rblV73G6OmXxfzae3tIPGBR1MATL6qDGLaCsVSsK2KIjaYHpudz-Q4quj5ajUScgfpQ/ted_zh_corpus.deduped.gz?download&psid=1 [following]
--2021-04-29 08:53:59--  https://zla8og.dm.files.1drv.com/y4mR3OR6lyPSTlGW3C2T1Js3dn8jkGI4LkSuUa5vYSGCoxVVe7MwT7bvRsL3djJkIximQmD5raVYIVgP5rNVDOCp_53iMxO6IQPeI9wzL3MLazKe-mBdgxMrgco_0pwO3GWOqiSuV3GZabw-iFDdb-rblV73G6OmXxfzae3tIPGBR1MATL6qDGLaCsVSsK2KIjaYHpudz-Q4quj5ajUScgfpQ/ted_zh_corpus.deduped.gz?download&psid=1
Resolving zla8og.dm.files.1drv.com (

### TODO: 清理資料集

1. 將太長、太短的句子移除
2. 統一標點符號

hint: 可以使用clean_s()來協助

In [None]:
# Path of the decompressed corpus: the archive name minus its 3-character ".gz" suffix.
file_name = f"{mono_prefix}/{file_names[0][:-3]}"
print(file_name)

/content/DATA/rawdata/mono/ted_zh_corpus.deduped


In [None]:
# Clean the monolingual corpus line by line and drop sentences whose length
# falls outside [min_len, max_len]. Skipped when the cleaned file already exists.
if Path(f'{file_name}.clean').exists():
    print(f'{file_name}.clean exists. skipping clean.')
else:
    l1 = 'zh'
    max_len=1000; min_len=1
    # The corpus is Chinese text: read and write it as UTF-8 explicitly instead
    # of relying on the platform's default encoding.
    with open(f'{file_name}', 'r', encoding='utf-8') as l1_in_f, \
         open(f'{file_name}.clean', 'w', encoding='utf-8') as l1_out_f:
        for s1 in l1_in_f:
            s1 = s1.strip()
            s1 = clean_s(s1, l1)
            s1_len = len_s(s1, l1)
            if min_len > 0 and s1_len < min_len: # remove short sentence
                continue
            if max_len > 0 and s1_len > max_len: # remove long sentence
                continue
            print(s1, file=l1_out_f)

In [None]:
# Compare the first five lines of the raw corpus with the first five cleaned lines.
! head {file_name} -n 5
! head {file_name}.clean -n 5

在 16 世紀中葉 意大利人被一種男歌手迷住了 那種男歌手的音域廣闊，包含的音高 先前是一般成年男性不可能達到的
但是，這天賦有一個很高的代價
要防止他們變聲 這些歌手在青春期前被閹割 來停止荷爾蒙的變化， 以免他們的聲線變低沉
被稱為「閹伶」，他們輕輕的、 天使般的聲音在整個歐洲很有名 直到這個殘酷的程序， 在 19 世紀被禁止
雖然阻止聲帶的成長， 可以產生一個非凡廣闊的音域 但自然發展的聲音， 已經具有極多的可能性
在16世紀中葉意大利人被一種男歌手迷住了那種男歌手的音域廣闊 , 包含的音高先前是一般成年男性不可能達到的
但是 , 這天賦有一個很高的代價
要防止他們變聲這些歌手在青春期前被閹割來停止荷爾蒙的變化 , 以免他們的聲線變低沉
被稱為 「 閹伶 」 , 他們輕輕的、天使般的聲音在整個歐洲很有名直到這個殘酷的程序 , 在19世紀被禁止
雖然阻止聲帶的成長 , 可以產生一個非凡廣闊的音域但自然發展的聲音 , 已經具有極多的可能性


### TODO: Subword Units

用反向模型的 spm model 將資料切成 subword units

hint: spm model 的路徑為 DATA/rawdata/\[dataset\]/spm\[vocab_num\].model

In [None]:
import sentencepiece as spm
# Vocabulary size; it is part of the sentencepiece model file name (spm{vocab_size}.model).
vocab_size = 8000

In [None]:
# Split the cleaned Chinese corpus into subword units with the existing
# sentencepiece model. For every Chinese line a "." is also written to the
# English side, so both sides of the pair have the same number of lines.
spm_model = spm.SentencePieceProcessor(model_file=str(prefix/f'spm{vocab_size}.model'))
in_tag = f"{file_name}.clean"
split, lang = 'mono.tok', 'zh'
out_path = mono_prefix/f'{split}.{lang}'
if out_path.exists():
    print(f"{out_path} exists. skipping spm_encode.")
else:
    # Explicit UTF-8: the text is Chinese and must not depend on the platform's
    # default encoding. `out_path` is the same file the guard above checks.
    with open(out_path, 'w', encoding='utf-8') as out_f, \
         open(mono_prefix/f'{split}.en', 'w', encoding='utf-8') as pseudo_out, \
         open(in_tag, 'r', encoding='utf-8') as in_f:
        for line in in_f:
            line = line.strip()
            tok = spm_model.encode(line, out_type=str)
            print(' '.join(tok), file=out_f)
            print('.', file=pseudo_out)

In [None]:
# First five lines of: the placeholder English side, the subword-tokenized
# Chinese side, and the cleaned input they were produced from.
!head {mono_prefix}/{split}.en -n 5
!head {mono_prefix}/{split}.{lang} -n 5
!head {in_tag} -n 5

.
.
.
.
.
▁在 16 世紀 中 葉 意 大 利 人 被 一種 男 歌 手 迷 住 了 那種 男 歌 手 的 音 域 廣 闊 ▁, ▁ 包 含 的 音 高 先 前 是 一般 成 年 男性 不可能 達到 的
▁但是 ▁, ▁這 天 賦 有一個 很 高 的 代 價
▁要 防 止 他們 變 聲 這些 歌 手 在 青 春 期 前 被 閹 割 來 停 止 荷 爾 蒙 的 變化 ▁, ▁以 免 他們的 聲 線 變 低 沉
▁ 被 稱為 ▁「 ▁ 閹 伶 ▁」 ▁, ▁他們 輕 輕 的 、 天 使 般 的 聲音 在 整個 歐 洲 很 有 名 直 到 這個 殘 酷 的 程 序 ▁, ▁在 19 世紀 被 禁 止
▁雖然 阻 止 聲 帶 的 成長 ▁, ▁ 可以 產生 一個 非 凡 廣 闊 的 音 域 但 自然 發展 的 聲音 ▁, ▁ 已經 具有 極 多 的 可能 性
在16世紀中葉意大利人被一種男歌手迷住了那種男歌手的音域廣闊 , 包含的音高先前是一般成年男性不可能達到的
但是 , 這天賦有一個很高的代價
要防止他們變聲這些歌手在青春期前被閹割來停止荷爾蒙的變化 , 以免他們的聲線變低沉
被稱為 「 閹伶 」 , 他們輕輕的、天使般的聲音在整個歐洲很有名直到這個殘酷的程序 , 在19世紀被禁止
雖然阻止聲帶的成長 , 可以產生一個非凡廣闊的音域但自然發展的聲音 , 已經具有極多的可能性


### Binarize

使用fairseq將資料轉為binary

In [None]:
binpath = Path('./DATA/data-bin', mono_dataset_name)
src_dict_file = './DATA/data-bin/ted2020/dict.en.txt'
tgt_dict_file = src_dict_file
monopref = str(mono_prefix/"mono.tok") # whatever filepath you get after applying subword tokenization
if binpath.exists():
    print(binpath, "exists, will not overwrite!")
else:
    !python -m fairseq_cli.preprocess\
        --source-lang 'zh'\
        --target-lang 'en'\
        --trainpref {monopref}\
        --destdir {binpath}\
        --srcdict {src_dict_file}\
        --tgtdict {tgt_dict_file}\
        --workers 2

2021-04-29 08:54:43 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='DATA/data-bin/mono', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='zh', srcdict='./DATA/data-bin/ted2020/dict.en.txt', suppress_crashes=False, target_lang='en', task='translation', tensorboard_logdir=None, testpref=None, tgtdict='./DATA/data-bin/ted2020/dict.en.txt', threshold_lo

### TODO: 生成反向翻譯資料

將 binarized data 加入原本的資料夾中並用一個 split_name 取名

ex. ./DATA/data-bin/ted2020/\[split_name\].zh-en.\["en", "zh"\].\["bin", "idx"\]

便可以使用 generate_prediction(model, task, split="split_name")來產生翻譯資料

In [None]:
# 將 binarized data 加入原本的資料夾中並用一個 split_name 取名
# ex. ./DATA/data-bin/ted2020/\[split_name\].zh-en.\["en", "zh"\].\["bin", "idx"\]
!cp ./DATA/data-bin/mono/train.zh-en.zh.bin ./DATA/data-bin/ted2020/mono.zh-en.zh.bin
!cp ./DATA/data-bin/mono/train.zh-en.zh.idx ./DATA/data-bin/ted2020/mono.zh-en.zh.idx
!cp ./DATA/data-bin/mono/train.zh-en.en.bin ./DATA/data-bin/ted2020/mono.zh-en.en.bin
!cp ./DATA/data-bin/mono/train.zh-en.en.idx ./DATA/data-bin/ted2020/mono.zh-en.en.idx

In [None]:
# Run the reverse-direction model on split='mono' to produce the prediction file.
#
# The hypotheses go to mono.predict.en, the file the later tokenization step
# reads. Writing them directly onto mono.tok.en (as before) made that step
# report "exists. skipping spm_encode", so the predictions were binarized
# without being split into subword units.
prediction_file = mono_prefix/'mono.predict.en'
generate_prediction(model, task, split="mono", outfile=str(prediction_file))

# mono.tok.en still holds the "." placeholder lines written next to
# mono.tok.zh. Remove it so the tokenization step rebuilds it from the
# predictions instead of skipping.
placeholder_file = mono_prefix/'mono.tok.en'
if placeholder_file.exists():
    placeholder_file.unlink()

2021-04-29 08:55:52 | INFO | fairseq.data.data_utils | loaded 781,713 examples from: ./DATA/data-bin/ted2020/mono.zh-en.zh
2021-04-29 08:55:52 | INFO | fairseq.data.data_utils | loaded 781,713 examples from: ./DATA/data-bin/ted2020/mono.zh-en.en
2021-04-29 08:55:52 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020 mono zh-en 781713 examples


HBox(children=(FloatProgress(value=0.0, description='prediction', max=1726.0, style=ProgressStyle(description_…




### TODO: 產生新的dataset

1. 將翻譯出來的資料與原先的訓練資料結合
2. 使用之前的spm model切出成Subword Units
3. 重新使用fairseq將資料轉為binary

In [None]:
# Inspect the monolingual data directory (sizes and timestamps of the generated files).
! ls -al DATA/rawdata/mono

total 232896
drwxr-xr-x 2 root root     4096 Apr 29 08:54 .
drwxr-xr-x 4 root root     4096 Apr 29 08:53 ..
-rw-r--r-- 1 root root 51316332 Apr 29 10:31 mono.tok.en
-rw-r--r-- 1 root root 66901109 Apr 29 08:54 mono.tok.zh
-rw-r--r-- 1 root root 49733911 Feb 14 12:26 ted_zh_corpus.deduped
-rw-r--r-- 1 root root 48803288 Apr 29 08:54 ted_zh_corpus.deduped.clean
-rw-r--r-- 1 root root 21709855 Feb 14 12:26 ted_zh_corpus.deduped.gz


In [None]:
# 合併剛剛生成的 prediction_file (.en) 以及中文 mono.zh (.zh)
# 
# hint: 在此用剛剛的 spm model 對 prediction_file 進行切斷詞
in_tag = f"{mono_prefix}/mono.predict.en"
split, lang = 'mono.tok', 'en'
out_path = mono_prefix/f'{split}.{lang}'
if out_path.exists():
    print(f"{out_path} exists. skipping spm_encode.")
else:
    with open(out_path, 'w') as out_f:
        with open(in_tag, 'r') as in_f:
            for line in in_f:
                line = line.strip()
                tok = spm_model.encode(line, out_type=str)
                print(' '.join(tok), file=out_f)
# output: ./DATA/rawdata/mono/mono.tok.en & mono.tok.zh
#
# hint: 在此用 fairseq 把這些檔案再 binarize
binpath = Path('./DATA/data-bin/synthetic')
src_dict_file = './DATA/data-bin/ted2020/dict.en.txt'
tgt_dict_file = src_dict_file
monopref = "./DATA/rawdata/mono/mono.tok" # or whatever path after applying subword tokenization, w/o the suffix (.zh/.en)
if binpath.exists():
    print(binpath, "exists, will not overwrite!")
else:
    !python -m fairseq_cli.preprocess\
        --source-lang 'zh'\
        --target-lang 'en'\
        --trainpref {monopref}\
        --destdir {binpath}\
        --srcdict {src_dict_file}\
        --tgtdict {tgt_dict_file}\
        --workers 2

/content/DATA/rawdata/mono/mono.tok.en exists. skipping spm_encode.
2021-04-29 10:31:27 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='DATA/data-bin/synthetic', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='zh', srcdict='./DATA/data-bin/ted2020/dict.en.txt', suppress_crashes=False, target_lang='en', task='translation', tensorboard_logdir=None, t

In [None]:
# 這裡用剛剛準備的檔案合併原先 ted2020 來生成最終 back-translation 的資料
!cp -r ./DATA/data-bin/ted2020/ ./DATA/data-bin/ted2020_with_mono/

!cp ./DATA/data-bin/synthetic/train.zh-en.zh.bin ./DATA/data-bin/ted2020_with_mono/train1.en-zh.zh.bin
!cp ./DATA/data-bin/synthetic/train.zh-en.zh.idx ./DATA/data-bin/ted2020_with_mono/train1.en-zh.zh.idx
!cp ./DATA/data-bin/synthetic/train.zh-en.en.bin ./DATA/data-bin/ted2020_with_mono/train1.en-zh.en.bin
!cp ./DATA/data-bin/synthetic/train.zh-en.en.idx ./DATA/data-bin/ted2020_with_mono/train1.en-zh.en.idx

In [None]:
# Check that the merged dataset directory contains both the original and the train1 shard files.
! ls ./DATA/data-bin/ted2020_with_mono

dict.en.txt	   preprocess.log	train1.en-zh.en.idx  train.en-zh.zh.idx
dict.zh.txt	   test.en-zh.en.bin	train1.en-zh.zh.bin  valid.en-zh.en.bin
mono.zh-en.en.bin  test.en-zh.en.idx	train1.en-zh.zh.idx  valid.en-zh.en.idx
mono.zh-en.en.idx  test.en-zh.zh.bin	train.en-zh.en.bin   valid.en-zh.zh.bin
mono.zh-en.zh.bin  test.en-zh.zh.idx	train.en-zh.en.idx   valid.en-zh.zh.idx
mono.zh-en.zh.idx  train1.en-zh.en.bin	train.en-zh.zh.bin


### TODO: 重新訓練

當已經產生新的資料集

1. 將實驗的參數設定表(config)中的datadir改為新的資料集("./DATA/data-bin/ted2020_with_mono")
2. 將實驗的參數設定表(config)中的source_lang與target_lang設定還原("en", "zh")
3. 將實驗的參數設定表(config)中的savedir更改(ex. "./checkpoints/rnn-bt")
4. 重新訓練

In [None]:
# Point the run at the merged dataset, restore the en -> zh direction and use
# a separate checkpoint directory for the final model.
config.datadir = "./DATA/data-bin/ted2020_with_mono"
config.source_lang = "en"
config.target_lang = "zh"
config.savedir = "./checkpoints/rnn-final"
print(config)

Namespace(accum_steps=2, beam=5, clip_norm=1.0, datadir='./DATA/data-bin/ted2020_with_mono', keep_last_epochs=5, lr_factor=2.0, lr_warmup=4000, max_epoch=40, max_len_a=1.2, max_len_b=10, max_tokens=8192, num_workers=2, post_process='sentencepiece', resume=None, savedir='./checkpoints/rnn-final', source_lang='en', start_epoch=1, target_lang='zh', use_wandb=False)


In [None]:
# --- Task: merged dataset, direction taken from the current config ---
task_cfg = TranslationConfig(
    data=config.datadir,
    source_lang=config.source_lang,
    target_lang=config.target_lang,
    train_subset="train",
    dataset_impl="mmap",
    required_seq_len_multiple=8,
    upsample_primary=1,
)
task = TranslationTask.setup_task(task_cfg)

# --- Data: combine=True is for the case where back-translation data is present ---
logger.info("loading data for epoch 1")
task.load_dataset(split="train", epoch=1, combine=True)
task.load_dataset(split="valid", epoch=1)

# --- Model ---
add_transformer_args(arch_args)
if config.use_wandb:
    wandb.config.update(vars(arch_args))
model = build_model(arch_args, task).to(device)

# --- Loss: label smoothing, padding positions are ignored ---
criterion = LabelSmoothedCrossEntropyCriterion(
    smoothing=0.1,
    ignore_index=task.target_dictionary.pad(),
).to(device)

# --- Decoding ---
sequence_generator = task.build_generator([model], config)

# --- Optimizer: AdamW starts at lr=0; NoamOpt sets the rate on every step ---
base_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.0001,
)
optimizer = NoamOpt(
    model_size=arch_args.encoder_embed_dim,
    factor=config.lr_factor,
    warmup=config.lr_warmup,
    optimizer=base_optimizer,
)


2021-04-29 10:33:04 | INFO | fairseq.tasks.translation | [en] dictionary: 7984 types
2021-04-29 10:33:04 | INFO | fairseq.tasks.translation | [zh] dictionary: 7984 types
2021-04-29 10:33:04 | INFO | hw5.seq2seq | loading data for epoch 1
2021-04-29 10:33:04 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020_with_mono/train.en-zh.en
2021-04-29 10:33:04 | INFO | fairseq.data.data_utils | loaded 390,041 examples from: ./DATA/data-bin/ted2020_with_mono/train.en-zh.zh
2021-04-29 10:33:04 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020_with_mono train en-zh 390041 examples
2021-04-29 10:33:04 | INFO | fairseq.data.data_utils | loaded 781,713 examples from: ./DATA/data-bin/ted2020_with_mono/train1.en-zh.en
2021-04-29 10:33:04 | INFO | fairseq.data.data_utils | loaded 781,713 examples from: ./DATA/data-bin/ted2020_with_mono/train1.en-zh.zh
2021-04-29 10:33:04 | INFO | fairseq.tasks.translation | ./DATA/data-bin/ted2020_with_mono train1 en-zh 7

In [None]:
# Retrain on the merged dataset: build the iterator over the "train" split
# starting at config.start_epoch, then try to resume from config.resume.
epoch_itr = load_data_iterator(task, "train", config.start_epoch, config.max_tokens, config.num_workers)
try_load_checkpoint(model, optimizer, name=config.resume)
# Keep going until the next epoch index would exceed config.max_epoch.
while epoch_itr.next_epoch_idx <= config.max_epoch:
    # train for one epoch
    train_one_epoch(epoch_itr, model, task, criterion, optimizer, config.accum_steps)
    # validate after every epoch; the helper's name indicates it also saves a checkpoint
    stats = validate_and_save(model, task, criterion, optimizer, epoch=epoch_itr.epoch)
    logger.info("end of epoch {}".format(epoch_itr.epoch))    
    # a fresh iterator is created for each following epoch
    epoch_itr = load_data_iterator(task, "train", epoch_itr.next_epoch_idx, config.max_tokens, config.num_workers)

2021-04-29 10:33:06 | INFO | hw5.seq2seq | loaded checkpoint checkpoints/rnn-final/checkpoint_last.pt: step=60108 loss=2.9752097129821777 bleu=29.37687676994526


HBox(children=(FloatProgress(value=0.0, description='train epoch 1', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 11:03:07 | INFO | hw5.seq2seq | training loss: 3.4956
2021-04-29 11:03:07 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 11:03:50 | INFO | hw5.seq2seq | example source: this is better than people , again .
2021-04-29 11:03:50 | INFO | hw5.seq2seq | example hypothesis: 這還是比人好 , 再說一次 ,
2021-04-29 11:03:50 | INFO | hw5.seq2seq | example reference: 這再次證明它比人類優秀 。
2021-04-29 11:03:50 | INFO | hw5.seq2seq | validation loss:	2.9497
2021-04-29 11:03:50 | INFO | hw5.seq2seq | BLEU = 29.05 60.3/36.2/23.1/15.5 (BP = 0.978 ratio = 0.978 hyp_len = 108265 ref_len = 110726)
2021-04-29 11:03:51 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint1.pt
2021-04-29 11:03:52 | INFO | hw5.seq2seq | end of epoch 1


HBox(children=(FloatProgress(value=0.0, description='train epoch 2', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 11:33:52 | INFO | hw5.seq2seq | training loss: 3.3972
2021-04-29 11:33:52 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 11:34:35 | INFO | hw5.seq2seq | example source: that's all i try to do .
2021-04-29 11:34:35 | INFO | hw5.seq2seq | example hypothesis: 這就是我要做的 。
2021-04-29 11:34:35 | INFO | hw5.seq2seq | example reference: 我試著做的就只有這樣 。
2021-04-29 11:34:35 | INFO | hw5.seq2seq | validation loss:	2.9563
2021-04-29 11:34:35 | INFO | hw5.seq2seq | BLEU = 28.97 60.3/36.1/22.8/15.3 (BP = 0.981 ratio = 0.981 hyp_len = 108631 ref_len = 110726)
2021-04-29 11:34:36 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint2.pt
2021-04-29 11:34:36 | INFO | hw5.seq2seq | end of epoch 2


HBox(children=(FloatProgress(value=0.0, description='train epoch 3', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 12:04:41 | INFO | hw5.seq2seq | training loss: 3.3603
2021-04-29 12:04:41 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 12:05:26 | INFO | hw5.seq2seq | example source: and social media platforms supercharge that tendency , by allowing us to instantly and widely share information that accords with our viewpoints .
2021-04-29 12:05:26 | INFO | hw5.seq2seq | example hypothesis: 社交媒體平台超級充電 , 讓我們能即時、廣泛地分享資訊 , 運用我們的觀點 。
2021-04-29 12:05:26 | INFO | hw5.seq2seq | example reference: 社群媒體平台會強化你偏好的傾向 , 因為那些平台讓我們能夠迅速、廣泛地分享和我們觀點相同的資訊 。
2021-04-29 12:05:26 | INFO | hw5.seq2seq | validation loss:	2.9510
2021-04-29 12:05:26 | INFO | hw5.seq2seq | BLEU = 28.74 60.6/36.2/23.1/15.5 (BP = 0.966 ratio = 0.966 hyp_len = 106995 ref_len = 110726)
2021-04-29 12:05:27 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint3.pt
2021-04-29 12:05:27 | INFO | hw5.seq2seq | end of epoch 3


HBox(children=(FloatProgress(value=0.0, description='train epoch 4', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 12:35:47 | INFO | hw5.seq2seq | training loss: 3.3339
2021-04-29 12:35:47 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 12:36:34 | INFO | hw5.seq2seq | example source: and the thing about john d . is that he went into this chaotic wildeast of oil industry , and he rationalized it into a vertically integrated company , a multinational .
2021-04-29 12:36:34 | INFO | hw5.seq2seq | example hypothesis: 約翰戴維的特點是 , 他進入了石油工業的混亂東方 , 他將其合成了垂直整合的公司 , 一間跨國公司 。
2021-04-29 12:36:34 | INFO | hw5.seq2seq | example reference: 和他有關的的事情是...他走進這未開發的東部石油工業 , 他合理化它垂直整合成一個公司 , 一家跨國公司 。
2021-04-29 12:36:34 | INFO | hw5.seq2seq | validation loss:	2.9556
2021-04-29 12:36:34 | INFO | hw5.seq2seq | BLEU = 28.82 59.4/35.4/22.4/14.9 (BP = 0.996 ratio = 0.996 hyp_len = 110320 ref_len = 110726)
2021-04-29 12:36:35 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint4.pt
2021-04-29 12:36:35 | INFO | hw5.seq2seq | end of epoch 4


HBox(children=(FloatProgress(value=0.0, description='train epoch 5', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 13:06:54 | INFO | hw5.seq2seq | training loss: 3.3144
2021-04-29 13:06:54 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 13:07:39 | INFO | hw5.seq2seq | example source: or at the very least , we would be able to reallocate our staff time to pursuits that better fit those missions we talked about .
2021-04-29 13:07:39 | INFO | hw5.seq2seq | example hypothesis: 或者至少 , 我們能夠重新調整員工的時間 , 去追求適合我們談論的使命 。
2021-04-29 13:07:39 | INFO | hw5.seq2seq | example reference: 或 , 至少 , 我們可以把員工的時間重新分配 , 讓他們做的事能更符合我們所談的使命 。
2021-04-29 13:07:39 | INFO | hw5.seq2seq | validation loss:	2.9600
2021-04-29 13:07:39 | INFO | hw5.seq2seq | BLEU = 28.64 60.2/35.9/22.8/15.1 (BP = 0.975 ratio = 0.975 hyp_len = 107949 ref_len = 110726)
2021-04-29 13:07:40 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint5.pt
2021-04-29 13:07:40 | INFO | hw5.seq2seq | end of epoch 5


HBox(children=(FloatProgress(value=0.0, description='train epoch 6', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 13:37:54 | INFO | hw5.seq2seq | training loss: 3.2957
2021-04-29 13:37:54 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 13:38:36 | INFO | hw5.seq2seq | example source: besides , my parents and my friends' parents seemed to be doing just fine driving taxis and working as janitors .
2021-04-29 13:38:36 | INFO | hw5.seq2seq | example hypothesis: 此外 , 我父母和我朋友的父母似乎只是在幫他們計程車和清潔人員的工作 。
2021-04-29 13:38:36 | INFO | hw5.seq2seq | example reference: 此外 , 我的父母和朋友們的父母只靠開計程車和當清潔工好像也過得不錯
2021-04-29 13:38:36 | INFO | hw5.seq2seq | validation loss:	2.9555
2021-04-29 13:38:36 | INFO | hw5.seq2seq | BLEU = 28.30 60.7/36.2/22.9/15.3 (BP = 0.956 ratio = 0.957 hyp_len = 105952 ref_len = 110726)
2021-04-29 13:38:37 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint6.pt
2021-04-29 13:38:37 | INFO | hw5.seq2seq | end of epoch 6


HBox(children=(FloatProgress(value=0.0, description='train epoch 7', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 14:08:47 | INFO | hw5.seq2seq | training loss: 3.2815
2021-04-29 14:08:47 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 14:09:32 | INFO | hw5.seq2seq | example source: the reality of polio today is something very different .
2021-04-29 14:09:32 | INFO | hw5.seq2seq | example hypothesis: 今天的小兒麻痺症的現實是很不一樣的
2021-04-29 14:09:32 | INFO | hw5.seq2seq | example reference: 事實上今天的小兒麻痺症是非常不一樣的
2021-04-29 14:09:32 | INFO | hw5.seq2seq | validation loss:	2.9631
2021-04-29 14:09:32 | INFO | hw5.seq2seq | BLEU = 28.65 59.7/35.5/22.4/14.9 (BP = 0.987 ratio = 0.987 hyp_len = 109339 ref_len = 110726)
2021-04-29 14:09:33 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint7.pt
2021-04-29 14:09:33 | INFO | hw5.seq2seq | end of epoch 7


HBox(children=(FloatProgress(value=0.0, description='train epoch 8', max=1699.0, style=ProgressStyle(descripti…

2021-04-29 14:39:55 | INFO | hw5.seq2seq | training loss: 3.2664
2021-04-29 14:39:55 | INFO | hw5.seq2seq | begin validation


HBox(children=(FloatProgress(value=0.0, description='validation', max=22.0, style=ProgressStyle(description_wi…

2021-04-29 14:40:38 | INFO | hw5.seq2seq | example source: so picture bell sitting in the outpatient department , students all around him , patients signing up in the emergency room and being registered and being brought in .
2021-04-29 14:40:38 | INFO | hw5.seq2seq | example hypothesis: 所以 , 想像貝爾坐在醫院的病房裡 , 他身邊的學生 , 病人在急診室報名 , 註冊並被帶進來 。
2021-04-29 14:40:38 | INFO | hw5.seq2seq | example reference: 想像貝爾坐在門診部學生們包圍住他病人在急診室填寫資料然後被登記並領入
2021-04-29 14:40:38 | INFO | hw5.seq2seq | validation loss:	2.9670
2021-04-29 14:40:38 | INFO | hw5.seq2seq | BLEU = 28.23 61.0/36.5/23.0/15.4 (BP = 0.947 ratio = 0.949 hyp_len = 105028 ref_len = 110726)
2021-04-29 14:40:39 | INFO | hw5.seq2seq | saved epoch checkpoint: /content/checkpoints/rnn-final/checkpoint8.pt
2021-04-29 14:40:39 | INFO | hw5.seq2seq | end of epoch 8


HBox(children=(FloatProgress(value=0.0, description='train epoch 9', max=1699.0, style=ProgressStyle(descripti…

KeyboardInterrupt: ignored

# References

1. <a name=ott2019fairseq></a>Ott, M., Edunov, S., Baevski, A., Fan, A., Gross, S., Ng, N., ... & Auli, M. (2019, June). fairseq: A Fast, Extensible Toolkit for Sequence Modeling. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations) (pp. 48-53).
2. <a name=vaswani2017></a>Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017, December). Attention is all you need. In Proceedings of the 31st International Conference on Neural Information Processing Systems (pp. 6000-6010).
3. <a name=reimers-2020-multilingual-sentence-bert></a>Reimers, N., & Gurevych, I. (2020, November). Making Monolingual Sentence Embeddings Multilingual Using Knowledge Distillation. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) (pp. 4512-4525).
4. <a name=tiedemann2012parallel></a>Tiedemann, J. (2012, May). Parallel Data, Tools and Interfaces in OPUS. In LREC (Vol. 2012, pp. 2214-2218).
5. <a name=kudo-richardson-2018-sentencepiece></a>Kudo, T., & Richardson, J. (2018, November). SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations (pp. 66-71).
6. <a name=sennrich-etal-2016-improving></a>Sennrich, R., Haddow, B., & Birch, A. (2016, August). Improving Neural Machine Translation Models with Monolingual Data. In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 86-96).
7. <a name=edunov-etal-2018-understanding></a>Edunov, S., Ott, M., Auli, M., & Grangier, D. (2018). Understanding Back-Translation at Scale. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (pp. 489-500).
8. https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus
9. https://ithelp.ithome.com.tw/articles/10233122
10. https://nlp.seas.harvard.edu/2018/04/03/attention.html

In [None]:
!head {data_prefix+'.'+src_lang} -n 5
!head {data_prefix+'.'+tgt_lang} -n 5

!head {data_prefix+'.clean.'+src_lang} -n 5
!head {data_prefix+'.clean.'+tgt_lang} -n 5

!head {data_dir+'/'+dataset_name+'/train.'+src_lang} -n 5
!head {data_dir+'/'+dataset_name+'/train.'+tgt_lang} -n 5

# Look at one example of the "valid" split: first the raw sample, then its
# source and target sides rendered as strings by the task dictionaries.
sample = task.dataset("valid")[1]
pprint.pprint(sample)

# Arguments inside parentheses may span lines, so no backslash is required.
pprint.pprint(
    "Source: " + task.source_dictionary.string(sample['source'], config.post_process)
)
pprint.pprint(
    "Target: " + task.target_dictionary.string(sample['target'], config.post_process)
)



# Build a demo iterator over the "valid" split with a tiny token budget,
# pull the first (shuffled) batch and display it as the cell output.
demo_epoch_obj = load_data_iterator(
    task,
    "valid",
    epoch=1,
    max_tokens=20,
    num_workers=1,
    cached=False,
)
demo_iter = demo_epoch_obj.next_epoch_itr(shuffle=True)
sample = next(demo_iter)
sample

Thank you so much, Chris.
And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.
And I say that sincerely, partly because  I need that.
Put yourselves in my position.
非常謝謝你，克里斯。能有這個機會第二度踏上這個演講台
真是一大榮幸。我非常感激。
這個研討會給我留下了極為深刻的印象，我想感謝大家 對我之前演講的好評。
我是由衷的想這麼說，有部份原因是因為 —— 我真的有需要!
請你們設身處地為我想一想！
Thank you so much , Chris .
And it's truly a great honor to have the opportunity to come to this stage twice ; I'm extremely grateful .
I have been blown away by this conference , and I want to thank all of you for the many nice comments about what I had to say the other night .
And I say that sincerely , partly because I need that .
Put yourselves in my position .
非常謝謝你 , 克里斯 。 能有這個機會第二度踏上這個演講台
真是一大榮幸 。 我非常感激 。
這個研討會給我留下了極為深刻的印象 , 我想感謝大家對我之前演講的好評 。
我是由衷的想這麼說 , 有部份原因是因為我真的有需要 !
請你們設身處地為我想一想 !
▁thank ▁you ▁so ▁much

{'id': tensor([2252]),
 'net_input': {'prev_output_tokens': tensor([[   2,  193,   63,  122,   68, 2957,    4,  149,  649,   27,  387,  270,
             10,    1,    1,    1]]),
  'src_lengths': tensor([17]),
  'src_tokens': tensor([[  1,   1,   1,   1,   1,   1,   1,  11,  45, 241, 338,   5, 646,  37,
           442, 400,   7,  18,  14,   6,   8, 940,   7,   2]])},
 'nsentences': 1,
 'ntokens': 13,
 'target': tensor([[ 193,   63,  122,   68, 2957,    4,  149,  649,   27,  387,  270,   10,
             2,    1,    1,    1]])}

In [None]:
# 把幾個 checkpoint 平均起來可以達到 ensemble 的效果
checkdir=config.savedir
!python ./fairseq/scripts/average_checkpoints.py \
--inputs {checkdir} \
--num-epoch-checkpoints 5 \
--output {checkdir}/avg_last_5_checkpoint.pt

# Checkpoint files that can be loaded here:
#   checkpoint_last.pt       : file from the most recent validation
#   checkpoint_best.pt       : file with the highest validation BLEU
#   avg_last_5_checkpoint.pt : average of the last 5 checkpoint files
# The name has to be one string literal; a quote in the middle of the file
# name makes the whole cell a SyntaxError.
try_load_checkpoint(model, name="checkpoint30.pt")
validate(model, task, criterion, log_to_wandb=False)
None  # final expression is None, so the cell displays no return value

def generate_prediction(model, task, split="test", outfile="./prediction.txt"):
    """Run inference over ``split`` and write one hypothesis per line.

    The split is loaded through ``task``, iterated without shuffling, and each
    batch is passed to ``inference_step``. Hypotheses are reordered by sample
    id before being written, so the output does not depend on batch order.

    Args:
        model: Model handed to ``inference_step``; switched to eval mode here.
        task: Task object used to load ``split`` and build the data iterator.
        split: Name of the dataset split to load. Defaults to ``"test"``.
        outfile: Path of the text file to write. Defaults to
            ``"./prediction.txt"``. The file is overwritten and encoded as
            UTF-8 regardless of the platform's default encoding.

    Returns:
        None. The predictions are only written to ``outfile``.
    """
    task.load_dataset(split=split, epoch=1)
    itr = load_data_iterator(task, split, 1, config.max_tokens, config.num_workers).next_epoch_itr(shuffle=False)

    idxs = []
    hyps = []

    model.eval()
    progress = tqdm.tqdm(itr, desc="prediction")
    with torch.no_grad():
        for sample in progress:
            sample = utils.move_to_cuda(sample, device=device)

            # Run inference; only the hypotheses are needed for the output file.
            _, h, _ = inference_step(sample, model)

            hyps.extend(h)
            # Store ids as plain Python ints so sorting needs no tensor comparisons.
            idxs.extend(sample['id'].tolist())

    # Restore the order used at preprocessing time by sorting on the sample id
    # only; the hypothesis text never takes part in the comparison.
    hyps = [hyp for _, hyp in sorted(zip(idxs, hyps), key=lambda pair: pair[0])]

    # Explicit UTF-8: the hypotheses are non-ASCII text and the default
    # encoding of open() depends on the platform locale.
    with open(outfile, "w", encoding="utf-8") as f:
        for h in hyps:
            f.write(h + "\n")

# Predict on the "test" split and write the hypotheses to ./prediction.txt
# (both values are the function's defaults, spelled out for the reader).
generate_prediction(model, task, split="test", outfile="./prediction.txt")

SyntaxError: ignored