In [1]:
import subprocess

def run_command(command):
    """Run ``command`` as a subprocess and collect its standard output.

    The command string is tokenised with ``shlex.split`` so that quoted
    arguments (for example ``--adam-betas '(0.9, 0.98)'``) stay together as a
    single argument instead of being broken apart on every space.

    Args:
        command: Command line to execute, given as one string. It is not run
            through a shell, so pipes, redirects and globbing are unavailable.

    Returns:
        The ``(stdout, stderr)`` tuple from ``Popen.communicate()``. ``stdout``
        is ``bytes``; ``stderr`` is ``None`` because only stdout is piped.
    """
    import shlex  # local import keeps this cell self-contained

    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
    return process.communicate()

## Install libraries

In [2]:
# Install fairseq pinned to version 0.7.1 (-q keeps pip's output quiet).
!pip install -q fairseq==0.7.1

### Install Apex

In [None]:
# Remove any previous Apex checkout so the clone below starts clean.
# -f makes this a no-op (instead of an error) when the directory is absent,
# e.g. on a fresh kernel.
!rm -rf apex

In [None]:
# Fetch the NVIDIA Apex sources into ./apex.
# NOTE(review): no tag or commit is pinned, so the default branch at clone time
# is what gets built — confirm it is compatible with the pinned fairseq.
!git clone https://github.com/NVIDIA/apex.git

In [None]:
# Enter the Apex checkout (explicit magic; does not depend on automagic).
%cd apex

In [None]:
%%time
# Build and install Apex from the current directory, passing --cpp_ext and
# --cuda_ext to its setup script; --no-cache-dir disables pip's cache.
!pip3 install -v -q --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [None]:
# Step back out to the parent directory (explicit magic form).
%cd ../

### Install MASS

In [39]:
# Clone the MASS fork into ./MASS. The recorded output shows roughly 420 MiB
# being received, so expect this step to take a while.
!git clone https://github.com/leloykun/MASS.git

Cloning into 'MASS'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 1221 (delta 25), reused 44 (delta 13), pack-reused 1156[K
Receiving objects: 100% (1221/1221), 419.91 MiB | 29.40 MiB/s, done.
Resolving deltas: 100% (538/538), done.
Checking out files: 100% (185/185), done.


In [40]:
# Work from the supervised-NMT subproject; later relative paths (mass, models,
# data/...) are resolved against this directory.
%cd MASS/MASS-supNMT

/kaggle/working/MASS/MASS-supNMT


In [41]:
# Show what is in the current working directory.
%ls

README.md              [0m[01;34mdata[0m/                  [01;34mmass[0m/                translate.sh
archi_mass_sup.png     ft_mass_enzh.sh        run_mass_enzh.sh
archi_mass_sup_md.png  generate_enzh_data.sh  run_mass_enzh_pc.sh


In [49]:
# Update the existing checkout with the latest commits from its remote.
!git pull

remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 5 (delta 3), reused 5 (delta 3), pack-reused 0[K
Unpacking objects: 100% (5/5), done.
From https://github.com/leloykun/MASS
   e9f6b75..aa4e00d  master     -> origin/master
Updating e9f6b75..aa4e00d
Fast-forward
 title-translation/preprocess_dataset.sh               |  18 [32m+++++++++[m[31m---------[m
 .../processed/{test.zh-en.en => test.en-zh.en}        |   0
 .../{test.zh-en.en.bin => test.en-zh.en.bin}          | Bin
 .../{test.zh-en.en.idx => test.en-zh.en.idx}          | Bin
 .../{test.zh-en.en.pth => test.en-zh.en.pth}          | Bin
 .../processed/{test.zh-en.zh => test.en-zh.zh}        |   0
 .../{test.zh-en.zh.bin => test.en-zh.zh.bin}          | Bin
 .../{test.zh-en.zh.idx => test.en-zh.zh.idx}          | Bin
 .../{test.zh-en.zh.pth => test.en-zh.zh.pth}          | Bin
 9 files changed, 9 insertions(+), 9 deletions(-)
 

In [50]:
# Inspect the preprocessed title-translation files.
%ls ../title-translation/processed

dict.en.txt    test.en-zh.en.bin  test.en-zh.zh      test.en-zh.zh.pth
dict.zh.txt    test.en-zh.en.idx  test.en-zh.zh.bin
test.en-zh.en  test.en-zh.en.pth  test.en-zh.zh.idx


### Load models

In [52]:
# Create the checkpoint directory; -p keeps the cell re-runnable when the
# directory already exists.
!mkdir -p models

In [53]:
# Download checkpoint_kaggle_pc.pt from GCS and save it locally as
# models/checkpoint_best.pt.
# NOTE(review): the recorded output below names checkpoint_kaggle_sup_ft.pt,
# so it presumably comes from an earlier version of this command — re-run to
# refresh it.
!gsutil cp -r gs://shopee-title-translation/mass/models/checkpoint_kaggle_pc.pt models/checkpoint_best.pt

Copying gs://shopee-title-translation/mass/models/checkpoint_kaggle_sup_ft.pt...
/ [1 files][  1.1 GiB/  1.1 GiB]   89.4 MiB/s                                   
Operation completed over 1 objects/1.1 GiB.                                      


In [54]:
# Confirm the checkpoint landed in models/.
%ls models

checkpoint_best.pt


## Train MASS

### Pretraining

In [None]:
# Pre-training (MASS steps en-en/zh-zh plus MEMT steps en-zh/zh-en).
# Each loop iteration raises --max-epoch by one, uploads
# models/checkpoint_best.pt to GCS, deletes the local *.pt files and downloads
# the uploaded object back as the restore point for the next iteration.
data_dir = "data/processed"
user_dir = "mass"
save_dir = "models"
model = "checkpoint_best.pt"  # value passed to --restore-file

seed=0
max_tokens=2048 # for 16GB GPUs
update_freq=1
dropout=0.1
attention_heads=4
embed_dim=512
ffn_embed_dim=1024
encoder_layers=6
decoder_layers=4
word_mask=0.3

# fairseq treats --max-epoch 0 as "no epoch limit", so the loop must start at 1.
# Starting at 0 would make the first iteration train until --max-update is hit
# and never reach the checkpoint upload below.
start_epoch = 1
for epoch in range(start_epoch, 10+1):
    !echo "start epoch $epoch" && \
    fairseq-train $data_dir \
        --user-dir $user_dir \
        --task xmasked_seq2seq \
        --source-langs en,zh \
        --target-langs en,zh \
        --langs en,zh \
        --arch xtransformer \
        --mass_steps en-en,zh-zh \
        --memt_steps en-zh,zh-en \
        --save-dir $save_dir \
        --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
        --lr-scheduler inverse_sqrt --lr 0.00005 --min-lr 1e-09 \
        --criterion label_smoothed_cross_entropy \
        --lm-bias --lazy-load --seed $seed \
        --log-format json \
        --max-tokens $max_tokens --update-freq $update_freq \
        --encoder-normalize-before  --decoder-normalize-before \
        --dropout $dropout --relu-dropout $dropout --attention-dropout $dropout \
        --decoder-attention-heads $attention_heads --encoder-attention-heads $attention_heads \
        --decoder-embed-dim $embed_dim --encoder-embed-dim $embed_dim \
        --decoder-ffn-embed-dim $ffn_embed_dim --encoder-ffn-embed-dim $ffn_embed_dim \
        --encoder-layers $encoder_layers --decoder-layers $decoder_layers \
        --max-update 100000000 --max-epoch $epoch \
        --keep-last-epochs 1 --log-interval 100 \
        --share-decoder-input-output-embed \
        --valid-lang-pairs en-zh \
        --word_mask $word_mask \
        --ddp-backend=no_c10d \
        --restore-file $model \
        --skip-invalid-size-inputs-valid-test && \
    gsutil cp models/checkpoint_best.pt gs://shopee-title-translation/mass/models/checkpoint_kaggle_pc.pt && \
    rm models/*.pt && \
    gsutil cp gs://shopee-title-translation/mass/models/checkpoint_kaggle_pc.pt models/checkpoint_best.pt

### Fine-tuning

In [None]:
# Fine-tuning on the supervised zh-en translation step only (--mt_steps zh-en).
# Each loop iteration raises --max-epoch by one, uploads
# models/checkpoint_best.pt to GCS, deletes the local *.pt files and downloads
# the uploaded object back as the restore point for the next iteration.
# NOTE(review): --valid-lang-pairs is en-zh while the training step is zh-en —
# confirm this is the intended validation direction.
# NOTE(review): the upload target (checkpoint_kaggle_pc.pt) is the same GCS
# object the pre-training cell writes — confirm the overwrite is intended.
data_dir = "data/processed"
user_dir = "mass"
save_dir = "models"
model = "checkpoint_best.pt"

seed=0
max_tokens=2048 # for 16GB GPUs
update_freq=1
dropout=0.1
attention_heads=4
embed_dim=512
ffn_embed_dim=1024
encoder_layers=6
decoder_layers=4
word_mask=0.3

# The loop covers --max-epoch values 8, 9 and 10.
start_epoch = 8
for epoch in range(start_epoch, 11):
    !echo "start epoch $epoch" && \
    fairseq-train $data_dir \
        --user-dir $user_dir \
        --task xmasked_seq2seq \
        --source-langs en,zh \
        --target-langs en,zh \
        --langs en,zh \
        --arch xtransformer \
        --mt_steps zh-en \
        --save-dir $save_dir \
        --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
        --lr-scheduler inverse_sqrt --lr 0.00005 --min-lr 1e-09 \
        --criterion label_smoothed_cross_entropy \
        --lm-bias --lazy-load --seed $seed \
        --log-format json \
        --max-tokens $max_tokens --update-freq $update_freq \
        --encoder-normalize-before  --decoder-normalize-before \
        --dropout $dropout --relu-dropout $dropout --attention-dropout $dropout \
        --decoder-attention-heads $attention_heads --encoder-attention-heads $attention_heads \
        --decoder-embed-dim $embed_dim --encoder-embed-dim $embed_dim \
        --decoder-ffn-embed-dim $ffn_embed_dim --encoder-ffn-embed-dim $ffn_embed_dim \
        --encoder-layers $encoder_layers --decoder-layers $decoder_layers \
        --max-update 100000000 --max-epoch $epoch \
        --keep-last-epochs 1 --log-interval 100 \
        --share-decoder-input-output-embed \
        --valid-lang-pairs en-zh \
        --word_mask $word_mask \
        --ddp-backend=no_c10d \
        --restore-file $model \
        --skip-invalid-size-inputs-valid-test && \
    gsutil cp models/checkpoint_best.pt gs://shopee-title-translation/mass/models/checkpoint_kaggle_pc.pt && \
    rm models/*.pt && \
    gsutil cp gs://shopee-title-translation/mass/models/checkpoint_kaggle_pc.pt models/checkpoint_best.pt

### Inference

In [11]:
# Create the results directory; -p also creates a missing parent and makes the
# cell safe to re-run when the directory already exists.
!mkdir -p data/results

In [31]:
# Check which preprocessed test files are available before generating.
%ls ../title-translation/processed

dict.en.txt        test.en-zh.zh.bin  test.zh-en.en.idx  test.zh-en.zh.idx
dict.zh.txt        test.en-zh.zh.idx  test.zh-en.en.pth  test.zh-en.zh.pth
test.en-zh.en.bin  test.zh-en.en      test.zh-en.zh
test.en-zh.en.idx  test.zh-en.en.bin  test.zh-en.zh.bin


In [90]:
# Translate the `test` subset from zh to en with beam width 8, with --remove-bpe
# applied to the generated text. The command's stdout is piped through `tee`,
# so the generation log is both displayed here and saved to preds.out, which
# the post-processing cells read.
model="models/checkpoint_best.pt"
data_dir="../title-translation/processed"
user_dir="mass"
results_dir="data/results"

!fairseq-generate $data_dir \
    --user-dir $user_dir \
    -s zh -t en \
    --langs zh,en \
    --source-langs zh --target-langs en \
    --mt_steps zh-en \
    --gen-subset test \
    --task xmasked_seq2seq \
    --path $model \
    --beam 8 --remove-bpe  \
    --results-path $results_dir | tee preds.out

Namespace(beam=8, cpu=False, criterion='cross_entropy', data='../title-translation/processed', dataset_impl='cached', diverse_beam_groups=-1, diverse_beam_strength=0.5, force_anneal=None, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', langs='zh,en', lazy_load=False, left_pad_source='True', left_pad_target='False', lenpen=1, lm_bias=False, log_format=None, log_interval=1000, lr_scheduler='fixed', lr_shrink=0.1, mass_steps='', match_source_len=False, max_len_a=0, max_len_b=200, max_sentences=None, max_source_positions=1024, max_target_positions=1024, max_tokens=12000, memory_efficient_fp16=False, memt_steps='', min_len=1, min_loss_scale=0.0001, model_overrides='{}', momentum=0.99, mt_steps='zh-en', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, optimizer='nag', path='models/checkpoint_best.pt', prefix_size=0, print_alignment=False, quiet=False, raw_text=

### Postprocess output

In [91]:
import pandas as pd
import numpy as np
import io

In [93]:
# Load the saved generation log line by line. The context manager closes the
# file handle as soon as the lines are read instead of leaving it open for the
# lifetime of the kernel.
with io.open("preds.out", mode="r", encoding="utf-8") as f:
    data = f.readlines()

In [94]:
# Collect [sample id, hypothesis text] pairs from the hypothesis lines of the
# generation log. Such lines look like "H-<id>\t<field>\t<text>"; the middle
# field (presumably the model score) is not used.
sentences = []
for line in data:
    # Match the full "H-" prefix so unrelated log lines that merely begin with
    # the letter H are not mistaken for hypotheses.
    if line.startswith('H-'):
        # maxsplit=2 keeps the hypothesis intact even if it contains a tab.
        tag, _unused, text = line.split('\t', 2)
        sentences.append([int(tag[2:]), text.strip()])

In [96]:
# Order the pairs by numeric sample id and convert to a 2-D array so the text
# column can be sliced out. sorted() with an int key leaves the input untouched
# and also works when this cell is re-run on the already-converted array; the
# previous in-place .sort() would then sort *within* each row and silently mix
# ids with text.
sentences = np.array(sorted(sentences, key=lambda pair: int(pair[0])))

In [98]:
# Column 1 of `sentences` holds the translated text; it becomes the single
# `translation_output` column of the submission frame.
df_submission = pd.DataFrame.from_dict(
    {'translation_output': sentences[:, 1]}
)
df_submission.head()

Unnamed: 0,translation_output
0,[ polarstar ] beautiful wool warm socks & quot...
1,sweet crystal ~ natural crystal five elements ...
2,pink crystal hexagonal sterling silver necklace
3,3m scotch vhb super strong double-sided tape-o...
4,exclusive lamp offer * 4 boxes


In [99]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
df_submission.to_csv('submission.csv', index=False)

In [None]:
# Upload the submission CSV to GCS under the name submission_pc.csv.
!gsutil cp submission.csv gs://shopee-title-translation/mass/submission_pc.csv