In [1]:
# Quietly (-q) install third-party packages for this notebook: sacrebleu is imported
# below, and 'moses' / 'subword_nmt' are named as tokenizer / BPE options at model load.
# NOTE(review): versions are unpinned, so a fresh run may resolve newer releases than
# the ones that produced the recorded outputs.
!pip install -q sentencepiece fastBPE sacremoses subword_nmt sacrebleu

In [2]:
import subprocess, re
import string
import pandas as pd
import numpy as np

import torch
import sacrebleu

## Dataset

In [3]:
def run_command(command):
    """Run an external command and capture its standard output.

    Args:
        command: Command line as a single string. It is tokenised with
            ``shlex.split`` so quoted arguments (for example paths that
            contain spaces) stay together. No shell is involved.

    Returns:
        The ``(stdout, stderr)`` tuple from ``Popen.communicate()``.
        ``stdout`` is ``bytes``; ``stderr`` is ``None`` because only stdout
        is piped.

    Note:
        The exit status is not inspected, so a failing command does not raise.
    """
    import shlex  # local import so this cell does not depend on the import cell

    # The context manager closes the pipe and reaps the child process on exit.
    with subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) as process:
        return process.communicate()

In [4]:
# Start from a clean local copy of the dataset. `gsutil rm` only operates on cloud
# URLs, so the local directory is removed with plain `rm -rf`, which also succeeds
# when the directory does not exist yet.
run_command("rm -rf product-translation-dataset")
# Copy the dataset directory from the bucket into the working directory.
run_command("gsutil cp -r gs://shopee-title-translation/product-translation-dataset/ .")
# Last expression of the cell: list the working directory to confirm the download.
run_command("ls")

(b'dev_tcn_to_en_beam=10.csv\ndev_tcn_to_en_beam=5.csv\nlight-conv-zh2en\nproduct-translation-dataset\nsample_data\n',
 None)

In [5]:
# Read the six dataset CSVs in one pass; the order of the names on the left
# matches the order of the paths below.
(
    df_train_tcn,
    df_train_en,
    df_test_tcn,
    df_dev_tcn,
    df_dev_en,
    df_all,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'product-translation-dataset/train_tcn.csv',
        'product-translation-dataset/train_en.csv',
        'product-translation-dataset/test_tcn.csv',
        'product-translation-dataset/dev_tcn.csv',
        'product-translation-dataset/dev_en.csv',
        'product-translation-dataset/all_clean.csv',
    )
)

## Model

In [6]:
# Show the entry-point names exposed by the pytorch/fairseq hub repository.
torch.hub.list('pytorch/fairseq')

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_master


['bart.base',
 'bart.large',
 'bart.large.cnn',
 'bart.large.mnli',
 'bart.large.xsum',
 'bpe',
 'camembert',
 'camembert-base',
 'camembert-base-ccnet',
 'camembert-base-ccnet-4gb',
 'camembert-base-oscar-4gb',
 'camembert-base-wikipedia-4gb',
 'camembert-large',
 'camembert.v0',
 'conv.stories',
 'conv.stories.pretrained',
 'conv.wmt14.en-de',
 'conv.wmt14.en-fr',
 'conv.wmt17.en-de',
 'data.stories',
 'dynamicconv.glu.wmt14.en-fr',
 'dynamicconv.glu.wmt16.en-de',
 'dynamicconv.glu.wmt17.en-de',
 'dynamicconv.glu.wmt17.zh-en',
 'dynamicconv.no_glu.iwslt14.de-en',
 'dynamicconv.no_glu.wmt16.en-de',
 'lightconv.glu.wmt14.en-fr',
 'lightconv.glu.wmt16.en-de',
 'lightconv.glu.wmt17.en-de',
 'lightconv.glu.wmt17.zh-en',
 'lightconv.no_glu.iwslt14.de-en',
 'lightconv.no_glu.wmt16.en-de',
 'roberta.base',
 'roberta.large',
 'roberta.large.mnli',
 'roberta.large.wsc',
 'tokenizer',
 'transformer.wmt14.en-fr',
 'transformer.wmt16.en-de',
 'transformer.wmt18.en-de',
 'transformer.wmt19.de-en',

In [7]:
# Hub entry-point names considered for translation; the load cell uses index 0.
# NOTE(review): the two bart.* entries carry no zh-en suffix, unlike the first two.
# Confirm they are really meant as zh->en candidates.
zh2en_model_candidates = ['lightconv.glu.wmt17.zh-en', 'dynamicconv.glu.wmt17.zh-en', 
                          'bart.base', 'bart.large.xsum']

In [8]:
%%capture
# Load the first candidate from the fairseq hub, requesting the 'moses' tokenizer
# and 'subword_nmt' BPE. The enclosing %%capture magic hides the load output.
zh2en = torch.hub.load('pytorch/fairseq', zh2en_model_candidates[0],
                       tokenizer='moses', bpe='subword_nmt')

In [9]:
%%capture
# Move the model to the GPU when one is available. On a CPU-only runtime the model
# stays on the CPU instead of the cell failing.
if torch.cuda.is_available():
    zh2en.cuda()
zh2en.eval()  # disable dropout

## Translation

In [10]:
# Whitelist consumed by cleaning_string(): ASCII letters, digits and punctuation,
# plus the symbols 【】◤◢《》.
ACCEPTABLE_CHARS = string.ascii_letters + string.digits + string.punctuation + "【】◤◢《》"
# Last expression of the cell: display the resulting string.
ACCEPTABLE_CHARS

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~【】◤◢《》'

In [11]:
def cleaning_string(my_string, to_lower=False):
    """Replace runs of non-whitelisted characters with a space and tidy whitespace.

    Args:
        my_string: Text to clean.
        to_lower: When true, lowercase the text before filtering.

    Returns:
        The text with every run of characters outside ``ACCEPTABLE_CHARS``
        (and the space character) replaced by one space, then with leading,
        trailing and repeated whitespace collapsed.
    """
    if to_lower:
        my_string = my_string.lower()
    # The whitelist contains regex metacharacters (`\`, `]`, `^`, `-`). Escape it
    # so each one is a literal member of the character class; unescaped, `\]` is
    # parsed as an escaped `]` and the backslash drops out of the whitelist.
    unacceptable_run = r"[^{} ]+".format(re.escape(ACCEPTABLE_CHARS))
    my_string = re.sub(unacceptable_run, ' ', my_string)
    # Collapse consecutive whitespace and strip both ends.
    return " ".join(my_string.split())

def clean_column(df, col):
  """Run ``cleaning_string`` over every value of ``df[col]``.

  The column is overwritten on the DataFrame that was passed in, and that
  same DataFrame object is returned.
  """
  cleaned_values = [cleaning_string(value) for value in df[col]]
  df[col] = cleaned_values
  return df

def translate_dataset(sentences, beam=5, verbose=True):
  """Translate ``sentences`` with the global ``zh2en`` model.

  Args:
    sentences: Source texts handed to ``zh2en.translate``.
    beam: Beam size forwarded to the model.
    verbose: Verbosity flag forwarded to the model.

  Returns:
    A DataFrame with a single ``translation_output`` column holding the
    values returned by ``zh2en.translate``.
  """
  return pd.DataFrame({
      'translation_output': zh2en.translate(sentences, beam=beam, verbose=verbose),
  })

In [12]:
%%time
# Beam size for this run; it is also embedded in the output file name.
BEAM = 15

# Translate the dev-set 'text' column, write the result to a local CSV, then copy
# that CSV to the gs://shopee-title-translation bucket.
df_dev_tcn_to_en = translate_dataset(df_dev_tcn['text'], beam=BEAM)
df_dev_tcn_to_en.to_csv('dev_tcn_to_en_beam={}.csv'.format(BEAM), index=False)
run_command("gsutil cp dev_tcn_to_en_beam={}.csv gs://shopee-title-translation".format(BEAM))

CPU times: user 1min 39s, sys: 34.5 s, total: 2min 13s
Wall time: 2min 15s


In [None]:
%%time
# Beam size for this run; it is also embedded in the output file name.
# Note that this rebinds the notebook-global BEAM set by the dev cell.
BEAM = 10

# Translate the test-set 'text' column, write the result to a local CSV, then copy
# that CSV to the gs://shopee-title-translation bucket.
df_test_tcn_to_en = translate_dataset(df_test_tcn['text'], beam=BEAM)
df_test_tcn_to_en.to_csv('test_tcn_to_en_beam={}.csv'.format(BEAM), index=False)
run_command("gsutil cp test_tcn_to_en_beam={}.csv gs://shopee-title-translation".format(BEAM))

CPU times: user 12min 34s, sys: 3min 41s, total: 16min 16s
Wall time: 16min 18s


In [None]:
%%time
# Beam size for this run; it is also embedded in the output file name.
BEAM = 5

# Translate the training titles, write the result to a local CSV, then copy that CSV
# to the gs://shopee-title-translation bucket.
# NOTE(review): this cell reads 'product_title' while the dev/test cells read 'text'.
# Confirm against the train_tcn.csv schema.
# NOTE(review): the recorded output is a KeyboardInterrupt, so this run apparently
# never finished and the CSV may not exist.
df_train_tcn_to_en = translate_dataset(df_train_tcn['product_title'], beam=BEAM)
df_train_tcn_to_en.to_csv('train_tcn_to_en_beam={}.csv'.format(BEAM), index=False)
run_command("gsutil cp train_tcn_to_en_beam={}.csv gs://shopee-title-translation".format(BEAM))

KeyboardInterrupt: ignored

## Test

In [14]:
def calc_bleu(df, col='translation_output', lowercase=True):
  """Corpus-level BLEU of ``df[col]`` against the dev-set references.

  Args:
    df: DataFrame holding the system translations.
    col: Name of the column in ``df`` that contains the translations.
    lowercase: Forwarded to ``sacrebleu.corpus_bleu``.

  Returns:
    The ``score`` attribute of the sacrebleu result.

  Note:
    References are always read from the global
    ``df_dev_en['translation_output']``.
  """
  # Hand sacrebleu plain Python lists of strings. Newer releases validate the
  # argument types and do not accept NumPy arrays or pandas Series.
  hypotheses = df[col].tolist()
  references = [df_dev_en['translation_output'].tolist()]  # one reference stream
  bleu = sacrebleu.corpus_bleu(hypotheses, references, lowercase=lowercase)
  return bleu.score

In [16]:
%%time
# Load the beam=15 dev translations, clean the output column, and print the
# case-sensitive BLEU score.
df_dev_preds = clean_column(pd.read_csv('dev_tcn_to_en_beam=15.csv'), 'translation_output')
print(calc_bleu(df_dev_preds, lowercase=False))

9.921272696928181
CPU times: user 224 ms, sys: 4.2 ms, total: 228 ms
Wall time: 229 ms


In [None]:
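# Dev-set BLEU per model and beam size (kept as comments so the cell stays runnable).
# The second value for light_beam=15 equals the lowercase=False score printed above;
# the first value is presumably the lowercase=True score (TODO confirm).
# dynamic_beam=5  : 12.437685373830323, 7.626504997581818
# dynamic_beam=10 : 12.378759072018653, 7.564471674993053
# dynamic_beam=20 : (not recorded)

# light_beam=5  : 17.505513948817246, 9.871669716283721
# light_beam=10 : 17.50344531082184,  9.900432287028568
# light_beam=15 : 17.462990312181155, 9.921272696928181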

In [12]:
# List the top level of the gs://tensor2tensor-checkpoints bucket.
# NOTE(review): nothing else in the visible notebook uses this bucket. Confirm the
# cell is still needed.
!gsutil ls "gs://tensor2tensor-checkpoints"

gs://tensor2tensor-checkpoints/modelrl_experiments/
gs://tensor2tensor-checkpoints/transformer_asr_180214/
gs://tensor2tensor-checkpoints/transformer_ende_test/
gs://tensor2tensor-checkpoints/transformer_ende_test_1201/
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/


In [13]:
# List the objects under the transformer_multi_2jan19 prefix of the same bucket.
!gsutil ls "gs://tensor2tensor-checkpoints/transformer_multi_2jan19"

gs://tensor2tensor-checkpoints/transformer_multi_2jan19/checkpoint
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00000-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00001-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00002-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00003-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00004-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00005-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00006-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00007-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.data-00008-of-00009
gs://tensor2tensor-checkpoints/transformer_multi_2jan19/model.ckpt-639000.index
gs://te