In [None]:
# Mount Google Drive; later cells read and write checkpoints and logs under
# /content/drive/My Drive/bert_on_collab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Report the GPU(s) visible to this runtime before any training is started.
!nvidia-smi

In [None]:
!git clone https://github.com/bert-nmt/bert-nmt

In [None]:
cd bert-nmt

In [None]:
!pip install --editable .

## 1. Data Preprocessing

### 1-1. Get tokenized and BPE-encoded files
=> train.ko train.th valid.ko valid.th test.ko test.th


In [None]:
!mkdir /content/bert-nmt/examples/translation/ko-th_data

In [None]:
# Manual step: upload the raw parallel corpus into
# /content/bert-nmt/examples/translation/ko-th_data before continuing.
# prepare-koth.sh reads these four files from that directory:
#   test.tags.ko-th.th  train.tags.ko-th.th
#   test.tags.ko-th.ko  train.tags.ko-th.ko

In [None]:
cd /content/bert-nmt

In [None]:
# Optional cleanup: uncomment to delete a previously written prepare-koth.sh.
#! rm examples/translation/prepare-koth.sh

In [None]:
%%writefile examples/translation/prepare-koth.sh

# Prepare the Korean-Thai corpus for NMT training.
# Run from examples/translation/. Reads the raw files
#   ko-th_data/{train,test}.tags.ko-th.{ko,th}
# and writes BPE-encoded train/valid/test files for both languages into
#   iwslt14.tokenized.ko-th/
# Intermediate files are kept in iwslt14.tokenized.ko-th/tmp/.

# Clone the tools only when they are missing so the script can be re-run.
if [ ! -d mosesdecoder ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [ ! -d subword-nmt ]; then
    echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
    git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt
BPE_TOKENS=10000


if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    # Non-zero status so callers can tell that preparation did not happen.
    exit 1
fi

src=ko
tgt=th
lang=ko-th
prep=iwslt14.tokenized.ko-th
tmp=$prep/tmp
orig=ko-th_data


mkdir -p "$tmp" "$prep"
echo $orig $tmp $prep


echo "pre-processing train data..."
for l in $src $tgt; do
    f=train.tags.$lang.$l
    tok=train.tags.$lang.tok.$l

    perl "$TOKENIZER" -threads 8 -l $l < "$orig/$f" > "$tmp/$tok"
    echo ""
done
# Filter the tokenized pairs (length limits 1 and 175, ratio 1.5), then
# lowercase what is left.
perl "$CLEAN" -ratio 1.5 "$tmp/train.tags.$lang.tok" $src $tgt "$tmp/train.tags.$lang.clean" 1 175
for l in $src $tgt; do
    perl "$LC" < "$tmp/train.tags.$lang.clean.$l" > "$tmp/train.tags.$lang.$l"
done



echo "pre-processing test data..."
for l in $src $tgt; do
    f=test.tags.$lang.$l
    tok=test.tags.$lang.tok.$l

    # Same tokenize + lowercase steps as the training data. No length
    # filtering here: every test sentence has to be kept.
    perl "$TOKENIZER" -threads 8 -l $l < "$orig/$f" | perl "$LC" > "$tmp/$tok"
    echo ""
done


echo "creating train, valid, test..."
for l in $src $tgt; do
    # Every 23rd line of the cleaned training data is held out for validation.
    awk '{if (NR%23 == 0)  print $0; }' "$tmp/train.tags.$lang.$l" > "$tmp/valid.$l"
    awk '{if (NR%23 != 0)  print $0; }' "$tmp/train.tags.$lang.$l" > "$tmp/train.$l"

    # Take the test split from the tokenized, lowercased file produced above
    # (not from the raw upload) so it matches how train/valid were processed.
    cat "$tmp/test.tags.$lang.tok.$l" > "$tmp/test.$l"
done


TRAIN=$tmp/train.$lang
BPE_CODE=$prep/code
rm -f "$TRAIN"
for l in $src $tgt; do
    # Append each language's training text to one file so a single, joint BPE
    # code is learned over both languages.
    cat "$tmp/train.$l" >> "$TRAIN"
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < "$TRAIN" > "$BPE_CODE"

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
    done
done

Writing examples/translation/prepare-koth.sh


In [None]:
# Run the preparation script from examples/translation/, the directory its
# relative paths (ko-th_data, mosesdecoder, subword-nmt) are resolved against.
!cd examples/translation/ && sh ./prepare-koth.sh

### 1-2. Get the input files for the BERT model
=> train.ko train.th valid.ko valid.th test.ko test.th train.bert.ko valid.bert.ko test.bert.ko


In [None]:
# Copy the repository's makedataforbert.sh into the prepared-data directory so
# it can be run from inside that directory.
!cp examples/translation/makedataforbert.sh examples/translation/iwslt14.tokenized.ko-th/makedataforbert.sh

In [None]:
# Run makedataforbert.sh for the source language (ko). Per the section heading
# this step is expected to produce {train,valid,test}.bert.ko.
!cd examples/translation/iwslt14.tokenized.ko-th && sh ./makedataforbert.sh ko

### 1-3. Preprocess (binarize) the data the Fairseq way

In [None]:
# Directory (relative to the bert-nmt root) holding the BPE-encoded
# train/valid/test files; interpolated as $TEXT in the preprocess cell.
TEXT = (
    "examples/translation/"
    "iwslt14.tokenized.ko-th"
)

In [None]:
!python preprocess.py --source-lang ko --target-lang th \
  --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
  --destdir /content/bert-nmt/examples/translation/preprocess.ko-th  --joined-dictionary --bert-model-name bert-base-cased

## 2. Train a vanilla NMT model using Fairseq

### 2-1. Train a Transformer translation model on this data


In [None]:
cd /content

In [None]:
!git clone https://github.com/pytorch/fairseq

In [None]:
cd fairseq

In [None]:
!pip install --editable ./

In [None]:
!pip install sacremoses

In [None]:
!python train.py \
    /content/bert-nmt/examples/translation/preprocess.ko-th \
    --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --save-dir /content/drive/My\ Drive/bert_on_collab/checkpoints_pretrain \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    | tee -a /content/drive/My\ Drive/bert_on_collab/checkpoints_pretrain/training.log

## 3. Train a BERT-fused NMT model

In [None]:
# Optional cleanup: uncomment to delete a previously written training_script.sh.
#!rm training_script.sh

In [None]:
%%writefile training_script.sh
#!/usr/bin/env bash
# Train the BERT-fused NMT model, warm-started from the vanilla NMT checkpoint.
# Checkpoints and training.log are written to $SAVEDIR on Google Drive.
nvidia-smi

# train.py must be the bert-nmt one: the --bert-model-name and
# --encoder-bert-dropout options used below are not upstream fairseq options.
cd /content/bert-nmt
python3 -c "import torch; print(torch.__version__)"

src=ko
tgt=th
bedropout=0.5
# NOTE(review): this is the same --arch as the vanilla fairseq training cell of
# the notebook; confirm it is the architecture bert-nmt expects for a
# BERT-fused model.
ARCH=transformer_iwslt_de_en
DATAPATH=/content/bert-nmt/examples/translation/preprocess.ko-th
# The Drive path contains a space, so it is kept in variables and always
# expanded inside double quotes.
DRIVEDIR="/content/drive/My Drive/bert_on_collab"
SAVEDIR="$DRIVEDIR/checkpoints_train"
PRETRAINED="$DRIVEDIR/checkpoints_pretrain/checkpoint_best.pt"

# -p: do not fail when the directory is already there (resumed runs).
mkdir -p "$SAVEDIR"
if [ ! -f "$SAVEDIR/checkpoint_nmt.pt" ]
then
    cp "$PRETRAINED" "$SAVEDIR/checkpoint_nmt.pt"
fi
# Warm up from the NMT checkpoint only on the first run. Once checkpoint_last.pt
# exists the run is a resume and must not reset the LR scheduler again.
# (A backslash-escaped space inside double quotes is a literal backslash, which
# made this test always true; the quoted variable avoids that.)
if [ ! -f "$SAVEDIR/checkpoint_last.pt" ]
then
warmup="--warmup-from-nmt --reset-lr-scheduler"
else
warmup=""
fi

# $warmup is intentionally unquoted: it holds zero or two separate options.
python train.py $DATAPATH \
-a $ARCH --optimizer adam --lr 0.0005 -s $src -t $tgt --label-smoothing 0.1 \
--dropout 0.3 --max-tokens 4000 --min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --max-update 150000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9,0.98)' \
--max-epoch 4 \
--save-dir "$SAVEDIR" --share-all-embeddings $warmup \
--encoder-bert-dropout --encoder-bert-dropout-ratio $bedropout \
--bert-model-name bert-base-multilingual-cased \
 | tee -a "$SAVEDIR/training.log"

In [None]:
# Launch BERT-fused training. The script itself also appends the console
# output to training.log in its save directory.
!sh ./training_script.sh

## 4. Generate

In [None]:
!python generate.py  --quiet --bert-model-name bert-base-multilingual-cased \
                    --path /content/drive/My\ Drive/bert_on_collab/checkpoints_train/checkpoint_best.pt \
                    examples/translation/iwslt14.tokenized.ko-th \
                    | tee -a /content/drive/My\ Drive/bert_on_collab/checkpoints_train/generate.log

## 5. Interactive

In [None]:
# Source sentences to translate, one string per sentence; fill this list
# before running the cells that write it to disk.
input_lst = list()

In [None]:
# File the sentences in input_lst are written to; interactive_script.sh reads
# it back as test_input.ko.
output_file_path = "/content/bert-nmt/test_input.ko"

In [None]:
# Write one sentence per line as UTF-8. The `with` block closes the file on
# exit, so no explicit close() is needed, and plain "w" is enough because the
# file is only written here, never read back.
with open(output_file_path, "w", encoding="utf-8") as f:
  f.writelines(s + '\n' for s in input_lst)

In [None]:
%%writefile interactive_script.sh

# Translate the sentences in /content/bert-nmt/test_input.ko with the best
# BERT-fused checkpoint; the result is written to /content/bert-nmt/output.log.
MOSE=/content/bert-nmt/examples/translation/mosesdecoder
bpefile=test_input.ko
src=ko
tgt=th
DATAPATH=/content/bert-nmt/examples/translation/preprocess.ko-th

# test_input.ko is written into /content/bert-nmt and bert-nmt's interactive.py
# lives there too, so run from that directory whatever the caller's is.
cd /content/bert-nmt

# Build the input for interactive.py: each sentence contributes two
# consecutive lines, the original line followed by its BPE-stripped,
# detokenized form (presumably the text handed to BERT - confirm).
sed -r 's/(@@ )|(@@ ?$)//g' $bpefile > $bpefile.debpe
$MOSE/scripts/tokenizer/detokenizer.perl -l $src < $bpefile.debpe > $bpefile.debpe.detok
paste -d "\n" $bpefile $bpefile.debpe.detok > $bpefile.in
# Every line of the command ends with a backslash so that the final redirect
# belongs to interactive.py; without it `> output.log` is a separate command
# that only creates an empty file.
cat $bpefile.in | python interactive.py \
$DATAPATH \
-s $src -t $tgt \
--bert-model-name bert-base-multilingual-cased \
--path "/content/drive/My Drive/bert_on_collab/checkpoints_train/checkpoint_best.pt" \
--buffer-size 1024 --batch-size 128 --beam 5 --remove-bpe \
> output.log

In [None]:
# Run the translation script written by the previous cell.
!sh ./interactive_script.sh