-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess-tuning.sh
executable file
·67 lines (53 loc) · 2.54 KB
/
preprocess-tuning.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/sh
# suffix of source language files
SRC=en
SRCTAG=en
# suffix of target language files
TRG=es
TRGTAG=es
tools=tools
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=$tools/moses
# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
subword_nmt=$tools/subword-nmt
# tokenize
for prefix in train dev
do
cat tuning/$prefix.$SRCTAG | \
$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC | \
$tools/normalise-romanian.py | \
$tools/remove-diacritics.py | \
$tools/tokenizer.perl -a -l $SRC > tuning/$prefix.tok.$SRCTAG
cat tuning/$prefix.$TRGTAG | \
$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG | \
$tools/tokenizer.perl -a -l $TRG > tuning/$prefix.tok.$TRGTAG
done
# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
$mosesdecoder/scripts/training/clean-corpus-n.perl tuning/train.tok $SRCTAG $TRGTAG tuning/train.tok.clean 1 80
# train truecaser
#$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/train.tok.clean.$SRCTAG -model model/truecase-model.$SRCTAG
#$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/train.tok.clean.$TRGTAG -model model/truecase-model.$TRGTAG
# apply truecaser (cleaned training corpus)
for prefix in train
do
$tools/scripts/truecase.perl -model model/truecase-model.$SRCTAG < tuning/$prefix.tok.clean.$SRCTAG > tuning/$prefix.tc.$SRCTAG
$tools/scripts/truecase.perl -model model/truecase-model.$TRGTAG < tuning/$prefix.tok.clean.$TRGTAG > tuning/$prefix.tc.$TRGTAG
done
# apply truecaser (dev/test files)
for prefix in dev
do
$tools/scripts/truecase.perl -model model/truecase-model.$SRCTAG < tuning/$prefix.tok.$SRCTAG > tuning/$prefix.tc.$SRCTAG
$tools/scripts/truecase.perl -model model/truecase-model.$TRGTAG < tuning/$prefix.tok.$TRGTAG > tuning/$prefix.tc.$TRGTAG
done
# train BPE 源语言和目标语言分别单独训练 bpe
#cat data/train.tc.$SRCTAG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRCTAG.bpe
#cat data/train.tc.$TRGTAG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$TRGTAG.bpe
# apply BPE
for prefix in train dev
do
$tools/scripts/apply_bpe.py -c model/$SRCTAG.bpe < tuning/$prefix.tc.$SRCTAG > tuning/$prefix.bpe.$SRCTAG
$tools/scripts/apply_bpe.py -c model/$TRGTAG.bpe < tuning/$prefix.tc.$TRGTAG > tuning/$prefix.bpe.$TRGTAG
done
###############################################
# build network dictionary
#$nematus/data/build_dictionary.py data/train.bpe.$SRCTAG data/train.bpe.$TRGTAG