## 使用 fast align 過的 UM data 建立 phrase table

Step1. build phrase table: for Chinese prepositions -> only align to English prepositions

Step2. Delete translations whose count is below max(3, 0.3% of the source word's total count)

Step3. Keep only the English-Chinese pairs that appear in the phrase tables of both translation directions

In [1]:
from collections import Counter, defaultdict
import json
import re

##### 英文介系詞限定為 GRAMMAR PATTERN 中出現過的介詞、中文介系詞限定為 annotated data 中標記過的介詞

In [2]:
# Full-width punctuation marks (not referenced by the other cells shown here).
postag = {'。', '，', '、', '「', '」', '！', '？', '：', '；', '–'}

# English prepositions: restricted to those that occur in the GRAMMAR PATTERN data.
prep_en = {'about', 'across', 'after', 'against', 'among', 'around', 'as', 'at', 'between', 'by', 'for', 'from', 'in', 'into', 'like', 'not', 'of',
 'off', 'on', 'onto', 'out', 'over', 'round', 'so', 'though', 'through', 'to', 'together', 'toward', 'towards', 'under', 'with'}

# Chinese prepositions: restricted to those labelled in the annotated data,
# one per line. The file holds Chinese text, so the encoding is given
# explicitly instead of relying on the platform default.
with open("data/annotation_prep.txt", encoding="utf-8") as file:
    # These four words are explicitly removed from the preposition list.
    prep_ch = {line.strip() for line in file} - {'而', '是', '使', '就'}

In [3]:
# Load the word-alignment lines and the parallel sentence lines; the two
# files are consumed in lockstep (line i of one belongs to line i of the other).
# Context managers close the handles, and the encoding is explicit because
# the sentence file contains Chinese text.
with open("data/UM.align", encoding="utf-8") as align_fh:
    align_file = align_fh.readlines()
with open("data/UM_align_file.txt", encoding="utf-8") as sent_fh:
    align_sent_file = sent_fh.readlines()

### Step1. Build phrase table of English and Chinese

In [5]:
def build_phrase_table(align_file, align_sent_file, ch_preps=None, en_preps=None):
    """Count sentence-level translations in both directions.

    For every sentence, all Chinese words aligned to the same (lower-cased)
    English word are joined, in Chinese token order, into one translation
    string, and that string is counted once for the sentence. The same is
    done in the Chinese -> English direction.

    Args:
        align_file: iterable of alignment lines, each a whitespace-separated
            list of ``i-j`` pairs where ``i`` indexes the English tokens and
            ``j`` the Chinese tokens of the matching sentence line.
        align_sent_file: iterable of sentence lines of the form
            ``english tokens ||| chinese tokens``.
        ch_preps: set of Chinese prepositions; defaults to the module-level
            ``prep_ch``.
        en_preps: set of English prepositions; defaults to the module-level
            ``prep_en``.

    Returns:
        ``(trans_dict_en, trans_dict_ch)``, both ``defaultdict(Counter)``:
        ``trans_dict_en['talk']['討論'] == 5`` and
        ``trans_dict_ch['討論']['talk'] == 5``.

    Raises:
        IndexError: if a sentence line has no ``|||`` separator.
    """
    if ch_preps is None:
        ch_preps = prep_ch
    if en_preps is None:
        en_preps = prep_en

    # Compiled once; a match means the token contains at least one Han character.
    han = re.compile('[\u4e00-\u9fa5]+')

    trans_dict_en = defaultdict(Counter)  # trans_dict_en['talk']['討論'] = 5
    trans_dict_ch = defaultdict(Counter)  # trans_dict_ch['討論']['talk'] = 5

    for align, sent in zip(align_file, align_sent_file):
        halves = sent.split('|||')
        # Tokenise each side once per sentence rather than once per pair.
        tokens_en = halves[0].split()
        tokens_ch = halves[1].split()

        # Alignments of the current sentence only.
        sent_dict_en = defaultdict(list)
        sent_dict_ch = defaultdict(list)

        for word_pair in align.split():
            fields = word_pair.split('-')
            try:
                en_idx = int(fields[0])
                ch_idx = int(fields[1])
                en_word = tokens_en[en_idx].lower()
                ch_word = tokens_ch[ch_idx]
            except (IndexError, ValueError):
                # Malformed pair or index outside the sentence: skip it.
                continue

            # Keep only genuine Chinese <-> alphabetic-English pairs.
            if not han.search(ch_word) or ch_word == '了' or not en_word.isalpha():
                continue
            # A Chinese preposition may only align to an English preposition.
            if ch_word in ch_preps and en_word not in en_preps:
                continue

            sent_dict_en[en_word].append((ch_word, ch_idx))
            sent_dict_ch[ch_word].append((en_word, en_idx))

        for en_word, ch_words in sent_dict_en.items():
            ordered = sorted(ch_words, key=lambda pair: pair[1])
            trans_dict_en[en_word][' '.join(w for w, _ in ordered)] += 1
        for ch_word, en_words in sent_dict_ch.items():
            ordered = sorted(en_words, key=lambda pair: pair[1])
            trans_dict_ch[ch_word][' '.join(w for w, _ in ordered)] += 1

    return trans_dict_en, trans_dict_ch

# Build the raw English->Chinese and Chinese->English tables from the loaded data.
trans_dict_en, trans_dict_ch = build_phrase_table(align_file, align_sent_file)

### Step2. Delete translations whose count is below max(3, 0.3% of the word's total count)

In [9]:
# Corpus frequency of every token, used as the base of the 0.3% threshold.
word_count_en = Counter() # word_count_en['talk'] = count
word_count_ch = Counter()

for sent in align_sent_file:
    halves = sent.split('|||')
    sent_en = halves[0].strip()
    sent_ch = halves[1].strip()
    # English phrase-table keys are lower-cased, so the counts must be
    # lower-cased too; otherwise 'Talk' and 'talk' are tallied separately
    # and the threshold for 'talk' is computed from too small a total.
    word_count_en.update(token.lower() for token in sent_en.split())
    word_count_ch.update(sent_ch.split())

In [23]:
def filt_by_count(dict_, word_count, ratio=0.003, min_count=3):
    """Drop rare translations from a phrase table.

    A translation of ``word`` is kept when its count is at least
    ``max(word_count[word] * ratio, min_count)``.

    Args:
        dict_: mapping ``word -> {translation: count}``.
        word_count: mapping ``word -> total occurrences``; a missing word is
            treated as 0, so only ``min_count`` applies to it.
        ratio: fraction of the word's total count a translation must reach
            (default 0.003, i.e. 0.3%).
        min_count: absolute floor for the threshold (default 3).

    Returns:
        A new ``defaultdict(Counter)`` holding only the surviving
        translations; words with no survivor are absent.
    """
    trans_filt = defaultdict(Counter)
    for word, translations in dict_.items():
        threshold = max(word_count.get(word, 0) * ratio, min_count)
        for trans, count in translations.items():
            if count >= threshold:
                trans_filt[word][trans] = count
    return trans_filt

# Apply the count threshold to each direction, using that language's word counts.
trans_dict_en_filt = filt_by_count(trans_dict_en, word_count_en)
trans_dict_ch_filt = filt_by_count(trans_dict_ch, word_count_ch)

### Step3. Remove translations that do not also appear in the reverse-direction phrase table

In [28]:
def filt_by_trans(dict_1, dict_2):
    """Keep only translations confirmed by the reverse-direction table.

    A pair ``word -> trans`` from ``dict_1`` survives when ``word`` is itself
    listed as a translation of ``trans`` in ``dict_2``.

    Args:
        dict_1: mapping ``word -> {translation: count}``.
        dict_2: reverse-direction mapping ``translation -> {word: count}``.
            It is only read: ``.get`` is used so that looking up a missing
            translation does not insert an empty entry into a defaultdict
            (and does not raise on a plain dict).

    Returns:
        A new ``defaultdict(Counter)`` with the confirmed pairs and their
        counts taken from ``dict_1``.
    """
    trans_new = defaultdict(Counter)
    for word, translations in dict_1.items():
        for trans, count in translations.items():
            if word in dict_2.get(trans, ()):
                trans_new[word][trans] = count
    return trans_new

# Final English->Chinese table: filtered entries that the filtered Chinese->English table confirms.
trans_dict = filt_by_trans(trans_dict_en_filt, trans_dict_ch_filt)

In [30]:
# Inspect the surviving translations of 'talk', highest count first.
trans_dict['talk'].most_common()

[('說', 931),
 ('談論', 865),
 ('說話', 683),
 ('談', 680),
 ('討論', 530),
 ('談談', 515),
 ('會談', 418),
 ('談話', 333),
 ('交談', 325),
 ('談判', 314),
 ('講', 273),
 ('談到', 268),
 ('演講', 201),
 ('講話', 165),
 ('聊天', 158),
 ('對話', 142),
 ('說服', 87),
 ('交流', 85),
 ('聊', 83),
 ('聊聊', 80),
 ('提到', 74),
 ('說到', 70),
 ('話', 60),
 ('告訴', 49),
 ('談起', 48),
 ('說說', 45)]

#### write to file

In [29]:
# Persist the final table. ensure_ascii=False writes the Chinese characters
# literally, so the file is opened as UTF-8 explicitly rather than with the
# platform default encoding.
with open("data/alignment_table_all_final.json", 'w', encoding="utf-8") as outf:
    json.dump(trans_dict, outf, ensure_ascii=False)