# Library

In [1]:
import os
import re
import random
import itertools
import platform
import multiprocessing
import multiprocessing.dummy
import logging
import inspect
from typing import List
from datetime import datetime

import jieba
import nltk
import faiss
import numpy as np
import pandas as pd
import sacrebleu
import regex
from sacrebleu import corpus_bleu


In [2]:
print('Python version:', platform.python_version())
print('Jieba version', jieba.__version__)
print('NLTK version:', nltk.__version__)
print('Numpy version:', np.__version__)
print('Pandas version:', pd.__version__)
print('Sacrebleu version:', sacrebleu.__version__)
print('Regex version:', regex.__version__)


Python version: 3.8.3
Jieba version 0.42.1
NLTK version: 3.5
Numpy version: 1.19.0
Pandas version: 1.0.5
Sacrebleu version: 1.4.12
Regex version: 2.5.64


In [3]:
if hasattr(faiss, 'StandardGpuResources'):
    print('faiss-gpu detected!')
else:
    print('faiss-cpu detected!')


faiss-gpu detected!


In [4]:
SEED = 42

os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)


In [5]:
OTHERS_PATTERN: re.Pattern = regex.compile(r'\p{So}')

START_DATETIME = str(datetime.now())
logging.basicConfig(filename=f'log_{START_DATETIME}.log',level=logging.DEBUG)

DIMENSION = 300
K = 5
RES = faiss.StandardGpuResources()

CO = faiss.GpuClonerOptions()
CO.useFloat16 = False
CO.usePrecomputed = False
CO.indicesOptions = faiss.INDICES_CPU


# Vector

 Perform nearest neighbors precomputation with all source tokens, so translation speed would be very fast

In [6]:
def load_fasttext_embedding(filepath):
    word_index = 0
    word2id = {}
    id2word = {}
    embedding = []

    with open(filepath, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                header = line.split()
                assert len(header) == 2
                assert DIMENSION == int(header[1])
                embedding = np.empty(
                    (int(header[0]), int(header[1])),
                    dtype=np.float32
                )
            else:
                word, word_vector = line.rstrip().split(' ', 1)
                word_vector = np.fromstring(word_vector, sep=' ')

                word2id[word] = word_index
                id2word[word_index] = word
                embedding[i-1] = word_vector[None]
                word_index += 1

    return {
        'word2id'   : word2id,
        'id2word'   : id2word,
        'embedding' : embedding
    }


In [7]:
vec_ours_en = load_fasttext_embedding('./data/muse/debug/whw5r77gaz/vectors-en.txt')
vec_ours_zh = load_fasttext_embedding('./data/muse/debug/whw5r77gaz/vectors-zh.txt')

vec_wiki_en = load_fasttext_embedding('./data/wikivec/wiki.en.align.vec')
vec_wiki_zh = load_fasttext_embedding('./data/wikivec/wiki.zh.align.vec')


In [8]:
def create_index(distance, src, tgt):
    if distance == 'IP':
        index = faiss.IndexFlatIP(DIMENSION)
    elif distance == 'L2':
        index = faiss.IndexFlatL2(DIMENSION)
    else:
        index = faiss.IndexFlatIP(DIMENSION)

    index.add(tgt)
    index = faiss.index_cpu_to_gpu(RES, 0, index)

    src2trg_dist, src2trg_id = index.search(src, K)

    return {
        'index'         : index,
        'distance_type' : distance,
        'src2trg_dist'  : src2trg_dist,
        'src2trg_id'    : src2trg_id
    }


In [9]:
index_ours_ip = create_index('IP', vec_ours_zh['embedding'], vec_ours_en['embedding'])
# index_ours_l2 = create_index('L2', vec_ours_zh['embedding'], vec_ours_en['embedding'])

index_wiki_ip = create_index('IP', vec_wiki_zh['embedding'], vec_wiki_en['embedding'])
# index_wiki_l2 = create_index('L2', vec_wiki_zh['embedding'], vec_wiki_en['embedding'])


# Check sanity

In [10]:
def check_sanity(index, vec_trg):
    nearest_dist, nearest_id = index['index'].search(vec_trg['embedding'][1000:1010], K)
    words = [vec_trg['id2word'][i] for i in range(1000, 1010)]
    for i in range(len(words)):
        word = words[i]
        print('Word :', word)

        # word_id = vec_trg['word2id'][word]
        nearest_words = [vec_trg['id2word'][nearest_id[i, j]] for j in range(K)]
        nearest_words_dist = nearest_dist[i]

        print('Nearest words:')
        for i in range(K):
            print(f'{nearest_words[i]} ({nearest_words_dist[i]})')
        print('='*60)


In [11]:
check_sanity(index_ours_ip, vec_ours_en)
# check_sanity(index_ours_l2, vec_ours_en)

check_sanity(index_wiki_ip, vec_wiki_en)
# check_sanity(index_wiki_l2, vec_wiki_en)


Word : wars
Nearest words:
wars (33.9265022277832)
warfare (26.067710876464844)
jedi (23.048994064331055)
kylo (20.80528450012207)
vader (20.708839416503906)
Word : lather
Nearest words:
lather (24.970096588134766)
goop (19.940879821777344)
flog (17.8062686920166)
welt (17.789745330810547)
slash (16.381572723388672)
Word : therapy
Nearest words:
therapy (40.570106506347656)
theraphy (23.68070411682129)
therape (22.41028594970703)
kidsafe (21.02881622314453)
mesotherapy (19.31230354309082)
Word : drop
Nearest words:
drop (25.409725189208984)
drib (16.737855911254883)
dangle (16.11564826965332)
drop-off (15.995134353637695)
driblet (15.969258308410645)
Word : like
Nearest words:
like (28.236927032470703)
ilk (27.70448875427246)
alike (23.638751983642578)
comparable (22.66981315612793)
similar (21.006755828857422)
Word : j
Nearest words:
j (37.8682746887207)
joule (35.87204360961914)
js (19.0262393951416)
ruin (9.969573974609375)
dice (9.537631034851074)
Word : slingbag
Nearest words:
sli

# Check average top-1 index distance

In [12]:
dist_wiki = np.empty((len(index_wiki_ip['src2trg_dist'])))
for i in range(len(index_wiki_ip['src2trg_dist'])):
    dist_wiki[i] = index_wiki_ip['src2trg_dist'][i][0]


In [13]:
dist_wiki_sr = pd.Series(dist_wiki)
dist_wiki_sr.describe()


count    332647.000000
mean          0.262980
std           0.123026
min           0.025104
25%           0.155658
50%           0.238525
75%           0.370603
max           0.702280
dtype: float64

In [14]:
dist_ours = np.empty((len(index_ours_ip['src2trg_dist'])))
for i in range(len(index_ours_ip['src2trg_dist'])):
    dist_ours[i] = index_ours_ip['src2trg_dist'][i][0]


In [15]:
dist_ours_sr = pd.Series(dist_ours)
dist_ours_sr.describe()



count    309334.000000
mean          8.664581
std           3.561491
min           2.586657
25%           6.146866
50%           7.619271
75%          10.336281
max          43.770702
dtype: float64

# Dataset

In [16]:
val_en = pd.read_csv('./data/csv/dev_en.csv')
val_en = val_en['translation_output']

val_tcn = pd.read_csv('./data/csv/dev_tcn.csv', usecols=['text'])
val_tcn = val_tcn['text']
val_tcn = val_tcn.apply(lambda t:OTHERS_PATTERN.sub(' ', t))

test_tcn = pd.read_csv('./data/csv/test_tcn.csv', usecols=['text'])
test_tcn = test_tcn['text']
test_tcn = test_tcn.apply(lambda t:OTHERS_PATTERN.sub(' ', t))


# Create dict for missing token in Vector from Google Translate (unused)

In [17]:
# from googletrans import Translator
# translator = Translator(service_urls=['translate.google.com'])

# def flatten_2d(old_list):
#     new_list = []
#     for sublist in old_list:
#         for item in sublist:
#             new_list.append(item)
#     return new_list

# def translate_token(token):
#     batch_token = []
#     for i in range(0, len(token), 250):
#         if i+250 >= len(token):
#             curr_batch = token[i:len(token)]
#         else:
#             curr_batch = token[i:i+250]
#         string_batch ='\n'.join(curr_batch)
#         batch_token.append(string_batch)

#     translated_token = []
#     for t in batch_token:
#         tr_object = translator.translate(t, src='zh-tw', dest='en')
#         curr_batch_translated = tr_object.text
#         curr_batch_translated = curr_batch_translated.split('\n')
#         translated_token = translated_token + curr_batch_translated

#     return translated_token

# def translate_missing_tokens(vec_src, val, test):
#     # check parrarel token exist
#     parrarel_tokens_path = './data/csv/parrarel_tokens.csv'
#     if os.path.isfile(parrarel_tokens_path):
#         print('Parrarel token detected!')
#         df_parrarel_tokens = pd.read_csv(parrarel_tokens_path)

#         parrarel_tokens = {}
#         for i in df_parrarel_tokens.index:
#             parrarel_tokens[df_parrarel_tokens.loc[i, 'zh']] = df_parrarel_tokens.loc[i, 'en'].lower()
#         return parrarel_tokens

#     # get all tokens
#     sentences = val.to_list() + test.to_list()
#     print('Total sentences:', len(sentences))
#     tokens = []
#     for s in sentences:
#         result = jieba.tokenize(s, mode='default')
#         token = [r[0] for r in result]
#         token = [t for t in token if t != '' and t != ' ']
#         tokens.append(token)
#     for s in sentences:
#         result = jieba.tokenize(s, mode='default')
#         token = [r[0] for r in result]
#         token = [t for t in token if t != '' and t != ' ']
#         tokens.append(token)
#     tokens = flatten_2d(tokens)
#     print('Total token:', len(tokens))
#     tokens = list(set(tokens))
#     print('Total unique token:', len(tokens))

#     # filter token
#     tokens = [t for t in tokens if re.sub(r'[^\u4e00-\u9fff]', '', t) != '']
#     print('Total CJK unicode token:', len(tokens))

#     # get missing tokens in vector
#     missing_tokens = []
#     for t in tokens:
#         try:
#             vec_src['word2id'][t]
#         except KeyError:
#             missing_tokens.append(t)
#     print('Total missing token:', len(missing_tokens))

#     # translate all missing token
#     print('Translating missing token...')
#     translated_token = translate_token(missing_tokens)

#     # save to file
#     df_parrarel_tokens = pd.DataFrame({
#         'zh': missing_tokens,
#         'en': translated_token
#     })
#     df_parrarel_tokens.to_csv(parrarel_tokens_path, index=False)
#     print('Parrarel token saved at', parrarel_tokens_path)

#     parrarel_tokens = {}
#     for i in df_parrarel_tokens.index:
#         parrarel_tokens[df_parrarel_tokens.loc[i, 'zh']] = df_parrarel_tokens.loc[i, 'en'].lower()

#     return parrarel_tokens


In [18]:
# parrarel_tokens = translate_missing_tokens(vec_ours_zh, val_tcn, test_tcn)


# Translate function

In [19]:
def translate_word_ours(word):
    word_id = vec_ours_zh['word2id'][word]
    english_ids = index_ours_ip['src2trg_id'][word_id]

    tr_word = vec_ours_en['id2word'][english_ids[0]]
    tr_dist = index_ours_ip['src2trg_dist'][word_id][0]

    tr_word = tr_word.lower()
    tr_word = re.sub(r'[^a-z]', '', tr_word)

    return tr_word, tr_dist

def translate_word_wiki(word):
    word_id = vec_wiki_zh['word2id'][word]
    english_ids = index_wiki_ip['src2trg_id'][word_id]

    tr_word = vec_wiki_en['id2word'][english_ids[0]]
    tr_dist = index_wiki_ip['src2trg_dist'][word_id][0]

    tr_word = tr_word.lower()
    tr_word = re.sub(r'[^a-z]', '', tr_word)

    return tr_word, tr_dist

def translate_word(word, *args):
    min_range_ours = args[0]
    min_range_wiki = args[1]
    priority = args[2]

    if word != re.sub(r'[^\u4e00-\u9fff]', '', word):
        return word
    else:
        try:
            tr_word_ours, tr_dist_ours = translate_word_ours(word)
        except KeyError:
            tr_word_ours = None
        except Exception as ex:
            tr_word_ours = None
            logging.exception(f'def translate_word_ours')
            logging.exception(f'Word: {word}')
            logging.exception(ex)


        try:
            tr_word_wiki, tr_dist_wiki = translate_word_wiki(word)
        except KeyError:
            tr_word_wiki = None
        except Exception as ex:
            tr_word_ours = None
            logging.exception(f'def translate_word_wiki')
            logging.exception(f'Word: {word}')
            logging.exception(ex)

        if tr_word_ours is None and tr_word_wiki is None:
            # zh word not in both vector 
            return word
        elif tr_word_ours is not None and tr_word_wiki is not None:
            # zh word in both vector
            if priority == 'wiki':
                if tr_dist_ours < min_range_ours:
                    return tr_word_ours
                elif tr_dist_wiki < min_range_wiki:
                    return tr_word_wiki
                else:
                    return word
            else: # ours
                if tr_dist_wiki < min_range_wiki:
                    return tr_word_wiki
                elif tr_dist_ours < min_range_ours:
                    return tr_word_ours
                else:
                    return word
        elif tr_word_ours is not None:
            # zh word only in ours vector
            if tr_dist_ours < min_range_ours:
                return tr_word_ours
            else:
                return word
        elif tr_word_wiki is not None:
            # zh word only in wiki vector
            if tr_dist_wiki < min_range_wiki:
                return tr_word_wiki
            else:
                return word
        else:
            logging.warning("Reached 'else' on def translate_word")
            return word


def tokenize_tcn(text, mode):
    try:
        result = jieba.tokenize(text, mode=mode)
        token = [r[0] for r in result]
    except Exception as ex:
        logging.exception(f'def tokenize_tcn')
        logging.exception(f'Text: {text}')
        logging.exception(ex)
        token = []
    finally:
        return token

def translate_sentence(sentence, tokenize_mode, *args):
    '''
    tokenize_mode = 'default' or 'search'
    '''
    # tokenize
    sentence = sentence.lower()
    token = tokenize_tcn(sentence, mode=tokenize_mode)
    token = [t for t in token if t != '' and t != ' ']

    # translate
    new_token = []
    for t in token:
        new_token.append(translate_word(t, *args))
    new_sentence = ' '.join(new_token)

    return new_sentence


# Translate & eval validation data

In [20]:
def eval(preds: List[str], refs: List[str]) -> float:
    """BLEU score computation.

    Strips all characters belonging to the unicode category "So".
    Tokenize with standard WMT "13a" tokenizer.
    Compute 4-BLEU.

    Args:
        preds (List[str]): List of translated texts.
        refs (List[str]): List of target reference texts.
    """
    preds = [OTHERS_PATTERN.sub(' ', text) for text in preds]
    refs = [OTHERS_PATTERN.sub(' ', text) for text in refs]
    return corpus_bleu(
        preds, [refs],
        lowercase=True,
        tokenize='13a',
        use_effective_order= False
    ).score


In [21]:
tokenize_mode = ['search', 'default']
min_range_ours = [v/10 for v in range(50, 450, 25)]
min_range_wiki = [v/100 for v in range(20, 75, 5)]
priority = ['wiki', 'ours']

translate_param = list(itertools.product(*[tokenize_mode, min_range_ours, min_range_wiki, priority]))
print('Total translate param permutation:', len(translate_param))

translated_val_tcn_list = []
for i in range(len(translate_param)):
    logging.info(f'Start translating validation data with param #{i}')
    logging.info(translate_param[i])
    translated_val_tcn = []
    for j in val_tcn.index:
        translated_val_tcn.append(
            translate_sentence(val_tcn[j], *translate_param[i])
        )
    translated_val_tcn_list.append(translated_val_tcn)


Building prefix dict from the default dictionary ...
Total translate param permutation: 704
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.734 seconds.
Prefix dict has been built successfully.


In [22]:
best_index = 0
best_bleu_score = 0
for i in range(len(translated_val_tcn_list)):
    bleu_score = eval(translated_val_tcn_list[i], val_en.to_list())
    logging.info(f'Bleu score for param {i}: {bleu_score}')
    print(f'Bleu score for param {i}: {bleu_score}')
    if bleu_score > best_bleu_score:
        best_index = i
        best_bleu_score = bleu_score


Bleu score for param 0: 19.12699204234897
Bleu score for param 1: 19.12699204234897
Bleu score for param 2: 19.130286443528398
Bleu score for param 3: 19.130286443528398
Bleu score for param 4: 19.132372990051543
Bleu score for param 5: 19.132372990051543
Bleu score for param 6: 19.147482372287325
Bleu score for param 7: 19.147482372287325
Bleu score for param 8: 19.461353188326058
Bleu score for param 9: 19.461353188326058
Bleu score for param 10: 20.123357104568186
Bleu score for param 11: 20.123357104568186
Bleu score for param 12: 21.92009976838403
Bleu score for param 13: 21.92009976838403
Bleu score for param 14: 24.869039012244265
Bleu score for param 15: 24.870291387371598
Bleu score for param 16: 26.599534607816775
Bleu score for param 17: 26.600812111103497
Bleu score for param 18: 27.022394390164663
Bleu score for param 19: 27.02367912140134
Bleu score for param 20: 27.16720939951434
Bleu score for param 21: 27.168493203095164
Bleu score for param 22: 19.12699204234897
Bleu 

# Translate test data

In [23]:
print(f'Start translating test data with best param #{best_index}')
print(translate_param[best_index])

logging.info(f'Start translating test data with best param #{best_index}')
logging.info(translate_param[best_index])

translated_test_tcn = []

for i in test_tcn.index:
    translated_test_tcn.append(translate_sentence(test_tcn[i], *translate_param[best_index]))


Start translating test data with best param #373
('default', 5.0, 0.7, 'ours')


In [24]:
df_submission = pd.DataFrame(translated_test_tcn, columns=['translation_output'])
df_submission


Unnamed: 0,translation_output
0,【 polarstar 】 美麗諾 wool warmth stockings 『 淺灰 』...
1,sweet crystal ~ natural crystal divination 珠手 ...
2,粉晶 hexagon 柱純 銀項
3,3m scotch vhb 超強力 hexbreaker - outdoor v1808
4,lights exclusive discounts * 4 box
...,...
9995,rains backpack backpack briefcase 筆電包 denmark...
9996,airwalk wickies 童鞋 blue 中童 a823230180 no002
9997,norns 【 disney 5000mah action electrict ( 泰瑞 c...
9998,derwent 達爾文設 arsenic deluxe 12 ( 6b - 4h ...


In [25]:
df_submission.to_csv(f'./submission_{START_DATETIME}.csv', index=False)
