# Library

In [1]:
import platform
import os
import re
import random
import multiprocessing
import itertools
from datetime import datetime
from collections import Counter

import nltk
import jieba
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
SEED = 42

os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)


In [3]:
MAX_WORD = 200
EMBEDDING_DIMENSION = 30


In [4]:
print('Python version:', platform.python_version())
print('Jieba Version:', jieba.__version__)
print('NLTK Version:', nltk.__version__)
print('Pandas Version:', pd.__version__)
print('Numpy Version:', np.__version__)
print('Seaborn Version:', sns.__version__)
# print('FastText Version:', fasttext.__version__)


Python version: 3.8.3
Jieba Version: 0.42.1
NLTK Version: 3.5
Pandas Version: 1.0.5
Numpy Version: 1.19.0
Seaborn Version: 0.10.1


In [5]:
jieba.enable_parallel(multiprocessing.cpu_count())



Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.549 seconds.
Prefix dict has been built successfully.


# Dataset

In [6]:
train_en = pd.read_csv('./data/csv/train_en.csv', usecols=['product_title'])
train_en = train_en['product_title']

train_tcn = pd.read_csv('./data/csv/train_tcn.csv', usecols=['product_title'])
train_tcn = train_tcn['product_title']

dev_en = pd.read_csv('./data/csv/dev_en.csv', usecols=['translation_output'])
dev_en = dev_en['translation_output']

dev_tcn = pd.read_csv('./data/csv/dev_tcn.csv', usecols=['text'])
dev_tcn = dev_tcn['text']

test_tcn = pd.read_csv('./data/csv/test_tcn.csv', usecols=['text'])
test_tcn = test_tcn['text']


# Clean Text

 1. Lower case all title
 2. Remove title containing '\n', '\"' or ','
 3. Tokenize and
   * Only keep alphabet for english title
   * Only keep CJK Unified Ideographs for traditional chinese title
 4. Join token into string with whitespace as seperator

In [7]:
def clean_tcn(text):
    try:
        # 4E00—9FFF is range of CJK (Chinese, Japanese, Korean) Unified Ideographs
        # https://unicode-table.com/en/blocks/cjk-unified-ideographs/
        new_text = re.sub(r'[^\u4e00-\u9fff]', ' ', text)
        return new_text
    except Exception as ex:
        print(f'Text : {text}')
        print(ex)
        return text

def tokenize_tcn(text, mode):
    try:
        result = jieba.tokenize(text, mode=mode)
        token = [r[0] for r in result]
    except Exception as ex:
        print(f'Text : {text}')
        print(ex)
        token = []
    finally:
        return token

def tokenize_en(text):
    try:
        token = nltk.word_tokenize(text)
        token = [t for t in token if t == re.sub(r'[^a-z]', '', t)]
        return token
    except:
        return []

def token2text(token):
    try:
        text = ' '.join(token)
        return text
    except Exception as ex:
        print(f'Token : {token}')
        print(ex)
        return token

def preprocess(sr, lang='en', mode='default'):
    sr = sr.str.lower()
    sr = sr[~((sr.str.contains('\n')) & 
                (sr.str.contains('\"')) & 
                (sr.str.contains(',')))]
    
    if lang == 'en':
        sr = sr.apply(tokenize_en)
    else: # tcn
        sr = sr.apply(clean_tcn)
        sr = sr.apply(lambda t: tokenize_tcn(t, mode))
    sr = sr.apply(token2text)

    return sr


In [8]:
train_en = preprocess(train_en)
train_tcn = preprocess(train_tcn, lang='tcn', mode='default')
train_tcn2 = preprocess(train_tcn, lang='tcn', mode='search')
val_en = preprocess(train_en)
val_tcn = preprocess(train_tcn, lang='tcn', mode='default')
val_tcn2 = preprocess(train_tcn, lang='tcn', mode='search')
test_tcn = preprocess(train_tcn, lang='tcn', mode='search')
test_tcn2 = preprocess(train_tcn, lang='tcn', mode='search')


Text : nan
expected string or bytes-like object
Text : nan
jieba: the input parameter should be unicode.


In [9]:
en_sr = pd.concat([train_en, val_en], axis=0)
en_sr = en_sr.dropna().drop_duplicates(keep='first').reset_index(drop=True)
tcn_sr = pd.concat([train_tcn, train_tcn2, val_tcn, val_tcn2, test_tcn, test_tcn2], axis=0)
tcn_sr = tcn_sr.dropna().drop_duplicates(keep='first').reset_index(drop=True)


In [10]:
tcn_sr = tcn_sr.apply(lambda t: re.sub(r'\s{2,}', ' ', t))
tcn_sr = tcn_sr.str.lstrip(' ')
tcn_sr = tcn_sr.str.rstrip(' ')


In [11]:
en_sr.to_csv('./data/txt/train_en.txt', header=False, index=False)
tcn_sr.to_csv('./data/txt/train_tcn.txt', header=False, index=False)
