In [6]:
!pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.1-py2.py3-none-win_amd64.whl (726 kB)
Installing collected packages: opencc
Successfully installed opencc-1.1.1


In [8]:
import subprocess, gc, emoji, re

import opencc

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [26]:
# OpenCC conversion profiles (the file name is passed to opencc.OpenCC):
#   s2t.json    Simplified Chinese -> Traditional Chinese
#   t2s.json    Traditional Chinese -> Simplified Chinese
#   s2tw.json   Simplified Chinese -> Traditional Chinese (Taiwan Standard)
#   tw2s.json   Traditional Chinese (Taiwan Standard) -> Simplified Chinese
#   s2hk.json   Simplified Chinese -> Traditional Chinese (Hong Kong variant)
#   hk2s.json   Traditional Chinese (Hong Kong variant) -> Simplified Chinese
#   s2twp.json  Simplified Chinese -> Traditional Chinese (Taiwan Standard), with Taiwanese idiom
#   tw2sp.json  Traditional Chinese (Taiwan Standard) -> Simplified Chinese, with Mainland Chinese idiom
#   t2tw.json   Traditional Chinese (OpenCC Standard) -> Taiwan Standard
#   hk2t.json   Traditional Chinese (Hong Kong variant) -> Traditional Chinese (OpenCC Standard)
#   t2hk.json   Traditional Chinese (OpenCC Standard) -> Hong Kong variant
#   t2jp.json   Traditional Chinese characters (Kyujitai) -> New Japanese Kanji (Shinjitai)
#   jp2t.json   New Japanese Kanji (Shinjitai) -> Traditional Chinese characters (Kyujitai)
#   tw2t.json   Traditional Chinese (Taiwan Standard) -> Traditional Chinese (OpenCC Standard)
#
# This notebook uses tw2s.json: Taiwan-standard Traditional -> Simplified.
converter = opencc.OpenCC('tw2s.json')

### Load datasets

In [41]:
# Read the five raw CSV files in one pass; the unpacking order matches the
# order of the paths below.
(
    df_train_tcn,
    df_train_en,
    df_test_tcn,
    df_dev_tcn,
    df_dev_en,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'product-translation-dataset/train_tcn.csv',
        'product-translation-dataset/train_en.csv',
        'product-translation-dataset/test_tcn.csv',
        'product-translation-dataset/dev_tcn.csv',
        'product-translation-dataset/dev_en.csv',
    )
)

In [42]:
# Overwrite the column labels of the test/dev frames so the title column is
# called 'product_title', the name every later cell indexes by.
title_and_split = ['product_title', 'split']
df_test_tcn.columns = list(title_and_split)
df_dev_tcn.columns = list(title_and_split)
df_dev_en.columns = title_and_split[:1]

In [43]:
# Discard every row that has a missing value in any column, for all five frames.
(
    df_train_tcn,
    df_train_en,
    df_test_tcn,
    df_dev_tcn,
    df_dev_en,
) = (
    frame.dropna(how='any')
    for frame in (df_train_tcn, df_train_en, df_test_tcn, df_dev_tcn, df_dev_en)
)

In [44]:
def _drop_garbled_titles(frame, column='product_title'):
    """Return ``frame`` without rows whose title looks like a mangled CSV record.

    A row is removed only when its ``column`` value contains a newline, a
    double quote AND a comma at the same time. (Presumably such rows are
    several CSV records fused into one field -- confirm against the raw data.)

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the title column.
    column : str, default 'product_title'
        Name of the text column to inspect.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` that do not match all three markers; the
        original index labels are kept.
    """
    titles = frame[column].str
    # Literal (non-regex) matching; na=False treats non-string cells as
    # "no match" so the boolean mask never contains NaN.
    is_garbled = (
        titles.contains('\n', regex=False, na=False)
        & titles.contains('"', regex=False, na=False)
        & titles.contains(',', regex=False, na=False)
    )
    return frame[~is_garbled]


df_train_tcn = _drop_garbled_titles(df_train_tcn)
df_train_en  = _drop_garbled_titles(df_train_en)
df_test_tcn  = _drop_garbled_titles(df_test_tcn)
df_dev_tcn   = _drop_garbled_titles(df_dev_tcn)
df_dev_en    = _drop_garbled_titles(df_dev_en)

In [45]:
def _convert_titles(frame):
    """Return ``frame`` with every ``product_title`` run through ``converter.convert``."""
    return frame.assign(product_title=frame['product_title'].map(converter.convert))


# The same OpenCC conversion is applied to all five frames, English ones included.
df_train_tcn = _convert_titles(df_train_tcn)
df_train_en  = _convert_titles(df_train_en)
df_test_tcn  = _convert_titles(df_test_tcn)
df_dev_tcn   = _convert_titles(df_dev_tcn)
df_dev_en    = _convert_titles(df_dev_en)

In [46]:
%%time
p = emoji.get_emoji_regexp()
df_train_tcn['product_title'] = [re.sub(p, r"", x) for x in df_train_tcn['product_title'].tolist()]
df_train_en['product_title']  = [re.sub(p, r"", x) for x in df_train_en['product_title'].tolist()]
df_test_tcn['product_title']  = [re.sub(p, r"", x) for x in df_test_tcn['product_title'].tolist()]
df_dev_tcn['product_title']   = [re.sub(p, r"", x) for x in df_dev_tcn['product_title'].tolist()]
df_dev_en['product_title']    = [re.sub(p, r"", x) for x in df_dev_en['product_title'].tolist()]

Wall time: 2min 42s


In [52]:
# Reduce every frame to a single-column DataFrame holding only 'product_title'.
(
    df_train_tcn,
    df_train_en,
    df_test_tcn,
    df_dev_tcn,
    df_dev_en,
) = (
    frame[['product_title']]
    for frame in (df_train_tcn, df_train_en, df_test_tcn, df_dev_tcn, df_dev_en)
)

In [53]:
# Maximum number of leading rows taken from each monolingual corpus.
MONO_MAX_ROWS = 499744

# train_test_split rejects inputs of different lengths, so both frames are cut
# to the same size. Clamping to the shorter frame keeps the split working even
# when cleaning leaves fewer than MONO_MAX_ROWS rows in one of them.
n_mono_rows = min(MONO_MAX_ROWS, len(df_train_tcn), len(df_train_en))

# 90% train / 10% validation, fixed seed for reproducibility.
df_mono_train_tcn, df_mono_valid_tcn, df_mono_train_en, df_mono_valid_en = train_test_split(
    df_train_tcn.iloc[:n_mono_rows],
    df_train_en.iloc[:n_mono_rows],
    test_size=0.1,
    random_state=42,
)

In [54]:
# The dev frames are split jointly and written out as a parallel corpus, so row
# i of one must stay paired with row i of the other. The cleaning cells drop
# rows from each frame independently while keeping the original index labels;
# restricting both frames to the labels they still share restores the pairing
# (and avoids a length-mismatch error in train_test_split).
# NOTE(review): assumes both dev CSVs were read in matching row order -- confirm.
paired_rows = df_dev_tcn.index.intersection(df_dev_en.index)

# 80% train / 20% validation, fixed seed for reproducibility.
df_para_train_tcn, df_para_valid_tcn, df_para_train_en, df_para_valid_en = train_test_split(
    df_dev_tcn.loc[paired_rows],
    df_dev_en.loc[paired_rows],
    test_size=0.2,
    random_state=42,
)

In [55]:
# Destination file -> frame for the monolingual splits; written in this order.
mono_outputs = {
    'mass/data/mono/train.zh': df_mono_train_tcn,
    'mass/data/mono/train.en': df_mono_train_en,
    'mass/data/mono/valid.zh': df_mono_valid_tcn,
    'mass/data/mono/valid.en': df_mono_valid_en,
}
# NOTE(review): to_csv applies CSV quoting to titles that contain commas or
# double quotes -- confirm the downstream reader expects that.
for out_path, split_frame in mono_outputs.items():
    split_frame.to_csv(out_path, header=False, index=False)

In [56]:
# Destination file -> frame for the parallel splits; written in this order.
para_outputs = {
    'mass/data/para/train.zh': df_para_train_tcn,
    'mass/data/para/train.en': df_para_train_en,
    'mass/data/para/valid.zh': df_para_valid_tcn,
    'mass/data/para/valid.en': df_para_valid_en,
}
# NOTE(review): to_csv applies CSV quoting to titles that contain commas or
# double quotes -- confirm the downstream reader expects that.
for out_path, split_frame in para_outputs.items():
    split_frame.to_csv(out_path, header=False, index=False)