In [1]:
!pip3 install -q opencc emoji sklearn regex

In [2]:
import subprocess, gc, emoji, re, regex

import opencc

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# OpenCC conversion configurations, for reference:
#   s2t.json    Simplified Chinese -> Traditional Chinese
#   t2s.json    Traditional Chinese -> Simplified Chinese
#   s2tw.json   Simplified Chinese -> Traditional Chinese (Taiwan Standard)
#   tw2s.json   Traditional Chinese (Taiwan Standard) -> Simplified Chinese
#   s2hk.json   Simplified Chinese -> Traditional Chinese (Hong Kong variant)
#   hk2s.json   Traditional Chinese (Hong Kong variant) -> Simplified Chinese
#   s2twp.json  Simplified Chinese -> Traditional Chinese (Taiwan Standard) with Taiwanese idiom
#   tw2sp.json  Traditional Chinese (Taiwan Standard) -> Simplified Chinese with Mainland Chinese idiom
#   t2tw.json   Traditional Chinese (OpenCC Standard) -> Taiwan Standard
#   hk2t.json   Traditional Chinese (Hong Kong variant) -> Traditional Chinese (OpenCC Standard)
#   t2hk.json   Traditional Chinese (OpenCC Standard) -> Hong Kong variant
#   t2jp.json   Traditional Chinese characters (Kyūjitai) -> New Japanese Kanji (Shinjitai)
#   jp2t.json   New Japanese Kanji (Shinjitai) -> Traditional Chinese characters (Kyūjitai)
#   tw2t.json   Traditional Chinese (Taiwan Standard) -> Traditional Chinese (OpenCC Standard)
#
# Only the three "-> Simplified" converters below are used by this notebook.
converter_tw2s, converter_t2s, converter_hk2s = (
    opencc.OpenCC(config_name)
    for config_name in ('tw2s.json', 't2s.json', 'hk2s.json')
)

### Load and clean up datasets

In [30]:
# Pattern used to strip symbols from titles: \p{So} is the Unicode
# "Symbol, other" general category. The third-party `regex` module is used
# because the standard-library `re` does not support \p{...} property classes.
# Earlier approach, kept for reference: p = emoji.get_emoji_regexp()
p = regex.compile(r'\p{So}')

def load(filename):
    """Read a CSV file and replace every missing cell with an empty string.

    Args:
        filename: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame with all NaN cells filled with ``''``.
    """
    raw = pd.read_csv(filename)
    return raw.fillna(value='')

def preprocess(df_, apply_chinese_simplification=True):
    """Normalise the first column of ``df_`` into a clean ``product_title`` column.

    Steps, in order:
      1. Keep only the first column and rename it to ``product_title``.
      2. Replace missing cells with ``''`` and coerce every value to ``str``.
      3. Replace newlines, double quotes and commas with a single space.
      4. Replace every match of the module-level pattern ``p`` with a space.
      5. Optionally run the text through the tw2s, t2s and hk2s OpenCC
         converters, in that order.

    Args:
        df_: Input DataFrame; only its first column is read. It is not modified.
        apply_chinese_simplification: When true, apply the three OpenCC
            converters to every title.

    Returns:
        A new single-column DataFrame (``product_title``) carrying the same
        index as ``df_``.
    """
    # Work on a copy of the first column so the caller's frame is untouched.
    df = df_[[df_.columns[0]]].copy()
    df.columns = ['product_title']

    # Callers may pass frames that did not go through load(); a NaN or numeric
    # cell would otherwise make the regex substitution below raise TypeError.
    titles = df['product_title'].fillna('').astype(str)

    # Literal (non-regex) replacement, stated explicitly because the default
    # of `regex` differs between pandas versions.
    # Presumably these characters are removed to keep the exported
    # one-column files simple to parse — TODO confirm.
    for separator in ('\n', '"', ','):
        titles = titles.str.replace(separator, ' ', regex=False)

    titles = titles.map(lambda text: p.sub(" ", text))

    if apply_chinese_simplification:
        for converter in (converter_tw2s, converter_t2s, converter_hk2s):
            titles = titles.map(converter.convert)

    df['product_title'] = titles
    return df

def load_and_preprocess(filename, apply_chinese_simplification=True):
    """Announce, load and preprocess one CSV file.

    Args:
        filename: Path of the CSV file to read with ``load``.
        apply_chinese_simplification: Forwarded to ``preprocess``.

    Returns:
        The single-column ``product_title`` DataFrame produced by ``preprocess``.
    """
    progress_message = "loading and processing {}...".format(filename)
    print(progress_message)
    raw_frame = load(filename)
    return preprocess(raw_frame, apply_chinese_simplification)

In [31]:
%%time
filenames = ['product-translation-dataset/train_tcn.csv',
             'product-translation-dataset/train_en.csv',
             'product-translation-dataset/test_tcn.csv',
             'product-translation-dataset/translations/test_en.csv',
             'product-translation-dataset/dev_tcn.csv',
             'product-translation-dataset/dev_en.csv']
df_train_tcn, df_train_en, df_test_tcn, df_test_en, df_dev_tcn, df_dev_en = [load_and_preprocess(f, True) for f in filenames]
# df_train_tcn, df_train_en, df_test_tcn, df_dev_tcn, df_dev_en = [load_and_preprocess(f, True) for f in filenames]
# df_train_tcn, df_train_en, df_test_tcn, df_dev_tcn, df_dev_en = map(load_and_preprocess, filenames)
# df_dev_tcn, df_dev_en = map(load_and_preprocess, filenames)

loading and processing product-translation-dataset/train_tcn.csv...
loading and processing product-translation-dataset/train_en.csv...
loading and processing product-translation-dataset/test_tcn.csv...
loading and processing product-translation-dataset/translations/test_en.csv...
loading and processing product-translation-dataset/dev_tcn.csv...
loading and processing product-translation-dataset/dev_en.csv...
Wall time: 2min


In [32]:
# Translation pair files: each has the source text in `product_title` and the
# translation in `translated_output`. They are read through load() like every
# other dataset, so missing cells become '' instead of NaN before preprocessing.
df_tcn2en = load('product-translation-dataset/translations/train_tcn2en.csv')
df_en2tcn = load('product-translation-dataset/translations/train_en2tcn.csv')

# tcn2en: `product_title` is the Chinese side, `translated_output` the English side.
# preprocess() copies the column it is given, so no explicit .copy() is needed.
df_tcn2en_tcn = preprocess(df_tcn2en[['product_title']],     True)
df_tcn2en_en  = preprocess(df_tcn2en[['translated_output']], True)

# en2tcn: the roles are swapped — `translated_output` is the Chinese side.
df_en2tcn_tcn = preprocess(df_en2tcn[['translated_output']], True)
df_en2tcn_en  = preprocess(df_en2tcn[['product_title']],     True)

### Filter out titles shorter than two characters

In [33]:
# Monolingual training sets: each side is filtered independently, keeping only
# titles with at least two characters.
df_train_tcn = df_train_tcn.loc[df_train_tcn['product_title'].str.len().ge(2)]
df_train_en = df_train_en.loc[df_train_en['product_title'].str.len().ge(2)]

In [34]:
# Dev pairs: a row survives only when BOTH sides have at least two characters,
# and the same mask is applied to both frames so they stay row-aligned.
dev_idx = df_dev_tcn['product_title'].str.len().ge(2) & df_dev_en['product_title'].str.len().ge(2)
df_dev_tcn, df_dev_en = df_dev_tcn.loc[dev_idx], df_dev_en.loc[dev_idx]

In [35]:
# Test pairs: a row survives only when BOTH sides have at least two characters,
# and the same mask is applied to both frames so they stay row-aligned.
test_idx = df_test_tcn['product_title'].str.len().ge(2) & df_test_en['product_title'].str.len().ge(2)
df_test_tcn, df_test_en = df_test_tcn.loc[test_idx], df_test_en.loc[test_idx]

In [36]:
# tcn2en pairs: a row survives only when BOTH sides have at least two
# characters, and the same mask is applied to both frames.
tcn2en_idx = df_tcn2en_tcn['product_title'].str.len().ge(2) & df_tcn2en_en['product_title'].str.len().ge(2)
df_tcn2en_tcn, df_tcn2en_en = df_tcn2en_tcn.loc[tcn2en_idx], df_tcn2en_en.loc[tcn2en_idx]

In [37]:
# en2tcn pairs: a row survives only when BOTH sides have at least two
# characters, and the same mask is applied to both frames.
en2tcn_idx = df_en2tcn_tcn['product_title'].str.len().ge(2) & df_en2tcn_en['product_title'].str.len().ge(2)
df_en2tcn_tcn, df_en2tcn_en = df_en2tcn_tcn.loc[en2tcn_idx], df_en2tcn_en.loc[en2tcn_idx]

### Concatenate datasets

In [38]:
# The monolingual corpora are the filtered training sets themselves
# (plain aliases of the same objects, not copies).
df_mono_zh_all, df_mono_en_all = df_train_tcn, df_train_en
print("MONO [ZH,EN] ALL:", *map(len, (df_mono_zh_all, df_mono_en_all)))

MONO [ZH,EN] ALL: 499816 499992


In [39]:
# Stack all parallel sources into one frame per language. Both lists use the
# same source order, so row i of the zh frame pairs with row i of the en frame.
# ignore_index=True renumbers the rows 0..n-1.
para_zh_sources = [df_dev_tcn, df_test_tcn, df_tcn2en_tcn, df_en2tcn_tcn]
para_en_sources = [df_dev_en, df_test_en, df_tcn2en_en, df_en2tcn_en]
df_para_zh_all = pd.concat(para_zh_sources, ignore_index=True)
df_para_en_all = pd.concat(para_en_sources, ignore_index=True)
print("PARA [ZH,EN] ALL:", *map(len, (df_para_zh_all, df_para_en_all)))

PARA [ZH,EN] ALL: 30995 30995


In [40]:
# Everything per language: monolingual rows followed by parallel rows,
# renumbered 0..n-1.
df_all_zh_all = pd.concat([df_mono_zh_all, df_para_zh_all], ignore_index=True)
df_all_en_all = pd.concat([df_mono_en_all, df_para_en_all], ignore_index=True)
print("ALL [ZH,EN] ALL:", *map(len, (df_all_zh_all, df_all_en_all)))

ALL [ZH,EN] ALL: 530811 530987


### Split datasets

In [41]:
def train_valid_test_split(df_all, alpha=0.04, beta=0.5, random_state=0):
    """Shuffle ``df_all`` and cut it into train, validation and test parts.

    Args:
        df_all: DataFrame to split.
        alpha: ``test_size`` of the first split, i.e. the share of rows held
            out of the training part.
        beta: ``test_size`` of the second split, i.e. the share of the
            held-out rows that goes to the test part (the rest is validation).
        random_state: Seed used for the shuffle and for both splits.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames. The three sizes are
        also printed.
    """
    shuffled = df_all.sample(frac=1.0, random_state=random_state)
    train_part, held_out = train_test_split(shuffled, test_size=alpha, random_state=random_state)
    valid_part, test_part = train_test_split(held_out, test_size=beta, random_state=random_state)
    part_sizes = (len(train_part), len(valid_part), len(test_part))
    print("[TRAIN,VALID,TEST] : {:>6} {:>6} {:>6}".format(*part_sizes))
    return train_part, valid_part, test_part

In [42]:
# Monolingual corpora use the default hold-out share.
df_mono_zh_train, df_mono_zh_valid, df_mono_zh_test = train_valid_test_split(df_mono_zh_all)
df_mono_en_train, df_mono_en_valid, df_mono_en_test = train_valid_test_split(df_mono_en_all)
# Parallel corpora hold out 10% (alpha=0.1).
df_para_zh_train, df_para_zh_valid, df_para_zh_test = train_valid_test_split(df_para_zh_all, 0.1)
df_para_en_train, df_para_en_valid, df_para_en_test = train_valid_test_split(df_para_en_all, 0.1)

# The zh and en sides are split by two separate calls; they only stay paired
# because both frames have the same length and the same seed is used. Fail
# loudly if that ever stops holding rather than silently writing misaligned pairs.
assert all(
    zh_part.index.equals(en_part.index)
    for zh_part, en_part in (
        (df_para_zh_train, df_para_en_train),
        (df_para_zh_valid, df_para_en_valid),
        (df_para_zh_test, df_para_en_test),
    )
), "parallel zh/en splits are no longer row-aligned"

[TRAIN,VALID,TEST] : 479823   9996   9997
[TRAIN,VALID,TEST] : 479992  10000  10000
[TRAIN,VALID,TEST] :  27895   1550   1550
[TRAIN,VALID,TEST] :  27895   1550   1550


In [43]:
# Spot-check the last rows of the English parallel training split; the index
# labels are expected to match those of the Chinese parallel training split.
df_para_en_train.tail()

Unnamed: 0,product_title
3755,Horizontal stripes grape mt and paper tape
25817,Ddult cotton spandex big size terno tokong for...
30127,Live check out for maam chanti
17763,S-LV bag (Python-style fashion shoulder slanti...
1418,Free shipping/thin section plus size plus size...


In [44]:
# Spot-check the last rows of the Chinese parallel training split; the index
# labels are expected to match those of the English parallel training split.
df_para_zh_train.tail()

Unnamed: 0,product_title
3755,横纹・葡萄 mt和纸胶带
25817,女士Ddult棉氨纶大号Terno Tokong
30127,现场检查Maam Chanti
17763,S-LV 包包（蟒蛇纹时尚手提单肩斜跨包）「认明Yuanroro优质
1418,全馆免运/薄款加大码女宽松破洞牛仔短裤 YX041729


### Save preprocessed datasets

In [45]:
# Full monolingual corpora, written without header row or index column.
for frame, path in ((df_mono_zh_all, 'mass/all.zh'),
                    (df_mono_en_all, 'mass/all.en')):
    frame.to_csv(path, header=False, index=False)

In [46]:
# Monolingual train/valid/test splits, written without header row or index column.
mass_mono_outputs = (
    (df_mono_zh_train, 'mass/mono/train.zh'),
    (df_mono_en_train, 'mass/mono/train.en'),
    (df_mono_zh_valid, 'mass/mono/valid.zh'),
    (df_mono_en_valid, 'mass/mono/valid.en'),
    (df_mono_zh_test,  'mass/mono/test.zh'),
    (df_mono_en_test,  'mass/mono/test.en'),
)
for frame, path in mass_mono_outputs:
    frame.to_csv(path, header=False, index=False)

In [47]:
# Parallel train/valid/test splits, written without header row or index column.
mass_para_outputs = (
    (df_para_zh_train, 'mass/para/train.zh'),
    (df_para_en_train, 'mass/para/train.en'),
    (df_para_zh_valid, 'mass/para/valid.zh'),
    (df_para_en_valid, 'mass/para/valid.en'),
    (df_para_zh_test,  'mass/para/test.zh'),
    (df_para_en_test,  'mass/para/test.en'),
)
for frame, path in mass_para_outputs:
    frame.to_csv(path, header=False, index=False)

In [48]:
# Sanity check: the earlier filters kept only titles with at least two
# characters, so each of the four training splits should report 0 titles
# shorter than that (this also catches empty titles, not just length 1).
print((df_mono_zh_train['product_title'].str.len() < 2).sum())
print((df_mono_en_train['product_title'].str.len() < 2).sum())
print((df_para_zh_train['product_title'].str.len() < 2).sum())
print((df_para_en_train['product_title'].str.len() < 2).sum())

0
0
0
0


In [11]:
# Cleaned frames exported as CSV with a header row and no index column.
# The parallel training split is not part of this export.
clean_split_outputs = (
    (df_mono_zh_all,   "product-translation-dataset/clean/mono_zh_all.csv"),
    (df_mono_en_all,   "product-translation-dataset/clean/mono_en_all.csv"),
    (df_mono_zh_train, "product-translation-dataset/clean/mono_zh_train.csv"),
    (df_mono_en_train, "product-translation-dataset/clean/mono_en_train.csv"),
    (df_mono_zh_valid, "product-translation-dataset/clean/mono_zh_valid.csv"),
    (df_mono_en_valid, "product-translation-dataset/clean/mono_en_valid.csv"),
    (df_mono_zh_test,  "product-translation-dataset/clean/mono_zh_test.csv"),
    (df_mono_en_test,  "product-translation-dataset/clean/mono_en_test.csv"),
    (df_para_zh_valid, "product-translation-dataset/clean/para_zh_valid.csv"),
    (df_para_en_valid, "product-translation-dataset/clean/para_en_valid.csv"),
    (df_para_zh_test,  "product-translation-dataset/clean/para_zh_test.csv"),
    (df_para_en_test,  "product-translation-dataset/clean/para_en_test.csv"),
)
for frame, path in clean_split_outputs:
    frame.to_csv(path, index=False)

In [12]:
# One frame holding every cleaned title, renumbered 0..n-1.
# NOTE(review): df_mono_*_all is stacked together with its own
# train/valid/test splits, so every monolingual row appears twice, and the
# parallel training split is absent — confirm this is intended.
all_clean_parts = [
    df_mono_zh_all,   df_mono_en_all,
    df_mono_zh_train, df_mono_en_train,
    df_mono_zh_valid, df_mono_en_valid,
    df_mono_zh_test,  df_mono_en_test,
    df_para_zh_valid, df_para_en_valid,
    df_para_zh_test,  df_para_en_test,
]
df_all_clean = pd.concat(all_clean_parts, ignore_index=True)

In [13]:
# Combined frame plus the cleaned source sets, written with a header row and
# no index column. df_test_en is not exported here.
clean_source_outputs = (
    (df_all_clean, "product-translation-dataset/clean/all_clean.csv"),
    (df_train_tcn, "product-translation-dataset/clean/train_zh_clean.csv"),
    (df_train_en,  "product-translation-dataset/clean/train_en_clean.csv"),
    (df_test_tcn,  "product-translation-dataset/clean/test_zh_clean.csv"),
    (df_dev_tcn,   "product-translation-dataset/clean/dev_zh_clean.csv"),
    (df_dev_en,    "product-translation-dataset/clean/dev_en_clean.csv"),
)
for frame, path in clean_source_outputs:
    frame.to_csv(path, index=False)

### Chunk train datasets

In [14]:
def partition_dataset(df, lines_per_part=15000, random_state=None):
    """Shuffle ``df`` and cut it into consecutive chunks of a fixed size.

    Args:
        df: DataFrame to partition. It is not modified.
        lines_per_part: Maximum number of rows per chunk; must be at least 1.
            Every chunk has exactly this many rows except possibly the last.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Leave
            as ``None`` for a different shuffle on every call; pass a value
            to get the same chunks on every run.

    Returns:
        List of DataFrames, in chunk order, that together contain every row
        of ``df`` exactly once.

    Raises:
        ValueError: If ``lines_per_part`` is smaller than 1.
    """
    lines_per_part = int(lines_per_part)
    if lines_per_part < 1:
        raise ValueError("lines_per_part must be at least 1")

    shuffled = df.sample(frac=1.0, random_state=random_state)
    # Positional slicing gives the same chunks as grouping by row_number // size.
    return [
        shuffled.iloc[start:start + lines_per_part]
        for start in range(0, len(shuffled), lines_per_part)
    ]

In [15]:
# Cut both training sets into shuffled chunks of at most 10000 rows each
# (Chinese first, then English).
df_tcn_chunks, df_en_chunks = (
    partition_dataset(frame, 10000)
    for frame in (df_train_tcn, df_train_en)
)

In [16]:
# Write each Chinese chunk to its own file; file numbering starts at 1.
tcn_chunk_path = "product-translation-dataset/train_tcn_chunks/part_{}.csv"
for i, df in enumerate(df_tcn_chunks):
    df.to_csv(tcn_chunk_path.format(i + 1), index=False)

In [17]:
# Write each English chunk to its own file; file numbering starts at 1.
en_chunk_path = "product-translation-dataset/train_en_chunks/part_{}.csv"
for i, df in enumerate(df_en_chunks):
    df.to_csv(en_chunk_path.format(i + 1), index=False)

In [24]:
# Read the first chunk of each language back from disk to inspect its size.
# The chunking shuffle is unseeded, so the contents of part_1 differ between runs.
df_tcn_chunk_1 = pd.read_csv('product-translation-dataset/train_tcn_chunks/part_1.csv')
df_en_chunk_1 = pd.read_csv('product-translation-dataset/train_en_chunks/part_1.csv')

In [32]:
# Total number of characters across all titles in the first Chinese chunk.
int(df_tcn_chunk_1['product_title'].str.len().sum())

324372

In [33]:
# Total number of characters across all titles in the first English chunk.
int(df_en_chunk_1['product_title'].str.len().sum())

370912

In [None]:
# Combined character count of the first Chinese and English chunks, computed
# from the loaded frames. Hand-pasted totals go stale whenever the chunks are
# regenerated, because the chunking shuffle changes from run to run.
(int(df_tcn_chunk_1['product_title'].str.len().sum())
 + int(df_en_chunk_1['product_title'].str.len().sum()))