In [1]:
import json
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [2]:
def isChinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    
    return False

def init():
    lists = []
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if isChinese(data['chinese']) == True:
                if len(data['chinese'])<6:
                    lists.append(data)
                if (len(lists)+1)%100 == 0:
                    break
    
    df = pd.DataFrame(lists)
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

init()

In [3]:
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
en_data = df.english.values.tolist()
ch_data = df.chinese.values.tolist()

In [4]:
en_vocab = set(''.join(en_data)) # 生成字典
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}
ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}
print('\n英文:\n', en2id)
print('\n中文\n:', ch2id)


英文:
 {'t': 0, 'O': 1, '…': 2, 'h': 3, 'i': 4, 'o': 5, 'C': 6, 'U': 7, 'B': 8, 'y': 9, 'z': 10, 'l': 11, 'r': 12, '）': 13, '"': 14, 'H': 15, 'P': 16, 'T': 17, 'x': 18, 'Y': 19, 'A': 20, '（': 21, '.': 22, "'": 23, 'a': 24, ';': 25, 'u': 26, 'p': 27, 'I': 28, '?': 29, 'm': 30, ')': 31, 'w': 32, 'n': 33, 'G': 34, 'f': 35, 'c': 36, 'M': 37, 'J': 38, ',': 39, 'K': 40, '8': 41, 'q': 42, '!': 43, 'E': 44, ' ': 45, '-': 46, 'W': 47, 'S': 48, 'X': 49, 'V': 50, 'b': 51, 'F': 52, 'v': 53, 'R': 54, 'L': 55, 'd': 56, 'e': 57, 'g': 58, 'j': 59, 'k': 60, ':': 61, 'N': 62, '(': 63, 's': 64, 'D': 65, '=': 66}

中文
: {'引': 0, '爱': 1, '铝': 2, '咱': 3, '晚': 4, '请': 5, '标': 6, '明': 7, '年': 8, '？': 9, '幸': 10, '老': 11, '九': 12, '茄': 13, '为': 14, '类': 15, '市': 16, '：': 17, '她': 18, '器': 19, '架': 20, '法': 21, '尼': 22, '势': 23, '梧': 24, '盐': 25, '线': 26, '洱': 27, '你': 28, '胃': 29, '紧': 30, '二': 31, '迷': 32, '动': 33, '颇': 34, '有': 35, '渍': 36, '母': 37, '祸': 38, '驱': 39, '进': 40, '地': 41, '除': 42, '印': 43, '校': 44

In [None]:
def numData(en2id,ch2id,en_data):
    en_num_data = [[en2id[en] for en in line ] for line in en_data]
    ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
    de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]
    print('char:', en_data[1])
    print('index:', en_num_data[1])
    max_encoder_seq_length = max([len(txt) for txt in en_num_data])
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max encoder length:', max_encoder_seq_length)
    print('max decoder length:', max_decoder_seq_length)
    
    encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
    decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
    decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
    
    for i in range(len(ch_num_data)):
        for t, j in enumerate(en_num_data[i]):
            encoder_input_data[i, t, j] = 1.
        for t, j in enumerate(ch_num_data[i]):
            decoder_input_data[i, t, j] = 1.
        for t, j in enumerate(de_num_data[i]):
            decoder_target_data[i, t, j] = 1.
            
    print('index data:\n', en_num_data[1])
    print('one hot data:\n', encoder_input_data[1])
    return encoder_input_data,decoder_input_data,decoder_target_data

nd = numData(en2id,ch2id,en_data)