In [78]:
import os
import string

import jieba
import matplotlib.pyplot as plt
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model, to_categorical
from keras.models import Model
from keras.layers import Input, Dense, LSTM
from keras.layers import Embedding

In [79]:
class Param:
    """Notebook-wide settings, exposed as plain class attributes.

    ``DataGenerator`` reads ``seq_length``, ``batch_size``, ``remove_word``
    and ``metadata``; ``define_model`` reads ``seq_length``.  The remaining
    fields are not referenced by the cells visible in this notebook.
    """

    # Batching / optimisation settings.
    batch_size = 32
    n_epoch = 100
    learning_rate = 0.01
    decay_steps = 1000
    decay_rate = 0.9
    grad_clip = 5

    # Network / window sizes.
    state_size = 100
    num_layers = 3
    seq_length = 20

    # Output locations.
    log_dir = './logs'
    metadata = 'metadata.tsv'

    gen_num = 500  # how many chars to generate

    # Strings stripped from every token.  DataGenerator joins this list into a
    # single string and deletes its characters one by one, so the
    # multi-character entries ('...', '......', ...) add nothing beyond '.'.
    remove_word = [
        # ASCII punctuation
        '!', '(', ')', '*', '+', ',', '-', '.',
        '...', '......', '............',
        '/', '<', '>', '?', '[', '\\', ']', '`', '~',
        # CJK / full-width punctuation and stray phonetic symbols
        '·', '…', '☆', '\u3000', '。', '〇', '《', '》',
        '〖', '〗', 'ー', 'ㄇ', 'ㄈ', 'ㄌ', 'ㄒ', 'ㄙ', '！',
        'ㄚ', 'ㄟ', 'ㄡ', '（', '）', '，', '＜', '＞', '？', '～',
    ]

In [102]:
class DataGenerator():
    """Segment a text file with jieba and serve (input, target) id windows.

    The file is read as UTF-8 and split into tokens with ``jieba.cut``.
    Every character found in ``args.remove_word`` is then deleted from every
    token.  Tokens that consisted only of such characters are kept as the
    empty string, so ``''`` is an ordinary vocabulary entry (and, being the
    smallest string, receives id 0 whenever it occurs).  Ids are assigned in
    sorted token order.

    Parameters
    ----------
    datafiles : str
        Path of the text file to load.
    args : object
        Must provide ``seq_length``, ``batch_size``, ``remove_word`` (an
        iterable of strings) and ``metadata`` (path the id/token table is
        written to during construction).

    Attributes
    ----------
    data : str
        Raw file contents.
    seg_list : list of str
        Cleaned token sequence.
    total_len : int
        Number of tokens in ``seg_list``.
    words : list of str
        Sorted distinct tokens.
    vocab_size : int
        ``len(words)``.
    char2id_dict, id2char_dict : dict
        Token -> id and id -> token lookups.
    """

    def __init__(self, datafiles, args):
        self.seq_length = args.seq_length
        self.batch_size = args.batch_size
        # str.maketrans() deletes individual characters, so the entries are
        # flattened into one string of characters to strip.
        self.remove_word = ''.join(args.remove_word)

        with open(datafiles, encoding='utf-8') as f:
            self.data = f.read()

        table = str.maketrans('', '', self.remove_word)
        self.seg_list = [w.translate(table)
                         for w in jieba.cut(self.data, cut_all=False)]

        # total data length
        self.total_len = len(self.seg_list)
        self.words = sorted(set(self.seg_list))
        print('Total length: {}'.format(self.total_len))

        # vocabulary
        self.vocab_size = len(self.words)
        print('Vocabulary Size:', self.vocab_size)

        # token <-> id lookups
        self.char2id_dict = {w: i for i, w in enumerate(self.words)}
        self.id2char_dict = {i: w for i, w in enumerate(self.words)}

        # start index of the next window handed out by next_batch()
        self._pointer = 0
        # save metadata file
        self.save_metadata(args.metadata)

    def char2id(self, c):
        """Return the integer id of token ``c`` (KeyError if unknown)."""
        return self.char2id_dict[c]

    def id2char(self, id):
        """Return the token stored under integer ``id`` (KeyError if unknown)."""
        return self.id2char_dict[id]

    @staticmethod
    def _metadata_label(token):
        """Make tab / newline / carriage return visible so the token fits on one TSV row."""
        return (token.replace('\t', '\\t')
                     .replace('\n', '\\n')
                     .replace('\r', '\\r'))

    def save_metadata(self, file):
        """Write the vocabulary to ``file`` as a two-column TSV (``id``, ``char``).

        Exactly one row is written per id.  Tokens such as ``'\\n'`` are
        written in escaped form; emitting them verbatim would split the row
        and shift every following label.
        """
        with open(file, 'w', encoding="utf-8") as f:
            f.write('id\tchar\n')
            for i in range(self.vocab_size):
                f.write('{}\t{}\n'.format(i, self._metadata_label(self.id2char(i))))

    def next_batch(self):
        """Return the next ``batch_size`` (input, target) id sequences.

        Each input is ``seq_length`` consecutive token ids and its target is
        the same window shifted one token to the right.  Consecutive windows
        start ``seq_length`` tokens apart; when a full window no longer fits
        before the end of the corpus, reading restarts at the beginning.

        Returns
        -------
        (list of list of int, list of list of int)
            ``x_batches`` and ``y_batches``, each holding ``batch_size``
            lists of length ``seq_length``.

        Raises
        ------
        ValueError
            If the corpus has fewer than ``seq_length + 1`` tokens, in which
            case no complete input/target pair exists.
        """
        # An input window plus the one extra token needed for its target.
        window = self.seq_length + 1
        if self.total_len < window:
            raise ValueError(
                'corpus has {} tokens but next_batch needs at least '
                'seq_length + 1 = {}'.format(self.total_len, window))

        x_batches = []
        y_batches = []
        for _ in range(self.batch_size):
            # Wrap only when the window would run past the end; a window
            # ending exactly on the last token is still valid.
            if self._pointer + window > self.total_len:
                self._pointer = 0

            start = self._pointer
            ids = [self.char2id(c) for c in self.seg_list[start:start + window]]
            x_batches.append(ids[:-1])
            y_batches.append(ids[1:])

            # update pointer position
            self._pointer += self.seq_length

        return x_batches, y_batches

In [None]:
def define_model(args, data):
    """Build and compile the next-word prediction network.

    Layers: ``Input(seq_length)`` -> ``Embedding(vocab_size, 50)`` ->
    ``LSTM(256)`` -> ``Dense(500, relu)`` -> ``Dense(vocab_size, softmax)``.
    The LSTM returns only its final state, so the model emits one
    distribution over the vocabulary per input sequence.

    Parameters
    ----------
    args : object
        Must provide ``seq_length`` (length of each input id sequence).
    data : object
        Must provide ``vocab_size`` (number of distinct token ids).

    Returns
    -------
    keras.models.Model
        Model compiled with Adam and ``categorical_crossentropy``; that loss
        expects one-hot targets of length ``data.vocab_size``.

    Side effects
    ------------
    Prints the model summary and writes the architecture diagram to
    ``plot.png`` via ``plot_model``.
    """
    inputs = Input(shape=(args.seq_length,))

    # No zero-masking: the generator assigns id 0 to a real token and never
    # pads its fixed-length windows, whereas mask_zero=True would reserve
    # index 0 for padding (and require input_dim = vocabulary size + 1).
    embedded = Embedding(data.vocab_size, 50)(inputs)
    encoded = LSTM(256)(embedded)

    # language model head
    hidden = Dense(500, activation='relu')(encoded)
    outputs = Dense(data.vocab_size, activation='softmax')(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    plot_model(model, show_shapes=True, to_file='plot.png')

    return model

In [103]:
lyrics = "jaychou_lyrics_traditional.txt"
# The jieba dictionary lives outside the notebook's directory.  Set the
# JIEBA_DICTFILE_PATH environment variable to point at it on other machines;
# the original local path remains the default.
JIEBA_DICTFILE_PATH = os.environ.get(
    "JIEBA_DICTFILE_PATH", "D:/Program/model/jieba/dict.txt.big.txt")
jieba.set_dictionary(JIEBA_DICTFILE_PATH)

# Constructing the generator reads the lyrics file, prints corpus statistics
# and writes the vocabulary table to args.metadata.
args = Param()
data = DataGenerator(lyrics, args)

Building prefix dict from D:\Program\model\jieba\dict.txt.big.txt ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.u5f50f2b7ab381204f5f6032f217b280e.cache
Loading model cost 1.322 seconds.
Prefix dict has been built succesfully.


Total length: 43933
Vocabulary Size: 5801


In [104]:
# Draw one batch as a smoke test; each call advances the generator's internal pointer.
data.next_batch()

(1, 5801)


([[2192,
   2889,
   3806,
   1,
   2192,
   1182,
   535,
   5633,
   1686,
   1006,
   1,
   2192,
   1182,
   535,
   4566,
   1365,
   170,
   1,
   4566,
   1365]],
 [array([ 0.,  0.,  0., ...,  0.,  0.,  0.])])

In [None]:
# Build and compile the network for the loaded data (define_model also prints
# the model summary and writes the architecture diagram to plot.png).
model = define_model(args, data)