In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
from hanziconv import HanziConv

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model, to_categorical
from keras.models import Model
from keras.layers import Input, Dense, LSTM
from keras.layers import Embedding
from keras.models import load_model

from IPython.display import Image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class Param:
    """Hyper-parameters and text-cleaning settings shared by the notebook cells.

    The values are plain class attributes, so they can be read either from the
    class itself or from an instance (``Param()``).
    """

    # --- batching / schedule -------------------------------------------------
    batch_size = 32            # samples produced by one DataGenerator.next_batch() call
    n_step = 3                 # next_batch() calls merged into one yielded training batch
    n_epoch = 50               # enters DataGenerator.max_iter
    n_batches_per_epoch = 500  # not read by the code visible in this notebook

    # --- sequence sizes ------------------------------------------------------
    seq_length = 20            # number of characters in one model input window
    lyrics_long = 360          # number of characters generate_desc() produces

    # File name intended for DataGenerator.save_metadata (id -> character table).
    metadata = 'metadata.tsv'

    # Symbols stripped from the corpus.  DataGenerator joins these into one
    # string and uses it as the deletion set of str.maketrans, so the
    # multi-character dot runs only contribute the single character '.'.
    remove_word = [
        '!', '(', ')', '*', '+', ',', '-', '.',
        '...', '......', '............',
        '/', '<', '>', '?', '[', '\\', ']', '`', '~', '·',
        '…', '☆', '\u3000', '。', '〇', '《', '》', '〖', '〗', 'ー',
        'ㄇ', 'ㄈ', 'ㄌ', 'ㄒ', 'ㄙ', '！', 'ㄚ', 'ㄟ', 'ㄡ',
        '（', '）', '，', '＜', '＞', '？', '～',
    ]

In [3]:
class DataGenerator():
    """Character-level corpus reader and training-sample generator.

    The corpus file is read as UTF-8 text and split into single characters.
    Every distinct character (after cleaning) becomes one vocabulary entry;
    ids are the positions in the sorted vocabulary.

    Cleaning note: the translation table is applied to each character
    separately, so a character listed in ``remove_word`` is not dropped from
    the stream -- it becomes the empty-string token ``''``.  Those tokens
    count towards ``total_len`` and ``''`` is part of the vocabulary.  This
    encoding is kept as is because changing it would change ``vocab_size``
    and every id, which a model trained on this encoding depends on.
    """

    def __init__(self, datafiles, args):
        """Load the corpus and build the vocabulary.

        Args:
            datafiles: path of one UTF-8 text file (a single path, despite
                the plural name).
            args: settings object exposing ``seq_length``, ``batch_size``,
                ``n_step``, ``n_epoch`` and ``remove_word``.
        """
        self.seq_length = args.seq_length
        self.batch_size = args.batch_size
        self.n_step = args.n_step

        # str.maketrans wants the characters to delete as one flat string.
        self.remove_word = ''.join(args.remove_word)

        with open(datafiles, encoding='utf-8') as f:
            text = f.read()

        # Per-character translate: removed characters turn into '' tokens
        # (see the class docstring) instead of disappearing.
        table = str.maketrans('', '', self.remove_word)
        self.data = [ch.translate(table) for ch in text]

        # total data length
        self.total_len = len(self.data)
        self.words = sorted(set(self.data))
        print('Total length: {}'.format(self.total_len))

        # vocabulary
        self.vocab_size = len(self.words)  # vocabulary size
        print('Vocabulary Size:', self.vocab_size)

        # dictionary (both directions)
        self.char2id_dict = {w: i for i, w in enumerate(self.words)}
        self.id2char_dict = {i: w for i, w in enumerate(self.words)}

        # pointer position to generate current batch
        self._pointer = 0

        # Number of next_batch() calls that make up one pass of
        # data_generator() before it starts over.
        self.max_iter = args.n_epoch * \
            (self.total_len // args.seq_length) // args.batch_size

    def char2id(self, c):
        """Return the id of token ``c``; raises ``KeyError`` if it is unknown."""
        return self.char2id_dict[c]

    def id2char(self, id):
        """Return the token stored under ``id``; raises ``KeyError`` if unknown."""
        return self.id2char_dict[id]

    def save_metadata(self, file):
        """Write the vocabulary to ``file`` as a TSV table with columns id, char."""
        with open(file, 'w', encoding="utf-8") as f:
            f.write('id\tchar\n')
            for i in range(self.vocab_size):
                c = self.id2char(i)
                f.write('{}\t{}\n'.format(i, c))

    def next_batch(self):
        """Return the next ``batch_size`` sliding-window samples.

        Each sample is a window of ``seq_length`` token ids plus the one-hot
        encoding of the token that follows the window.  The window start
        advances by one token per sample and wraps to 0 at the end of the
        corpus.

        Returns:
            (x_batches, y_batches): a list of id lists and a list of one-hot
            vectors of length ``vocab_size``.
        """
        x_batches = []
        y_batches = []
        for _ in range(self.batch_size):
            # The label sits at index pointer + seq_length, so the window is
            # valid while that index is < total_len.  (The previous test
            # used "+ 1 >=" and therefore never emitted the last window.)
            if self._pointer + self.seq_length >= self.total_len:
                self._pointer = 0

            start = self._pointer
            stop = start + self.seq_length
            window = self.data[start:stop]
            target = self.data[stop]

            # update pointer position
            self._pointer += 1

            # convert to ids / one-hot label
            x_batches.append([self.char2id(c) for c in window])
            label = to_categorical([self.char2id(target)],
                                   num_classes=self.vocab_size)[0]
            y_batches.append(label)

        return x_batches, y_batches

    def data_generator(self):
        """Endlessly yield ``(inputs, targets)`` training batches.

        One yielded batch concatenates up to ``n_step`` consecutive
        ``next_batch()`` results, i.e. up to ``n_step * batch_size`` samples.
        A tuple is yielded because that is the form Keras documents for
        generator output.

        Raises:
            ValueError: if ``max_iter`` or ``n_step`` is below 1.  The loop
                below could then never yield and would spin forever.
        """
        if self.max_iter < 1 or self.n_step < 1:
            raise ValueError(
                'data_generator needs max_iter >= 1 and n_step >= 1 '
                '(got max_iter={}, n_step={}); the corpus is probably too '
                'short for the configured seq_length/batch_size'.format(
                    self.max_iter, self.n_step))

        while True:
            # walk over the next_batch() iterations in groups of n_step
            for i in range(0, self.max_iter, self.n_step):
                XSeq, y = [], []
                for _ in range(i, min(self.max_iter, i + self.n_step)):
                    # generate input-output pairs
                    in_seq, out_word = self.next_batch()
                    XSeq.extend(in_seq)
                    y.extend(out_word)
                # yield this batch of samples to the model
                yield np.array(XSeq), np.array(y)

In [4]:
# Folder holding the lyrics corpus (absolute path on the author's machine).
PATH = "D:/Program/dataset/lyrics/jay/"
ly, ly_tra = "JayLyrics.txt", 'JayLyrics_traditional.txt'

# Only the traditional-character file is loaded; `ly` is not used in this cell.
args = Param()
data = DataGenerator(datafiles=PATH + ly_tra, args=args)

Total length: 65697
Vocabulary Size: 2445


In [5]:
# Restore the previously trained network from its .h5 file and print the
# layer table, so the output size can be compared with the vocabulary size
# reported when the corpus was loaded.
model = load_model("D:/Program/train_model/lyrics_generator/lyrics_model.h5")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 20, 50)            122250    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               314368    
_________________________________________________________________
dense_1 (Dense)              (None, 500)               128500    
_________________________________________________________________
dense_2 (Dense)              (None, 500)               250500    
_________________________________________________________________
dense_3 (Dense)              (None, 2445)              1224945   
Total params: 2,040,563
Trainable params: 2,040,563
Non-trainable params: 0
_________________________________________________________________


In [7]:
def generate_desc(model, args, data, in_text):
    """Greedily extend ``in_text`` by ``args.lyrics_long`` characters.

    At every step the current context is encoded with ``data.char2id``,
    brought to ``data.seq_length`` ids with ``pad_sequences`` (post-padding),
    passed to ``model.predict`` and the arg-max id is decoded with
    ``data.id2char``.

    Args:
        model: object with a Keras-style ``predict(x, verbose=0)``.
        args: settings object; only ``lyrics_long`` is read.
        data: vocabulary holder exposing ``char2id``, ``id2char`` and
            ``seq_length``.
        in_text: seed string; every character must be known to
            ``data.char2id``.

    Returns:
        The seed followed by the generated characters.
    """
    seed_len = len(in_text)
    context = in_text      # text window that is re-encoded on every step
    pieces = [in_text]     # seed + generated characters, joined at the end

    for step in range(args.lyrics_long):
        # integer-encode the current context and fit it to the input length
        ids = [data.char2id(ch) for ch in context]
        padded = pad_sequences([ids], maxlen=data.seq_length, padding='post')

        # pick the most probable next character
        probs = model.predict(padded, verbose=0)
        next_char = data.id2char(np.argmax(probs))
        pieces.append(next_char)

        # While the context is still being filled it only grows; afterwards
        # it slides: drop the oldest character before appending the new one.
        if step >= data.seq_length - seed_len:
            context = context[1:]
        context += next_char

    return ''.join(pieces)

In [8]:
# Seed text for the first sample; generate_desc appends args.lyrics_long
# characters to it.
in_text = '你要離開我知道 今天昨天 一起來玩 半獸人'

gen = generate_desc(model=model, args=args, data=data, in_text=in_text)
print(gen)

你要離開我知道 今天昨天 一起來玩 半獸人 裝撞 
天涯颱上學的臉 太多人習慣路 我不會送雙過的姑裝
我是一個人事
雨過之後更難忘記
忘記我還愛你
你不用在意
流淚也隻是剛好閤意
我早已經待在榖底
我知道不能再留住你
也知道不能沒有孤寂
感激你讓我擁有缺點的美麗
看著那白色的蜻蜓
在空中忘瞭前進
還能不能 重新編織
腦海中起毛球的記憶
再說我愛你
可能雨也不會停
黑色毛衣
藏在音樂頻道
離開就讓你道的熱情
我的認真敗的消息
你會開始學其他同學
在書包寫東寫西
但我建議最好寫媽媽
我會用功讀書
用功讀書 怎麼會從我嘴巴說齣
不想你輸 所以要叫你用功讀書
媽媽織給你的毛衣 你要好好的收著
因為母親節到的時候我要告訴她我還留著
對瞭我會遇到瞭周潤發
所以你可以跟同學炫耀
賭神未來是你爸爸
我找不到 童年寫的情書
你寫完不要送人
因為過兩天你還是會把你們當


In [10]:
# Second sample with a different seed, using the same model and settings.
in_text = '讓我們 半獸人 的靈魂翻滾 收起殘忍'

gen = generate_desc(model=model, args=args, data=data, in_text=in_text)
print(gen)

讓我們 半獸人 的靈魂翻滾 收起殘忍 迴憶獸化的道
而我緊綳的外錶像上緊後的發條
等她的答案揭曉
她的睫毛彎的嘴角
無預警地對我笑
沒有預兆齣乎預料
竟它在灌木地 誰在閣樓上
冰冷的絕望
雨輕輕彈
硃紅色的窗
我一生在紙上
被風吹亂
夢在遠方
化成一縷香
隨風飄散你的模樣
菊花殘 滿地傷
你的笑容勉強不來
愛深埋珊瑚鳩
引下一整晚
你撐把小紙傘 如此溫熱親他
動作輕盈地圍繞
愛的甜味蔓延發酵
曖昧 愛你不捨
傻傻的城中
吵著吃糖
這故事一開始的鏡頭灰塵就已經遮蔽瞭陽光
呀 恐懼刻下瞭一個
你微笑瀏覽手機裏的浪漫
原來愛情可以來得這麼突然
短信的橋梁
怕你為你不需要我
所以你看去的事都有你
為你彈奏蕭邦的夜麯
紀念我死去的愛情
隨著北風
微微的笑 赤足又扭腰
朝著命運鑿齣的風
隻享受到嘴角
微微上翹
性感地無可救藥
想象不到 如此心跳
你的一切
