# 가사 생성 RNN 모델

## 데이터 읽어오기

In [1]:
from collections import defaultdict
from collections import Counter
import glob
import os
import re
import tensorflow as tf
import pickle

from sklearn.model_selection import train_test_split


txt_file_path = 'lyricist/data/lyrics/*'
txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ['[Hook]', "I've been down so long, it look like up to me", 'They look up to me']


## 데이터 정제하기

- 소설과 달리 가사는 1절, 2절이 비슷한 경우가 많다. 따라서 중복 문장을 처리해줘야 한다.

### 특수 문자 확인

In [2]:
special_word = defaultdict(int)
alpha = [chr(x) for x in range(ord('a'), ord('z')+1)] + ["'", ",", ".", "!", "?", " "] + [str(i) for i in range(10)]

for raw in raw_corpus:
    raw = raw.lower()
    for r in raw:
        if r not in alpha:
            special_word[r] += 1

Counter.most_common(special_word)

[('(', 11202),
 (')', 11199),
 ('-', 8962),
 ('"', 2663),
 ('’', 2642),
 ('\t', 1708),
 ('[', 1606),
 (']', 1604),
 (';', 1110),
 (':', 712),
 ('*', 593),
 ('—', 413),
 ('|', 165),
 ('“', 159),
 ('”', 158),
 ('é', 88),
 ('í', 83),
 ('&', 77),
 ('á', 70),
 ('+', 67),
 ('‚', 55),
 ('_', 54),
 ('/', 52),
 ('`', 51),
 ('ä', 46),
 ('{', 43),
 ('}', 41),
 ('ú', 36),
 ('ó', 35),
 ('à', 35),
 ('¿', 28),
 ('è', 24),
 ('ß', 23),
 ('ð', 22),
 ('â', 20),
 ('¬', 19),
 ('ñ', 18),
 ('ù', 17),
 ('‘', 15),
 ('√', 14),
 ('ö', 14),
 ('ã', 12),
 ('ü', 11),
 ('$', 11),
 ('©', 10),
 ('ë', 8),
 ('ç', 7),
 ('#', 6),
 ('ê', 6),
 ('∑', 6),
 ('¨', 6),
 ('¡', 5),
 ('\ufeff', 5),
 ('–', 4),
 ('±', 4),
 ('>', 3),
 ('ì', 3),
 ('å', 3),
 ('´', 2),
 ('…', 2),
 ('õ', 2),
 ('%', 2),
 ('=', 2),
 ('ª', 1),
 ('@', 1),
 ('′', 1),
 ('″', 1),
 ('\xad', 1),
 ('ô', 1),
 ('«', 1),
 ('»', 1),
 ('≠', 1),
 ('ƒ', 1),
 ('¶', 1),
 ('≥', 1),
 ('þ', 1),
 ('ò', 1),
 ('¢', 1),
 ('†', 1),
 ('∆', 1),
 ('•', 1)]

### 정제 및 중복 문장 제거

메타문자: ^$.*+?=!:|\/()[]{}

In [3]:
corpus = set()
for raw in raw_corpus:
    raw = raw.lower().strip()
    raw = raw.replace('f***', 'fuck')
    raw = re.sub(r"(\[|\()(\w+)(\]|\))", r"", raw)
    raw = re.sub(r"\t\[\]\(\)-;", r" ", raw)
    raw = raw.replace('\ufeff', '')
    raw = re.sub(r'"‘“”\`', r"'", raw)
    raw = re.sub(r"([\?.!,'])", r" \1 ", raw)
    raw = re.sub(r'[" "]+', " ", raw)
    if len(set(list(raw)) - set(alpha)) > 0:
        continue
    
    if len(set(raw)) < 4:
        continue
    
    raw = raw.strip()
    raw = '<start> ' + raw + ' <end>'
    
    if len(raw.split()) > 16:
        continue
            
    corpus.add(raw)

corpus = list(corpus)

In [4]:
corpus[:10]

['<start> i know i got the green light <end>',
 '<start> i may have been only three <end>',
 '<start> ben franklin dizzy nigga <end>',
 '<start> i can show someone else <end>',
 '<start> and the darkness is lighter now <end>',
 '<start> ha , and i better come split it <end>',
 "<start> see me when i drop when i won ' t flop <end>",
 '<start> yeah im eatin but i got a tapeworm in my tummy oh <end>',
 '<start> my enemies want to be friends with my other enemies <end>',
 '<start> i looked at my haggard face in the bathroom light <end>']

In [5]:
len(corpus)

91611

### 사용된 단어 분석

In [6]:
words = defaultdict(int)
for c in corpus:
    for word in c.split():
        words[word] += 1

#### 총 단어 개수

In [7]:
len(words)

24394

#### 가장 많이 사용된 단어

In [8]:
from collections import Counter

Counter(words).most_common(100)

[('<start>', 91611),
 ('<end>', 91611),
 ("'", 28906),
 (',', 27425),
 ('i', 27380),
 ('the', 25191),
 ('you', 20323),
 ('and', 15138),
 ('a', 13050),
 ('to', 12964),
 ('it', 9613),
 ('me', 9305),
 ('my', 9055),
 ('in', 8244),
 ('that', 7079),
 ('s', 6390),
 ('of', 6053),
 ('t', 6036),
 ('.', 5939),
 ('on', 5699),
 ('your', 5546),
 ('we', 4951),
 ('like', 4677),
 ('is', 4565),
 ('all', 4524),
 ('m', 4006),
 ('be', 3911),
 ('for', 3902),
 ('with', 3857),
 ('so', 3855),
 ('but', 3702),
 ('up', 3581),
 ('just', 3421),
 ('can', 3342),
 ('know', 3292),
 ('they', 3239),
 ('this', 3205),
 ('got', 3114),
 ('she', 2991),
 ('when', 2956),
 ('love', 2940),
 ('what', 2887),
 ('no', 2846),
 ('?', 2844),
 ('get', 2718),
 ('he', 2691),
 ('was', 2690),
 ('don', 2688),
 ('do', 2585),
 ('now', 2421),
 ('if', 2384),
 ('out', 2251),
 ('baby', 2153),
 ('oh', 2049),
 ('go', 2045),
 ('re', 2038),
 ('her', 1985),
 ('!', 1977),
 ('down', 1962),
 ('there', 1952),
 ('one', 1894),
 ('yeah', 1891),
 ('see', 1831),

## 가사 생성 객체 정의

corpus와 Keras Model을 받아, 모델을 훈련하고, 결과를 반환하는 객체 정의

In [9]:
class GenerateLyrics():
    def __init__(self, corpus, Model):
        self.corpus = corpus
        self.tensor, self.tokenizer = self.tokenize(self.corpus)
        self.x, self.y = self.tensor[:, :-1], self.tensor[:, 1:]
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(self.x, self.y, test_size=0.2, random_state=42)
        self.BUFFER_SIZE = len(self.x)
        self.BATCH_SIZE = 256
        self.dataset, self.test_dataset = self.make_dataset(self.train_x, self.test_x, self.train_y, self.test_y)
        
        self.embedding_size = 256
        self.hidden_size = 1024
        self.model = Model(self.tokenizer.num_words + 1, self.embedding_size , self.hidden_size)
        
    def tokenize(self, corpus):
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=1000, 
        filters=' ',
        oov_token="<unk>"
        )
        tokenizer.fit_on_texts(corpus)
        tensor = tokenizer.texts_to_sequences(corpus)   
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')  

        return tensor, tokenizer
    
    def make_dataset(self, train_x, test_x, train_y, test_y):
        steps_per_epoch = self.BUFFER_SIZE // self.BATCH_SIZE

        VOCAB_SIZE = self.tokenizer.num_words + 1   

        dataset = tf.data.Dataset.from_tensor_slices((self.train_x, self.train_y))
        dataset = dataset.shuffle(self.BUFFER_SIZE)
        dataset = dataset.batch(self.BATCH_SIZE, drop_remainder=True)

        test_dataset = tf.data.Dataset.from_tensor_slices((self.test_x, self.test_y))
        test_dataset = test_dataset.shuffle(self.BUFFER_SIZE)
        test_dataset = test_dataset.batch(self.BATCH_SIZE, drop_remainder=True)
        return dataset, test_dataset
    
    
    def fit(self, epoch=5):
        optimizer = tf.keras.optimizers.Adam()
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction='none'
        )

        self.model.compile(loss=loss, optimizer=optimizer)
        hist = self.model.fit(self.dataset, epochs=epoch, validation_data=self.test_dataset)
        return hist
        
        
    def generate_text(self, init_sentence="<start>", max_len=20):
        test_input = self.tokenizer.texts_to_sequences([init_sentence])
        test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
        end_token = self.tokenizer.word_index["<end>"]

        while True:
            predict = self.model(test_tensor) 
            predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
            test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
            if predict_word.numpy()[0] == end_token:
                break
            if test_tensor.shape[1] >= max_len:
                break

        generated = ""
        for word_index in test_tensor[0].numpy():
            generated += self.tokenizer.index_word[word_index] + " "

        return generated

## 모델 정의

In [10]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True, recurrent_dropout=0.5)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True, recurrent_dropout=0.5)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

### 그리드 서치

In [11]:
embedding_sizes = [64,128,256,512]
hidden_sizes = [256,512,1024,2408]

In [12]:
history = []
for e_size in embedding_sizes:
    for h_size in hidden_sizes:
        print('임베딩 사이즈: ', e_size, 'hidden size: ', h_size)
        model = GenerateLyrics(corpus, TextGenerator)
        model.embedding_size = e_size
        model.hidden_size = h_size
        hist = model.fit(10)
        result = model.generate_text(init_sentence="<start> i love", max_len=15)
        history.append((e_size, h_size, hist.history['loss'], hist.history['val_loss'], result))

임베딩 사이즈:  64 hidden size:  256
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  64 hidden size:  512
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  64 hidden size:  1024
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  64 hidden size:  2408
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  128 hidden size:  256
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  128 hidden size:  512
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
임베딩 사이즈:  128 hidden size:  1024
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
file_name = "hist.pkl"
open_file = open(file_name, "wb")
pickle.dump(history, open_file)
open_file.close()

In [14]:
file_name = "hist.pkl"

open_file = open(file_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

In [19]:
hist2 = [(x[0], x[1], x[2][-1], x[3][-1], x[4]) for x in history]
hist2.sort(key=lambda x: x[3])
hist2

[(512,
  256,
  1.9578521251678467,
  2.0916693210601807,
  '<start> i love you so much <end> '),
 (128,
  1024,
  1.9552738666534424,
  2.0917131900787354,
  '<start> i love you so much <end> '),
 (512,
  512,
  1.9445419311523438,
  2.0930423736572266,
  '<start> i love you , i love you <end> '),
 (128,
  2408,
  1.952721357345581,
  2.09371018409729,
  '<start> i love you , i love you <end> '),
 (256,
  1024,
  1.9537256956100464,
  2.0939223766326904,
  '<start> i love you so much , i love you <end> '),
 (64,
  2408,
  1.9582383632659912,
  2.094468593597412,
  "<start> i love you , baby , i ' m <unk> ' <end> "),
 (64,
  256,
  1.955055832862854,
  2.0946004390716553,
  '<start> i love you so much <end> '),
 (256,
  2408,
  1.9518351554870605,
  2.0948731899261475,
  '<start> i love you , i love you <end> '),
 (128,
  256,
  1.9495058059692383,
  2.0952036380767822,
  '<start> i love you , i love you , i love you <end> '),
 (256,
  512,
  1.9534275531768799,
  2.0955021381378174,
 

임베딩 사이즈: 512, hidden size: 256일 때, 가장 낮은 Validation loss 2.092를 기록했습니다.

## 정리
1. 정규표현식을 활용하여 특수문자를 제거했습니다.
- 중복 문장을 제거했습니다. 
- 피처링을 표시하는 \[가수명\] 문자그룹을 제거했습니다.
- 기타 특수문자를 제거했습니다.
2. 가장 낮은 validation loss는 임베딩 사이즈: 512, hidden size: 256일 때, 가장 낮은 Validation loss 2.092를 기록했습니다.
3. 텍스트 제너레이션 결과는 위 항목과 같습니다. 가장 낮은 Validation loss를 기록했을 때의 텍스트 제너레이션 결과는 'i love' 입력 시, 'i love you so much' 입니다.