# 멋진 작사가 만들기

## 데이터 준비하기

### GoogleDrive 마운트


In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# lyrics files stored under MyDrive can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 가사 데이터 불러오기

In [2]:
import glob
import os
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Glob pattern matching every entry in the lyrics folder on the mounted drive.
data_path = "/content/drive/MyDrive/aiffel/ex04/lyrics/*"

txt_file_list = glob.glob(data_path)

# Flat list of every line from every matched file, in the order glob returned
# the files.
raw_corpus = []

for file in txt_file_list:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the lyrics files are UTF-8 text - confirm.
    with open(file, "r", encoding="utf-8") as f:
        raw_corpus.extend(f.read().splitlines())

raw_corpus[:20]

['The first words that come out',
 'And I can see this song will be about you',
 "I can't believe that I can breathe without you",
 'But all I need to do is carry on',
 'The next line I write down',
 "And there's a tear that falls between the pages",
 "I know that pain's supposed to heal in stages",
 "But it depends which one I'm standing on I write lines down, then rip them up",
 "Describing love can't be this tough I could set this song on fire, send it up in smoke",
 'I could throw it in the river and watch it sink in slowly',
 'Tie the pages to a plane and send it to the moon',
 "Play it for the world, but it won't mean much",
 "Unless I sing this song to you I'm dying to show you",
 'This could end happily ever after',
 "There doesn't ever have to be disaster",
 'And all you have to do is sing along I write lines down, then rip them up',
 'Impossible describing love I could set this song on fire, send it up in smoke',
 'I could throw it in the river and watch it sink in slowly',
 

### 텍스트 전처리

In [3]:
def preprocess_sentence(sentence):
    """Normalize one raw lyric line and wrap it in sentence markers.

    The substitutions run in this order:
      1. Surround each of ``( ) ? . ! , ¿`` with spaces so it is split
         off from neighbouring words.
      2. Collapse every run of spaces and double-quote characters into a
         single space.
      3. Replace every run of characters other than ASCII letters,
         ``? . ! , ¿`` and the apostrophe with a single space (this also
         removes the parentheses padded in step 1, digits, etc.).

    The result is then stripped of surrounding whitespace and returned as
    ``"<start> " + text + " <end>"``. Letter case is left unchanged.
    """
    substitutions = (
        (r"([()?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿']+", " "),
    )

    cleaned = sentence
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return "<start> " + cleaned.strip() + " <end>"

In [4]:

# Preprocess every raw line; empty lines are skipped and never reach the
# corpus.
corpus = [preprocess_sentence(raw_line) for raw_line in raw_corpus if raw_line]

corpus[:20]

['<start> The first words that come out <end>',
 '<start> And I can see this song will be about you <end>',
 "<start> I can't believe that I can breathe without you <end>",
 '<start> But all I need to do is carry on <end>',
 '<start> The next line I write down <end>',
 "<start> And there's a tear that falls between the pages <end>",
 "<start> I know that pain's supposed to heal in stages <end>",
 "<start> But it depends which one I'm standing on I write lines down , then rip them up <end>",
 "<start> Describing love can't be this tough I could set this song on fire , send it up in smoke <end>",
 '<start> I could throw it in the river and watch it sink in slowly <end>',
 '<start> Tie the pages to a plane and send it to the moon <end>',
 "<start> Play it for the world , but it won't mean much <end>",
 "<start> Unless I sing this song to you I'm dying to show you <end>",
 '<start> This could end happily ever after <end>',
 "<start> There doesn't ever have to be disaster <end>",
 '<start> 

In [5]:
# Mean number of space-separated tokens per preprocessed sentence; the
# <start> and <end> markers are counted as tokens.
sum(len(sentence.split(" ")) for sentence in corpus) / len(corpus)


10.211579330173992

한 문장은 평균 약 10개의 토큰으로 이루어짐 (`<start>`, `<end>` 두 토큰과 분리된 문장부호를 포함한 수치)

In [6]:
# Keep only sentences with fewer than 17 whitespace-separated tokens, i.e. at
# most 16 tokens counting the <start> and <end> markers.
corpus = [sentence for sentence in corpus if len(sentence.split()) < 17]

### 토큰화

In [7]:
def tokenize(corpus):
    """Fit a word-level tokenizer on ``corpus`` and encode it as padded ids.

    The tokenizer is capped at ``num_words=10000``, splits on spaces only
    (``filters=' '``, so punctuation tokens and the ``<start>``/``<end>``
    markers survive), and maps out-of-vocabulary words to ``<unk>``.
    Sequences are padded at the end (``padding='post'``) up to the length of
    the longest sequence.

    The padded array and the tokenizer are printed before being returned.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple.
    """
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=10000,
        filters=' ',
        oov_token="<unk>",
    )
    text_tokenizer.fit_on_texts(corpus)

    sequences = text_tokenizer.texts_to_sequences(corpus)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    print(padded, text_tokenizer)
    return padded, text_tokenizer


tensor, tokenizer = tokenize(corpus)

[[  2   5 252 ...   0   0   0]
 [  2   8   6 ...   0   0   0]
 [  2   6  86 ...   0   0   0]
 ...
 [  2 240   1 ...   0   0   0]
 [  2  10 511 ...   0   0   0]
 [  2 122  18 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7fbc3edd7b10>


In [8]:
# Model input: every sequence with its last position dropped.
src_input = tensor[:, :-1]  
# Training target: the same sequences with the first position dropped, so
# position t of the target is the token that follows position t of the input.
tgt_input = tensor[:, 1:]    
# Show the first input/target pair to check the one-token shift.
print(src_input[0])
print(tgt_input[0])

[  2   5 252 445  15  63  51   3   0   0   0   0   0   0   0]
[  5 252 445  15  63  51   3   0   0   0   0   0   0   0   0]


## 학습시키기

### train / val 데이터 분리

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (input, target) pairs for validation. Rows are shuffled
# before splitting, and random_state is fixed so the split is repeatable for
# the same input arrays.
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, 
                                                          tgt_input, 
                                                          test_size=0.2, 
                                                          shuffle=True, 
                                                          random_state=12)

# Report the size of the training portion.
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (130202, 15)
Target Train: (130202, 15)


### 데이터셋 생성

In [10]:
# Shuffle buffer sized to the whole training set.
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 64
# Number of full batches in one pass over the training set; not referenced
# again in the visible cells.
steps_per_epoch = len(enc_train) // BATCH_SIZE

# One more than the tokenizer's num_words cap.
# NOTE(review): presumably the extra slot accounts for padding id 0 - confirm.
VOCAB_SIZE = tokenizer.num_words + 1   

# Pair each input row with its target row, shuffle, then batch.
dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final short batch so every batch has exactly
# BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 15), (64, 15)), types: (tf.int32, tf.int32)>

### 모델 생성

In [11]:
class TextGenerator(tf.keras.Model):
    """Next-token model: embedding -> LSTM -> dropout -> LSTM -> dropout -> dense.

    Both LSTM layers use ``return_sequences=True``, so the model emits one
    output vector per input position. The final ``Dense`` layer has no
    activation, so the outputs are unnormalized scores (logits) of size
    ``vocab_size``.

    Args:
        vocab_size: Number of token ids; used as the embedding input size and
            as the width of the output layer.
        embedding_size: Dimension of the token embedding vectors.
        hidden_size: Number of units in each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.dropout_1 = tf.keras.layers.Dropout(0.2)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.dropout_2 = tf.keras.layers.Dropout(0.2)
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Map a batch of token-id sequences to per-position vocabulary logits."""
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.dropout_1(out)
        out = self.rnn_2(out)
        out = self.dropout_2(out)
        out = self.linear(out)

        return out


embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)

### 모델 학습

In [12]:
# Adam with its default hyperparameters.
optimizer = tf.keras.optimizers.Adam()
# Targets are integer token ids, hence the sparse loss. from_logits=True
# because the model's final Dense layer applies no activation.
# NOTE(review): reduction='none' yields unreduced loss values; confirm this is
# intended rather than the default reduction.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
# Train for 10 epochs on the batched dataset, validating on the held-out
# arrays after each epoch.
model.fit(dataset, epochs=10, validation_data=(enc_val, dec_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbc3dd8d6d0>

### 모델 검증

In [13]:
# Loss on the held-out validation split. The model was compiled without extra
# metrics, so the loss is the only value evaluated.
result = model.evaluate(x=enc_val, y=dec_val)

result



2.182461738586426

## 가사 만들어보기

In [14]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    """Greedily extend ``init_sentence`` one token at a time.

    Args:
        model: Callable that maps an int64 tensor of token ids with shape
            ``(1, sequence_length)`` to per-position scores over the
            vocabulary.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` and ``index_word``.
        init_sentence: Seed text that generation starts from.
        max_len: Generation stops once the sequence holds at least this many
            tokens. At least one token is always generated, even if the seed
            is already that long.

    Returns:
        The seed and generated tokens as one string, each token followed by a
        single space (so the result ends with a trailing space).

    Raises:
        KeyError: If ``"<end>"`` is missing from ``tokenizer.word_index`` or a
            token id in the sequence has no entry in ``tokenizer.index_word``.
    """
    # Encode the seed sentence as a batch containing a single sequence.
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Generate one token per iteration:
    #    1. Feed the sequence built so far to the model.
    #    2. Take the highest-scoring token id at the last position.
    #    3. Append that id to the sequence.
    #    4. Stop when <end> was predicted or max_len has been reached.
    while True:
        # 1
        predict = model(test_tensor)
        # 2 - argmax is taken on the raw scores; softmax is monotonic, so
        # applying it first would not change which index is the largest.
        predict_word = tf.argmax(predict, axis=-1)[:, -1]
        # 3
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token:
            break
        if test_tensor.shape[1] >= max_len:
            break

    # Map each token id back to its word; every word is followed by a space.
    return "".join(
        tokenizer.index_word[word_index] + " "
        for word_index in test_tensor[0].numpy()
    )

In [30]:
# Generate one lyric line seeded with the word "you".
generate_text(model, tokenizer, init_sentence="<start> you")

'<start> you know you gotta give it your all <end> '