# Step 1. 데이터 다운로드

In [1]:
! wget https://aiffelstaticprd.blob.core.windows.net/media/documents/song_lyrics.zip

--2021-01-26 11:44:53--  https://aiffelstaticprd.blob.core.windows.net/media/documents/song_lyrics.zip
Resolving aiffelstaticprd.blob.core.windows.net (aiffelstaticprd.blob.core.windows.net)... 52.239.148.4
Connecting to aiffelstaticprd.blob.core.windows.net (aiffelstaticprd.blob.core.windows.net)|52.239.148.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2101791 (2.0M) [application/zip]
Saving to: ‘song_lyrics.zip’


2021-01-26 11:44:54 (11.8 MB/s) - ‘song_lyrics.zip’ saved [2101791/2101791]



In [2]:
! unzip song_lyrics.zip -d ~/aiffel/lyricist/data/lyrics

Archive:  song_lyrics.zip
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/Kanye_West.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/Lil_Wayne.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/adele.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/al-green.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/alicia-keys.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/amy-winehouse.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/beatles.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/bieber.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/bjork.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/blink-182.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/bob-dylan.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/bob-marley.txt  
  inflating: /home/aiffel-dj43/aiffel/lyricist/data/lyrics/britney-spears.txt  


# Step 2. 데이터 읽어오기

In [9]:
import glob
import os

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

# 여러개의 txt 파일을 모두 읽어서 raw_corpus 에 담습니다.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:10])

데이터 크기: 187088
Examples:
 ['They say get ready for the revolution', "I think it's time we find some sorta solution", "Somebody's caught up in the endless pollution", 'They need to wake up, stop living illusions I know you need to hear this', "Why won't somebody feel this", 'This is my wish that we all feel connected', 'This is my wish that nobodies neglected Be like a rocket baby', 'Be like a rocket Take off', 'Just fly, away (ay, ay)', 'To find your space Take off']


6-4에서 데이터를 불러올때는 **re, numpy, tensorflow**를 함께 가져왔는데 왜 여기서는 **glob**만 사용하는지 궁금하다.

### 연극 대사를 토큰화하면 화자와 공백이 포함되기 때문에 그 부분을 제거해야했는데 lyrics 데이터의 경우 문장 단위로 토큰화 되고 있기 때문에 다른 방식의 데이터 처리가 필요할 것으로 보인다.

# Step 3. 데이터 정제

1. 토큰화 했을 때 토큰의 개수가 15개를 넘어가는 문장을 학습데이터에서 제외하는 것을 권장  

2. preprocess_sentence()  함수 사용

### preprocess_sentence() 함수

In [10]:
import re                   
import numpy as np         
import tensorflow as tf    

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()       
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)        
    sentence = re.sub(r'[" "]+', " ", sentence)                  
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)  
    sentence = sentence.strip()

    sentence = '<start> ' + sentence + ' <end>'      
    return sentence

print(preprocess_sentence("This @_is ;;;sample        sentence."))   

<start> this is sample sentence . <end>


#### re를 import하지 않아서 처음에 오류가 났다.

In [11]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    
    hong = preprocess_sentence(sentence)
    if len(hong.split(' ')) >= 15: continue
    corpus.append(preprocess_sentence(sentence))
        
corpus[:10]

['<start> they say get ready for the revolution <end>',
 '<start> i think it s time we find some sorta solution <end>',
 '<start> somebody s caught up in the endless pollution <end>',
 '<start> why won t somebody feel this <end>',
 '<start> this is my wish that we all feel connected <end>',
 '<start> this is my wish that nobodies neglected be like a rocket baby <end>',
 '<start> be like a rocket take off <end>',
 '<start> just fly , away ay , ay <end>',
 '<start> to find your space take off <end>',
 '<start> just fly , away ay , ay <end>']

#### hong = preprocess_sentence(sentence)
####    if len(hong.split(' ')) >= 15: continue  

수희님의 도움으로 토큰이 15보다 큰 문장을 제외시킬수 있었다.

In [12]:
def tokenize(corpus):
   
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,   
        filters=' ',    
        oov_token="<unk>"  
    )
    tokenizer.fit_on_texts(corpus)   
    
    tensor = tokenizer.texts_to_sequences(corpus)   

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')  

    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[  2  38  71 ...   0   0   0]
 [  2   4 131 ...   3   0   0]
 [  2 243  17 ...   0   0   0]
 ...
 [  2 146  50 ...   0   0   0]
 [  2 146   5 ...   0   0   0]
 [  2 146   5 ...   3   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7fcf3226d050>


In [6]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 15: break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to
11 : it
12 : me
13 : my
14 : in
15 : that


In [13]:
src_input = tensor[:, :-1]  
tgt_input = tensor[:, 1:]    
print(src_input[0])
print(tgt_input[0])

[   2   38   71   43  302   28    6 3267    3    0    0    0    0]
[  38   71   43  302   28    6 3267    3    0    0    0    0    0]


In [14]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 13), (256, 13)), types: (tf.int32, tf.int32)>

# Step 4. 평가 데이터셋 분리

In [15]:
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, 
                                                    tgt_input, 
                                                    test_size=0.2, 
                                                    random_state=7)

print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (119992, 13)
Target Train: (119992, 13)


# Step 5. 인공지능 만들기

In [16]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [17]:
for src_sample, tgt_sample in dataset.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 13, 12001), dtype=float32, numpy=
array([[[-2.09648235e-04, -1.07291664e-04, -2.19581143e-05, ...,
         -2.45101110e-04, -1.00813922e-04, -8.32458391e-05],
        [-4.70935425e-04, -2.70313059e-04,  9.80467462e-07, ...,
         -3.25021159e-04, -1.62004246e-04, -2.10396465e-04],
        [-5.36043313e-04, -3.76809738e-04,  1.01514030e-04, ...,
         -2.56084226e-04, -6.73801769e-05, -3.17077793e-04],
        ...,
        [-8.51757650e-04, -1.11408951e-03,  1.07909371e-04, ...,
         -1.09855307e-03, -5.11018909e-04, -8.11893377e-04],
        [-8.72444187e-04, -9.78386262e-04, -6.07385955e-05, ...,
         -1.27468607e-03, -5.61323250e-04, -1.11464551e-03],
        [-7.80551462e-04, -7.56925670e-04,  4.66928941e-05, ...,
         -1.13799341e-03, -6.64370542e-04, -1.49247202e-03]],

       [[-2.09648235e-04, -1.07291664e-04, -2.19581143e-05, ...,
         -2.45101110e-04, -1.00813922e-04, -8.32458391e-05],
        [-6.85223931e-05, -4.01761645e-05,  2

In [18]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3072256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [53]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbed0094210>

## 최종적으로 loss값이 2.23이 나왔다.

Embedding Size, Hidden size 조절  

1. Embedding size 값이 커질수록 단어의 추상적인 특징들을 더 잡아낼 수 있지만, 그만큼 충분한 데이터가 주어지지 않으면 오히려 혼란만을 야기할 수 있습니다

2. Hidden_size 도 같은 맥락입니다. hidden_size는 모델에 얼마나 많은 일꾼을 둘 것인가? 로 이해해도 크게 엇나가지 않습니다. 그 일꾼들은 모두 같은 데이터를 보고 각자의 생각을 가지는데, 역시 충분한 데이터가 주어지면 올바른 결정을 내리겠지만 그렇지 않으면 배가 산으로 갈 뿐 입니다

### 어느정도부터 데이터가 많다고 판단할 수 있는지 기준을 알지 못하지만 이번 경우는 데이터가 꽤 있는걸로 생각하고 두 size 값을 키워보겠다.(학습 시간이 오래걸려 한 번만 해보겠습니다.)

In [19]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 512
hidden_size = 2048
model_1 = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [20]:
for src_sample, tgt_sample in dataset.take(1): break
model_1(src_sample)

<tf.Tensor: shape=(256, 13, 12001), dtype=float32, numpy=
array([[[ 2.99034091e-05,  3.85609659e-04, -2.31016107e-04, ...,
         -1.18975193e-04, -2.53126316e-04, -9.39094534e-05],
        [ 3.43869106e-05,  4.80053975e-04, -6.08364295e-04, ...,
         -2.68758845e-06, -3.42877349e-04,  1.07972146e-05],
        [ 3.62009014e-04,  4.66183876e-04, -6.94188289e-04, ...,
         -2.59679928e-05, -5.99983556e-04,  2.29985744e-04],
        ...,
        [ 1.46080274e-03,  4.59053525e-04, -1.22956373e-03, ...,
          4.38354909e-04, -1.57323061e-03, -7.28417886e-04],
        [ 1.47111702e-03,  5.36225503e-04, -1.16772542e-03, ...,
          1.03938673e-03, -1.89471396e-03, -1.47234323e-03],
        [ 1.42871123e-03,  5.58386382e-04, -1.08520349e-03, ...,
          1.61250029e-03, -2.21406342e-03, -2.20094249e-03]],

       [[ 2.99034091e-05,  3.85609659e-04, -2.31016107e-04, ...,
         -1.18975193e-04, -2.53126316e-04, -9.39094534e-05],
        [-3.28501657e-04,  7.35582551e-04,  9

In [21]:
model_1.summary()

Model: "text_generator_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  6144512   
_________________________________________________________________
lstm_2 (LSTM)                multiple                  20979712  
_________________________________________________________________
lstm_3 (LSTM)                multiple                  33562624  
_________________________________________________________________
dense_1 (Dense)              multiple                  24590049  
Total params: 85,276,897
Trainable params: 85,276,897
Non-trainable params: 0
_________________________________________________________________


In [22]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model_1.compile(loss=loss, optimizer=optimizer)
model_1.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcf31101b10>

### 학습하는 동안 푹쉬다왔습니다... Size 값을 2배 올리니 loss값이 확연히 차이가 납니다.

# 가사생성

In [35]:
def generate_text(model_1, tokenizer, init_sentence="<start> i love", max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

     
    while True:
        predict = model_1(test_tensor)  
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]    

        
        test_tensor = tf.concat([test_tensor, 
																 tf.expand_dims(predict_word, axis=0)], axis=-1)

        
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated   

In [37]:
generate_text(model_1, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you , i m not gonna crack <end> '