
## 1. 데이터 다운로드
---
Song Lyrics 다운로드

## 2. 데이터 읽어오기
---
glob 모듈을 사용해 모든 txt 파일을 읽어온 후, raw_corpus 리스트에 문장 단위로 저장

In [1]:
import glob
import os

txt_file_path = os.getenv('HOME') + '/project/aiffel-lms/E11_Writer/data/lyrics/*'
txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, 'r') as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print('데이터 크기:', len(raw_corpus))
print('Example:\n', raw_corpus[:20])

데이터 크기: 187088
Example:
 ['The Cat in the Hat', 'By Dr. Seuss', 'The sun did not shine.', 'It was too wet to play.', 'So we sat in the house', 'All that cold cold wet day.', 'I sat there with Sally.', 'We sat there we two.', 'And I said How I wish', 'We had something to do!', 'Too wet to go out', 'And too cold to play ball.', 'So we sat in the house.', 'We did nothing at all.', 'So all we could do was to', 'Sit!', 'Sit!', 'Sit!', 'Sit!', 'And we did not like it.']


## 3. 데이터 정제
---
preprocess_sentence()를 이용해 데이터를 정제한다   

문장을 토큰화 했을 때, 토큰의 개수가 15개를 넘어가면 잘라낸다.

In [2]:
import re
import numpy as np
import tensorflow as tf
import os

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)  
    sentence = re.sub(r'[" "]+'," ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) 
    
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    
    return sentence

print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


In [3]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0:continue
    if len(sentence.split()) >= 15:continue
        
    corpus.append(preprocess_sentence(sentence))

corpus[:10]

['<start> the cat in the hat <end>',
 '<start> by dr . seuss <end>',
 '<start> the sun did not shine . <end>',
 '<start> it was too wet to play . <end>',
 '<start> so we sat in the house <end>',
 '<start> all that cold cold wet day . <end>',
 '<start> i sat there with sally . <end>',
 '<start> we sat there we two . <end>',
 '<start> and i said how i wish <end>',
 '<start> we had something to do ! <end>']

## 4. 평가 데이터셋 분리
---
tokenize() 함수로 데이터를 Tensor로 변환한 후, sklearn 모듈의 train_test_split() 함수를 사용해 훈련 데이터와 평가 데이터를 분리한다.   
단어장의 크기는 12,000이상으로 한다.   
총 데이터의 20%를 평가 데이터 셋으로 사용한다.


In [4]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=15000,
        filters='',
        oov_token='<unk>'    
    )
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

print(tensor[:3, :10])

[[   2    6  860 ...    0    0    0]
 [   2  119 2608 ...    0    0    0]
 [   2    6  298 ...    0    0    0]
 ...
 [   2  665   27 ...    0    0    0]
 [   2  665   27 ...    0    0    0]
 [   2  665   27 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f74930ce250>
[[    2     6   860    14     6  1261     3     0     0     0]
 [    2   119  2608    19 12983     3     0     0     0     0]
 [    2     6   298   167    68   548    19     3     0     0]]


In [5]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    
    if idx >=10:break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : i
6 : the
7 : you
8 : and
9 : a
10 : to


In [6]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

print(src_input[0])
print(tgt_input[0])

[   2    6  860   14    6 1261    3    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[   6  860   14    6 1261    3    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [7]:
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size=0.2, random_state=32)

print('Source Train:', enc_train.shape)
print('Target Train:', dec_train.shape)


Source Train: (133060, 32)
Target Train: (133060, 32)


In [8]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1

train_dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((enc_val, dec_val)).shuffle(BUFFER_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

## 5. 인공지능 만들기
---
모델의 Embedding Size와 Hidden Size를 조절하며 10 Epoch 안에 val_loss 값을 2.2 수준으로 줄일 수 있는 모델을 설계한다.

In [9]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
    
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words+1, embedding_size, hidden_size)

In [10]:
for src_sample, tgt_sample in train_dataset.take(1):
    break
model(src_sample)

<tf.Tensor: shape=(256, 32, 15001), dtype=float32, numpy=
array([[[-7.64637953e-05,  6.24688982e-05,  1.09397413e-04, ...,
          9.83872087e-06, -1.41840384e-04, -1.86359379e-04],
        [-1.15518182e-04, -1.04031778e-05, -3.38971258e-05, ...,
         -1.52637862e-04, -3.00909829e-04, -4.65017583e-05],
        [-1.16822783e-04,  2.31268386e-05, -2.54478746e-05, ...,
         -4.23823280e-04, -2.63981870e-04, -6.03224507e-05],
        ...,
        [-1.31535274e-03,  3.80153069e-04, -1.23258773e-03, ...,
         -9.83388047e-04, -8.59262247e-04,  4.46289405e-03],
        [-1.32649764e-03,  3.66144493e-04, -1.23833166e-03, ...,
         -1.02334307e-03, -8.98969825e-04,  4.49541863e-03],
        [-1.33282831e-03,  3.52975185e-04, -1.24374777e-03, ...,
         -1.05662341e-03, -9.32204479e-04,  4.52210521e-03]],

       [[-7.64637953e-05,  6.24688982e-05,  1.09397413e-04, ...,
          9.83872087e-06, -1.41840384e-04, -1.86359379e-04],
        [ 3.57111894e-05, -1.85553363e-05,  4

In [11]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3840256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  15376025  
Total params: 32,855,961
Trainable params: 32,855,961
Non-trainable params: 0
_________________________________________________________________


In [12]:
optimizer = tf.keras.optimizers.Adam()
#Loss
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(x=enc_train, y=dec_train, validation_data=(enc_val, dec_val), batch_size=BATCH_SIZE, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f74912944d0>

In [13]:

def generate_text(model, tokenizer, init_sentence='<start> i love', max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index['<end>']
    
    while True:
        predict = model(test_tensor)
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:,-1]
        
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        
        if predict_word.numpy()[0] == end_token:break
        if test_tensor.shape[1] >= max_len:break
    
    generated = ""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "
    
    return generated

generate_text(model, tokenizer, init_sentence='<start> i love', max_len=20)

'<start> i love you , i m better , i m better , i m better , i m better '

In [14]:
generate_text(model, tokenizer, init_sentence='<start> i love you', max_len=20)

'<start> i love you , i m better , i m better , i m better , i m better '

In [15]:
generate_text(model, tokenizer, init_sentence='<start> i hate', max_len=20)

'<start> i hate the way you lie <end> '

In [16]:
generate_text(model, tokenizer, init_sentence='<start> if you', max_len=20)

'<start> if you re ready to make it <end> '

In [17]:
generate_text(model, tokenizer, init_sentence='<start> i wonder', max_len=20)

'<start> i wonder if you re in the mirror <end> '

In [18]:
generate_text(model, tokenizer, init_sentence='<start> if you want', max_len=20)

'<start> if you want it , baby <end> '

In [19]:
generate_text(model, tokenizer, init_sentence='<start> if you are sad', max_len=20)

'<start> if you are sad <end> '

### Report
---
토큰 문장의 길이를 15로 제한할 때, 직접 자르지 않고 tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post') 에서 maxlen을 15로 줬을 때 제대로 학습되지 않는 모습을 보였다.

그리고 I love you 라는 문장을 주자 I'm better 를 반복하는데 왜 그러는지....? 