In [27]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import re
from glob import glob


#### Read Data

In [28]:
# lyrics data 가져오기

lyrics_path = '/content/drive/MyDrive/Colab Notebooks/aiffel_lms/E4_lyrics/lyrics/*'
lyrics_list = glob(lyrics_path)

In [29]:
lyrics_path

'/content/drive/MyDrive/Colab Notebooks/aiffel_lms/E4_lyrics/lyrics/*'

In [30]:
# lyrics Data list화

lyrics_corpus = []

for file in lyrics_list:
  with open(file, 'r') as f:
    lyrics_sentence = f.read().splitlines()
    lyrics_corpus.extend(lyrics_sentence)


#### Preprocessing

In [31]:
# 문장 전처리 필요 부분 확인
lyrics_corpus[:100]

['Looking for some education',
 'Made my way into the night',
 'All that bullshit conversation',
 "Baby, can't you read the signs? I won't bore you with the details, baby",
 "I don't even wanna waste your time",
 "Let's just say that maybe",
 'You could help me ease my mind',
 "I ain't Mr. Right But if you're looking for fast love",
 "If that's love in your eyes",
 "It's more than enough",
 'Had some bad love',
 "So fast love is all that I've got on my mind Ooh, ooh",
 'Ooh, ooh Looking for some affirmation',
 'Made my way into the sun',
 'My friends got their ladies',
 "And they're all having babies",
 "I just wanna have some fun I won't bore you with the details, baby",
 "I don't even wanna waste your time",
 "Let's just say that maybe",
 'You could help me ease my mind',
 "I ain't Mr. Right But if you're looking for fast love",
 "If that's love in your eyes",
 "It's more than enough",
 "I've had some bad love",
 "So fast love is all that I've got on my mind Ooh, ooh",
 'Baby, baby',

* 특수문자 제거 필요 ,."'[ ]
* [ ]안에 문자 들어있는 부분도 삭제 필요 
* 축약어 'll, 'm, 't, 'd 등 정리필요

In [32]:
# string_process function version #1

# string 전처리 function 
# 가사 중 축약어 't, 'll, 'd, 'm 같은 단어는 '없이 ex) can't -> cant로 변경함 
# [clean version:], [explicit version:]과 같은 version 내용도 삭제함 

def string_process(lyrics_corpus):
  lyrics_clean = lyrics_corpus.lower()
  lyrics_clean = lyrics_clean.replace('[clean version:]', ' ')
  lyrics_clean = lyrics_clean.replace('[explicit version:]', ' ')
  lyrics_clean = lyrics_clean.replace('\'t' , 't')
  lyrics_clean = lyrics_clean.replace('\'ll' , 'll')
  lyrics_clean = lyrics_clean.replace('\'m' , 'm')
  lyrics_clean = lyrics_clean.replace('\'d' , 'd')
  lyrics_clean = re.sub(r'[^a-zA-Z0-9]', ' ', lyrics_clean)
  lyrics_clean = re.sub(r'[" "]+',' ', lyrics_clean)
  lyrics_clean = lyrics_clean.strip()
  lyrics_clean = '<start> ' + lyrics_clean + ' <end>'
  return lyrics_clean


In [33]:
# string 전처리 확인
print(string_process('HI, this*.[] is JISOO LEE :) i\'ll play with you'))
print(string_process(lyrics_corpus[98]))
print(type(string_process(lyrics_corpus[0]))) #str 타입

<start> hi this is jisoo lee ill play with you <end>
<start> go head and sell me out and ill lay your shit bare <end>
<class 'str'>


In [34]:
# string_process function version #2

# Dictionary로 삭제하고자 하는 문장 모아서 replace_all
dic = {'[Clean version:]' :  ' ' , '[Explicit version:]' : ' ','\'t' : 't', '\'ll' : 'll', '\'m' : 'm', '\'d' : 'd'}

def replace_all(lyrics_corpus, dic):
  for i, j in dic.items():
    lyrics_corpus = lyrics_corpus.replace(i, j)
  return lyrics_corpus

def string_process_2(lyrics_corpus):
  lyrics_clean = replace_all(lyrics_corpus, dic)
  lyrics_clean = lyrics_clean.lower()
  lyrics_clean = re.sub(r'[^a-zA-Z0-9]', ' ', lyrics_clean)
  lyrics_clean = re.sub(r'[" "]+',' ', lyrics_clean)
  lyrics_clean = lyrics_clean.strip()
  lyrics_clean = '<start> ' + lyrics_clean + ' <end>'
  return lyrics_clean


In [35]:
print(string_process_2('HI, this*3 is JISOO LEE :)'))
print(string_process_2(lyrics_corpus[98]))
print(type(string_process_2(lyrics_corpus[0]))) #str 타입

<start> hi this 3 is jisoo lee <end>
<start> go head and sell me out and ill lay your shit bare <end>
<class 'str'>


>> 오류 #1

In [36]:
# def del_text(lyrics_corpus):
#   dic = {'[Clean version:]' :  '' , '[Explicit version:]' : ''}
#   trans = lyrics_corpus.maketrans(dic)
#   lyrics_corpus = lyrics_corpus.translate(trans)
#   return lyrics_corpus

- 위의 것이 안되는 이유는 str.translate 의 key 값은 꼭 single character이어야 하기 때문에
- str.translate can be used solely to replace single characters
- The replacement strings can be of any length, but the keys must be a single character


Anyway, Preprocessing Functions are Made

In [37]:
lyrics_clean_corpus = []

for lyrics_corp in lyrics_corpus:
  if len(lyrics_corp) == 0:
    continue
  
  lyrics_clean_corp = string_process(lyrics_corp)
  lyrics_clean_corpus.append(lyrics_clean_corp)

print(lyrics_clean_corpus[:10])


['<start> looking for some education <end>', '<start> made my way into the night <end>', '<start> all that bullshit conversation <end>', '<start> baby cant you read the signs i wont bore you with the details baby <end>', '<start> i dont even wanna waste your time <end>', '<start> let s just say that maybe <end>', '<start> you could help me ease my mind <end>', '<start> i aint mr right but if you re looking for fast love <end>', '<start> if that s love in your eyes <end>', '<start> it s more than enough <end>']


#### Tokenizing

In [38]:
def tokenize(lyrics_clean_corpus):
  #14,000단어 기억하는 tokenizer 생성
  tokenizer = text.Tokenizer(num_words = 14000, oov_token = '<oov>')

  # corpus 이용해 tokenizer 내부 단어장 생성
  tokenizer.fit_on_texts(lyrics_clean_corpus)

  # corpus to tensor
  tensor = tokenizer.texts_to_sequences(lyrics_clean_corpus)

  # padding
  # 과도한 padding 막기위해 max_len에 15 지정
  tensor = sequence.pad_sequences(tensor, padding = 'post', maxlen=15)

  print(tensor, tokenizer)
  return tensor, tokenizer

# 결과 저장 
tensor, tokenizer = tokenize(lyrics_clean_corpus)

[[  3 302  25 ...   0   0   0]
 [  3 215  12 ...   0   0   0]
 [  3  21  15 ...   0   0   0]
 ...
 [  3  29  19 ...   0   0   0]
 [  3   3  19 ...   2   0   0]
 [  3  38 125 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7efcf4a87cd0>


In [80]:
print(tokenizer.word_index)

# 마지막 출력층에서 오류가 나서 확인해보니 start와 end에 <>가 없음을 확인



In [39]:
# Tensor를 뒷단어 제외(source) 첫 단어 제외(target)으로 재조정

source_input = tensor[:, :-1] 
target_input = tensor[:, 1:]

In [40]:
# Data shape 확인

print(tensor.shape)
print(source_input.shape)
print(target_input.shape)

(175986, 15)
(175986, 14)
(175986, 14)


#### Data Split & to.Dataset

In [41]:
x_train, x_test, y_train, y_test = train_test_split(source_input, target_input, 
                                                    test_size = 0.2, random_state = 10)

In [42]:
print('Source Train: ', x_train.shape)
print('Target Train: ', y_train.shape)
print('Source Test: ', x_test.shape)
print('Target Test: ', y_test.shape)

print(tensor.shape)
# 124960보다 넘긴하는데 문제 없을거라 생각됨
# 아마 'll 같은 축약어 문제가 아닐까싶음

Source Train:  (140788, 14)
Target Train:  (140788, 14)
Source Test:  (35198, 14)
Target Test:  (35198, 14)
(175986, 15)


In [43]:
# Tensor to Dataset

buffer_size = len(source_input)
batch_size = 256
steps_per_epoch = len(source_input) // batch_size

# tokenizer가 구축한 14000개에 padding 값 0까지 더해서 14001개
vocab_size = tokenizer.num_words +1
print(vocab_size)

# train_tensor to Dataset
dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)) 
dataset_train = dataset_train.shuffle(buffer_size)
dataset_train = dataset_train.batch(batch_size, drop_remainder = True)
print(dataset_train)

# test_tensor to Dataset
dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)) 
dataset_test = dataset_test.shuffle(buffer_size)
dataset_test = dataset_test.batch(batch_size, drop_remainder = True)
print(dataset_test)

#src_input과 tgt_input 크기가 (256,14)이 됨 (batch_size, Dataset 객체로 변환된 src_input 크기)
#batch size는 output의 일부와도 같겠지 

14001
<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>


#### Modeling

In [44]:

class TextGenerator(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, hidden_size):
    super().__init__()

    # 입력 tensor에는 단어사전의 인덱스가 있고. Embedding layer -> 이 인덱스 값은 해당 인덱스번째의 워드 벡터로 바꿔줘
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size) 
    self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences = True)
    self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences = True)
    self.linear = tf.keras.layers.Dense(vocab_size)

  def call(self, x):
    out = self.embedding(x)
    out = self.rnn_1(out)
    out = self.rnn_2(out)
    out = self.linear(out)

    return out


embedding_size = 512
hidden_size = 1024
model = TextGenerator(tokenizer.num_words +1, embedding_size, hidden_size)



#### Model Compile

In [45]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

model.compile(loss = loss, optimizer = optimizer)
model.fit(dataset_train, epochs=30) # dataset에 source 및 output 다 있으니까 한번에 처리 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7efcf5cbb690>

In [81]:
# 텍스트 생성 함수

def generate_text(model, tokenizer, init_sentence='start', max_len=15):
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index['end']

    # 단어를 하나씩 예측해서 문장을 만듦
    # 1. 입력받은 문장의 tensor 확인
    # 2. 예측된 값 중 가장 높은 확률인 word index를 뽑아 
    # 3. 2에서 예측된 word index를 문장 뒤에 붙여
    # 4. 모델이 <end>를 예측했거나 max_len에 도달하면 문장 생성을 끝냄 
    while True:
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: 
          break
        if test_tensor.shape[1] >= max_len: 
          break

    generated = ""
    #tokenizer 이용해 word index를 단어로 하나씩 변환 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated




#### Generate Sentence 

In [83]:
generate_text(model, tokenizer, init_sentence='start i love', max_len=15)

'start i love you end '

#### TODO 
    * 마지막 출력 내용의 start와 end를 삭제해야 하는데, `model.save` 및 `tf.keras.callbacks.ModelCheckpoint`을 사용하지 않아서 시간적 비효율성으로 인해 진행하지 않음
    * 전처리 과정에서 [ ]안의 str도 삭제하기 위한 `lyrics_clean = re.sub(r'[\[^)*\]]', ' ', lyrics_clean)` 코드가 왜 작동하지 않는지 아직 확인하지 못했음
    * 모델링 과정에서 결과로 나오는 데이터의 출력값 및 재사용되는 입력값이 tf.int64와 tf.int32로 나뉘는데 해당부분 더 확인하여 이해도 up