# 데이터 불러오기

In [None]:
import glob
import os
import re
from tensorflow import keras
import tensorflow as tf

In [2]:
# Collect every file that lives under $HOME/data/lyrics.
# glob.glob() returns, as a list, the paths matching the given pattern.
# Reference - https://velog.io/@dkwjd131/Python-glob.glob-%ED%95%A8%EC%88%98-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0
home_dir = os.environ.get('HOME')
txt_file_path = home_dir + '/data/lyrics/*'
txt_list = glob.glob(txt_file_path)

# Report how many files matched and show the first match as a sanity check.
print("파일 개수 : ", len(txt_list))
print("파일 경로 : ", txt_list[0])

파일 개수 :  49
파일 경로 :  /aiffel/data/lyrics/janisjoplin.txt


In [3]:
# Flat list holding every line of every lyric file, in file order.
raw_corpus = []

for txt_file in txt_list:
    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the lyric files are UTF-8 - confirm.
    with open(txt_file, 'r', encoding='utf-8') as f:
        # read() returns the whole file as one string; splitlines() then
        # splits that string into a list with one entry per line.
        raw = f.read().splitlines()

    # extend() adds each line as its own element. append(raw) would instead
    # add the whole per-file list as ONE element, so raw_corpus[:3] would
    # show the complete contents of the first three files rather than the
    # first three lines.
    raw_corpus.extend(raw)
print("데이터 크기: ", len(raw_corpus))
print("샘플:\n", raw_corpus[:3])

데이터 크기:  187088
샘플:
 ["Busted flat in Baton Rouge, waitin' for a train", "And I's feelin' near as faded as my jeans", 'Bobby thumbed a diesel down, just before it rained']


# 데이터 가공하기

In [4]:
def preprocess_sentence(sentence):
    """Normalise one raw lyric line into lowercase, space-separated tokens.

    The line is lowercased and trimmed, then three regex substitutions are
    applied in order:

    1. Surround each of ``? . ! , ¿`` with spaces so it becomes a token of
       its own (the inverted question mark opens interrogative sentences in
       Spanish).
    2. Collapse every run of spaces / double quotes into a single space;
       the trailing ``+`` in the pattern means "one or more".
    3. Replace every run of characters other than ASCII letters and
       ``? . ! , ¿`` with a single space.

    The result is trimmed once more before it is returned.
    """
    text = sentence.lower().strip()

    substitutions = (
        (r'([?.!,¿])', r' \1 '),
        (r'[" "]+', " "),
        (r'[^a-zA-Z?.!,¿]+', " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [5]:
# Run every raw line through preprocess_sentence, keeping the original order.
corpus = [preprocess_sentence(line) for line in raw_corpus]

# Show the first three cleaned lines.
corpus[:3]

['busted flat in baton rouge , waitin for a train',
 'and i s feelin near as faded as my jeans',
 'bobby thumbed a diesel down , just before it rained']

In [6]:
# Count the distinct preprocessed lines.
# NOTE: set(corpus) de-duplicates whole sentences, so the number printed here
# is the count of unique sentences, not the count of unique words.
# Reference - https://webisfree.com/2017-11-20/python-list%EA%B0%92-unique%ED%95%9C-%EA%B3%A0%EC%9C%A0%EA%B0%92%EB%A7%8C-%EA%B0%80%EC%A7%80%EB%8A%94-%EB%B0%A9%EB%B2%95
unique_corpus = set(corpus)
print(len(unique_corpus))

116823


In [7]:
def tokenize(corpus, num_words=116823):
    """Fit a Keras Tokenizer on ``corpus`` and return zero-padded id sequences.

    Args:
        corpus: list of preprocessed sentences whose tokens are separated by
            single spaces.
        num_words: forwarded to ``Tokenizer(num_words=...)``, which caps the
            vocabulary by word frequency. The default keeps the value this
            cell originally hard-coded.
            NOTE(review): 116823 is the number of distinct *sentences*
            printed by the previous cell, not a word count; with it the
            model's output layer is 116824 wide and the forward pass further
            down runs out of GPU memory.

    Returns:
        tuple: ``(tensor, tokenizer)`` - the integer array produced by
        ``pad_sequences`` (padded at the end of each row) and the fitted
        tokenizer.
    """
    # filters=' ' replaces Keras' default punctuation filter, so the
    # punctuation tokens separated out during preprocessing are kept.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, filters=' ')
    tokenizer.fit_on_texts(corpus)

    # Map every sentence to its list of word ids, then pad all rows at the
    # end so they share the length of the longest sentence.
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[3606 1689   11 ...    0    0    0]
 [   5    2   13 ...    0    0    0]
 [ 801 7661    6 ...    0    0    0]
 ...
 [   2   19  696 ...    0    0    0]
 [   2   19  696 ...    0    0    0]
 [   2   19  696 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f3a55aa80d0>


In [8]:
# Show the first 10 token ids of the first 3 encoded sentences.
print(tensor[:3, :10])

[[3606 1689   11 7055 3992    1 1097   25    6  678]
 [   5    2   13  510  845   78 2586   78   10  945]
 [ 801 7661    6 6045   57    1   33  182    8 4572]]


In [9]:
# Print the vocabulary entries with ids 1 through 10.
for idx, word in tokenizer.index_word.items():
    print(idx, ":", word)

    if idx >= 10:
        break

1 : ,
2 : i
3 : the
4 : you
5 : and
6 : a
7 : to
8 : it
9 : me
10 : my


In [11]:
# Build the (input, target) pair for next-token prediction: the input drops
# the last column and the target drops the first, so tgt_input[i][t] is the
# token that follows src_input[i][t].
src_input, tgt_input = tensor[:, :-1], tensor[:, 1:]

for first_row in (src_input[0], tgt_input[0]):
    print(first_row)

[3606 1689   11 7055 3992    1 1097   25    6  678    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [15]:
# Batch size, and a shuffle buffer as large as the whole data set.
BATCH_SIZE = 256
BUFFER_SIZE = len(src_input)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# Vocabulary dimension handed to the model: the tokenizer's num_words plus one.
VOCAB_SIZE = tokenizer.num_words + 1

# Pair inputs with targets, shuffle, then cut into full batches only
# (drop_remainder=True discards the final partial batch).
dataset = (
    tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((256, 344), (256, 344)), types: (tf.int32, tf.int32)>

In [16]:
class TextGenerator(tf.keras.Model):
    """Embedding -> LSTM -> LSTM -> Dense model.

    The Dense layer has ``vocab_size`` units and no activation, so the model
    emits one vocabulary-sized vector of raw scores per time step.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        layers = tf.keras.layers
        self.embedding = layers.Embedding(vocab_size, embedding_size)
        # return_sequences=True: each LSTM outputs a vector at every time
        # step instead of only at the last one.
        self.rnn_1 = layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = layers.LSTM(hidden_size, return_sequences=True)
        self.linear = layers.Dense(vocab_size)

    def call(self, x):
        """Run ``x`` through the embedding, both LSTMs and the Dense layer."""
        hidden = self.embedding(x)
        for layer in (self.rnn_1, self.rnn_2, self.linear):
            hidden = layer(hidden)
        return hidden


# Model hyper-parameters.
hidden_size = 1024
embedding_size = 256
model = TextGenerator(tokenizer.num_words+1, embedding_size, hidden_size)

In [17]:
# Pull a single batch out of the dataset and push it through the model once.
src_sample, tgt_sample = next(iter(dataset.take(1)))

model(src_sample)

# I have not read all of it, but skimming
# https://iambeginnerdeveloper.tistory.com/69
# suggests reducing the size of the input data,
# so that is what I am going to do.

ResourceExhaustedError: OOM when allocating tensor with shape[88064,116824] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:MatMul]

In [18]:
def tokenize(corpus, num_words=7000):
    """Fit a Keras Tokenizer on ``corpus`` and return zero-padded id sequences.

    Args:
        corpus: list of preprocessed sentences whose tokens are separated by
            single spaces.
        num_words: forwarded to ``Tokenizer(num_words=...)``, which caps the
            vocabulary by word frequency. Defaults to 7000, the value the
            author chose after the larger setting ran out of GPU memory
            (the original note attributes it to "LSM" - presumably the
            course LMS; confirm).

    Returns:
        tuple: ``(tensor, tokenizer)`` - the integer array produced by
        ``pad_sequences`` (padded at the end of each row) and the fitted
        tokenizer.
    """
    # filters=' ' replaces Keras' default punctuation filter, so the
    # punctuation tokens separated out during preprocessing are kept.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, filters=' ')
    tokenizer.fit_on_texts(corpus)

    # Map every sentence to its list of word ids, then pad all rows at the
    # end so they share the length of the longest sentence.
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[3606 1689   11 ...    0    0    0]
 [   5    2   13 ...    0    0    0]
 [ 801    6 6045 ...    0    0    0]
 ...
 [   2   19  696 ...    0    0    0]
 [   2   19  696 ...    0    0    0]
 [   2   19  696 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f3a4f5cc5e0>


In [19]:
# Re-run the earlier steps against the smaller vocabulary.

# Print the vocabulary entries with ids 1 through 10.
for idx, word in tokenizer.index_word.items():
    print(idx, ":", word)

    if idx >= 10:
        break

# Input drops the last column, target drops the first, so each target token
# is the one that follows the matching input token.
src_input, tgt_input = tensor[:, :-1], tensor[:, 1:]

# Batch size, and a shuffle buffer as large as the whole data set.
BATCH_SIZE = 256
BUFFER_SIZE = len(src_input)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# Vocabulary dimension handed to the model: the tokenizer's num_words plus one.
VOCAB_SIZE = tokenizer.num_words + 1

# Pair inputs with targets, shuffle, then cut into full batches only
# (drop_remainder=True discards the final partial batch).
dataset = (
    tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

class TextGenerator(tf.keras.Model):
    """Embedding -> LSTM -> LSTM -> Dense model.

    The Dense layer has ``vocab_size`` units and no activation, so the model
    emits one vocabulary-sized vector of raw scores per time step.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        layers = tf.keras.layers
        self.embedding = layers.Embedding(vocab_size, embedding_size)
        # return_sequences=True: each LSTM outputs a vector at every time
        # step instead of only at the last one.
        self.rnn_1 = layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = layers.LSTM(hidden_size, return_sequences=True)
        self.linear = layers.Dense(vocab_size)

    def call(self, x):
        """Run ``x`` through the embedding, both LSTMs and the Dense layer."""
        hidden = self.embedding(x)
        for layer in (self.rnn_1, self.rnn_2, self.linear):
            hidden = layer(hidden)
        return hidden


# Model hyper-parameters.
hidden_size = 1024
embedding_size = 256
model = TextGenerator(tokenizer.num_words+1, embedding_size, hidden_size)

1 : ,
2 : i
3 : the
4 : you
5 : and
6 : a
7 : to
8 : it
9 : me
10 : my


In [20]:
# Pull a single batch out of the dataset and push it through the model once.
src_sample, tgt_sample = next(iter(dataset.take(1)))

model(src_sample)

<tf.Tensor: shape=(256, 341, 7001), dtype=float32, numpy=
array([[[ 1.22533529e-04,  2.53056380e-04,  3.58988153e-04, ...,
         -1.65925856e-04, -1.43689913e-05,  5.41610780e-06],
        [ 1.98013913e-05,  3.75200296e-04,  5.86441893e-04, ...,
          5.58174543e-06,  1.93049578e-04,  1.70681684e-04],
        [-2.57018226e-04,  5.28614735e-04,  8.82467022e-04, ...,
          2.98110797e-04,  1.40572040e-04,  1.87408714e-05],
        ...,
        [ 2.40815469e-04, -1.06788275e-05, -3.40687315e-04, ...,
          2.13755717e-04,  4.18205280e-03,  4.53956984e-03],
        [ 2.40815818e-04, -1.06780999e-05, -3.40687373e-04, ...,
          2.13755600e-04,  4.18205233e-03,  4.53957031e-03],
        [ 2.40814188e-04, -1.06781436e-05, -3.40686966e-04, ...,
          2.13755950e-04,  4.18205187e-03,  4.53957031e-03]],

       [[-4.97818110e-05,  1.89638755e-04, -8.17836408e-05, ...,
          1.72071872e-04,  4.23543737e-04,  2.21111724e-04],
        [-1.07636828e-04,  4.37565061e-04, -1

In [21]:
# Inspect the model architecture (layers and parameter counts).
model.summary()

Model: "text_generator_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  1792256   
_________________________________________________________________
lstm_2 (LSTM)                multiple                  5246976   
_________________________________________________________________
lstm_3 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense_1 (Dense)              multiple                  7176025   
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The model's Dense layer has no activation, so the loss is told to expect
# raw logits (from_logits=True).
# NOTE(review): neither the Embedding layer nor this loss masks the zero
# padding, so padded positions appear to count towards the reported loss -
# confirm whether that is intended.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)
optimizer = tf.keras.optimizers.Adam()

model.compile(optimizer=optimizer, loss=loss)
model.fit(dataset, epochs=30)

Epoch 1/30
 70/730 [=>............................] - ETA: 37:21 - loss: 0.7357