from In [73]

In [1]:
import os, re
import numpy as np
import tensorflow as tf

file_path = os.path.join(os.getcwd(),'lyricist/data/shakespeare.txt')
with open(file_path, "r") as f:
    raw_corpus = f.read().splitlines()

print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [2]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    if idx > 9: break
    print(sentence)

Before we proceed any further, hear me speak.
Speak, speak.
You are all resolved rather to die than to famish?


In [28]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])",r"\1",sentence)
    sentence = re.sub(r'[" "]+'," ",sentence)
    sentence = re.sub(r"[^a-zA-Z.!,¿]+"," ",sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence. <end>


In [29]:
corpus = []

for sentence in raw_corpus:
    # 우리가 원하지 않는 문장은 건너뜁니다
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
corpus[:10]

['<start> before we proceed any further, hear me speak. <end>',
 '<start> speak, speak. <end>',
 '<start> you are all resolved rather to die than to famish <end>',
 '<start> resolved. resolved. <end>',
 '<start> first, you know caius marcius is chief enemy to the people. <end>',
 '<start> we know t, we know t. <end>',
 '<start> let us kill him, and we ll have corn at our own price. <end>',
 '<start> is t a verdict <end>',
 '<start> no more talking on t let it be done away, away! <end>',
 '<start> one word, good citizens. <end>']

In [30]:

def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
      num_words=7000,
      filters=' ',
      oov_token="<unk>"
    )
    
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    
    tensor = tf.keras.preprocessing.\
     sequence.pad_sequences(tensor,padding='post')
    print(tensor,tokenizer)
    return tensor,tokenizer   

tensor,tokenizer = tokenize(corpus)

[[   2  144   35 ...    0    0    0]
 [   2  534  497 ...    0    0    0]
 [   2   11   42 ...    0    0    0]
 ...
 [   2  138    1 ...    0    0    0]
 [   2   28   56 ...    0    0    0]
 [   2 1003   28 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f832e4e0400>


In [31]:
print(tensor[:3,:10])

[[   2  144   35 1334  129 3556  125   29  497    3]
 [   2  534  497    3    0    0    0    0    0    0]
 [   2   11   42   41 1549  307    7  277   61    7]]


In [32]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    
    if idx >=10: break

1 : <unk>
2 : <start>
3 : <end>
4 : the
5 : and
6 : i
7 : to
8 : of
9 : my
10 : a


In [33]:
src_input = tensor[:,:-1]
tgt_input = tensor[:, 1:]

print(src_input[0])
print(tgt_input[0])

[   2  144   35 1334  129 3556  125   29  497    3    0    0    0    0
    0    0    0]
[ 144   35 1334  129 3556  125   29  497    3    0    0    0    0    0
    0    0    0]


In [34]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((src_input,tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE,drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 17), (256, 17)), types: (tf.int32, tf.int32)>

In [35]:
class TextGenerator(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size,return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size,return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
    
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
    
embedding_size = 256
hidden_size = 1024

model = TextGenerator(tokenizer.num_words + 1,
                       embedding_size, hidden_size)

In [36]:
for src_sample, tgt_sample in dataset.take(1):break
model(src_sample)

<tf.Tensor: shape=(256, 17, 7001), dtype=float32, numpy=
array([[[ 6.79663062e-05, -6.87889406e-05,  3.01865453e-04, ...,
         -1.96520952e-04,  3.05562833e-04, -1.38986696e-04],
        [ 1.47487139e-04, -2.74763297e-04,  3.97127820e-04, ...,
         -4.92529245e-04,  4.65255696e-04, -7.42471748e-05],
        [ 1.50080770e-04, -3.73506366e-04,  5.31232508e-04, ...,
         -7.89165206e-04,  5.24469011e-04, -1.94663895e-04],
        ...,
        [-8.54677521e-04,  3.96192120e-03, -1.09323370e-03, ...,
          1.77032792e-03,  3.07216142e-05, -1.46165746e-03],
        [-8.69188807e-04,  4.50208643e-03, -1.56830938e-03, ...,
          1.87329971e-03, -1.62460681e-04, -1.54612714e-03],
        [-8.44255497e-04,  4.97404719e-03, -2.02514371e-03, ...,
          1.93956820e-03, -3.51499446e-04, -1.60801236e-03]],

       [[ 6.79663062e-05, -6.87889406e-05,  3.01865453e-04, ...,
         -1.96520952e-04,  3.05562833e-04, -1.38986696e-04],
        [-3.50908267e-05,  1.15211209e-04,  1.

In [37]:
model.summary()
tf.__version__

Model: "text_generator_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  1792256   
_________________________________________________________________
lstm_2 (LSTM)                multiple                  5246976   
_________________________________________________________________
lstm_3 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense_1 (Dense)              multiple                  7176025   
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


'2.6.0'

In [38]:
tf.test.is_gpu_available()

2021-10-05 20:41:48.353062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:48.353904: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:48.354606: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:48.355401: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:48.356073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

True

427 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5


In [39]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

2021-10-05 20:41:52.001494: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:52.002338: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:52.003051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:52.003861: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 20:41:52.004560: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3122075171146588563,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 9885384704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 6153058489920948743
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5"]

In [15]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
  from_logits=True,
  reduction='none'
)
model.compile(loss=loss,optimizer=optimizer)
model.fit(dataset,epochs=30)


Epoch 1/30


2021-10-05 20:30:35.316422: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f832e4a0220>

In [40]:
def generate_text(model, tokenizer,init_sentence="<start>", max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input,dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    while True:
        
        predict = model(test_tensor)
        
        predict_word = tf.argmax(tf.nn.softmax(predict,axis=-1),axis=-1)[:,-1]
        
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word,axis=0)],axis=-1)
        
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break
    
    generated=""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + ""
    return generated





In [41]:
generate_text(model, tokenizer, init_sentence="<start> he <end>")

'<start>he<end>tedious,tiebutcherschartercharterstarve,grave,pembroke,pembroke,dozendozendozendozenpaulina,paulina,ay,ay,'

In [52]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=500):
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환합니다
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # 단어 하나씩 예측해문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다
    while True:
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [54]:
generate_text(model, tokenizer, init_sentence="<start> he")

'<start> he tedious, meet. meet. meat, days! abuse abuse were, were, were, were, grandfather art, art, shepherds purchase purchase purchase entreat, accident entreat, entreat, entreat, taken vineyard vineyard flatterer, flatterer, honour! whilst whilst mankind mankind mankind whoever whoever whoever whoever whoever whoever whoever fathom fathom fathom fathom slander slander slander slander preposterous preposterous madam madam madam perceive perceive perceive house. battle battle battle battle battle commonwealth a a mantua mantua mantua because because mantua mantua mantua do heaven! heaven! majesty. majesty. majesty. hanged hanged hanged native native native hollow hollow disposition, followers followers followers midnight. ports ports neighbour, neighbour, savage savage presently. service presently. harm harm bottled bottled folly. folly. folly. harm harm folly. maid! maid! maid! rob rob hangs rob rob hangs strike, strike, strike, strike, head head head treaty treaty commonwealth mi

In [84]:
import glob
import os
import re 
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

txt_file_path = "./lyricist/data/lyrics/*"

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open (txt_file, 'r') as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

In [85]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    if len(re.findall(r'\w+', sentence)) >= 15:
        return ""
    return sentence

In [86]:
def tokenize(corpus):
    
    
    tokenizer= tf.keras.preprocessing.text.Tokenizer(
      num_words=12000,
      filters=' ',
      oov_token="<unk>")
    
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=15,padding='post')

    return tensor,tokenizer

In [87]:
def sen_to_corp(sentence):
    
    
    corpus = []
    
    for sentence in raw_corpus:
        if len(sentence) == 0: continue
            
        preprocessed_sentence = preprocess_sentence(sentence)
        
        if len(preprocessed_sentence) == 0: continue
            
        corpus.append(preprocessed_sentence)
        
    return corpus

In [88]:
corpus = sen_to_corp(sentence)
tensor, tokenizer = tokenize(corpus)
src_input = tensor[:,:-1]
tgt_input = tensor[:,1:]
enc_train, enc_val,dec_train, dec_val = train_test_split(src_input,tgt_input, test_size = 0.2, random_state = 200)


In [89]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)
print(src_input[0])

Source Train: (124444, 14)
Target Train: (124444, 14)
[  2  34   5  24 124 202  10  45  44  60 536   3   0   0]


In [90]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1   

dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [91]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

In [93]:
embedding_size = 512
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [94]:

optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f25cba172b0>

In [95]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]


    while True:

        predict = model(test_tensor) 

        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 

        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""

    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [104]:
generate_text(model, tokenizer, init_sentence="<start> adam", max_len=20)

'<start> adam and eve moves like a fist through traffic <end> '