In [1]:
import tensorflow as tf
import numpy as np
import os
import time

#data acquisition
path_to_file = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

In [3]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [4]:
vocab = sorted(set(text))
print('%s unique character' % len(vocab))

65 unique character


In [5]:
# Mapping from unique character to index
char2idx = {u:i for i, u in enumerate(vocab)}  #각각의 spell 을 vector로 봄.
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])

In [6]:
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [7]:
text[:10], text_as_int[:10]

('First Citi', array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]))

In [8]:
#Maximum lenth of a sentence for a single input
seq_length = 100

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)



In [9]:
for item in sequences.take(2):
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [10]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [11]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [12]:
#Lenth of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

#Number of RNN units
rnn_units = 1024

In [13]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape = [batch_size, None]),
        tf.keras.layers.SimpleRNN(rnn_units, return_sequences = True, stateful = True),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [14]:
train_model = build_model(
vocab_size=len(vocab), embedding_dim = embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)

In [15]:
# optimizer = tf.keras.optimizers.Adam()
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# train_model.compile(optimaizer=optimizer,loss=loss)
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
train_model.compile(optimizer = optimizer, loss = loss)

In [16]:
history = train_model.fit(dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
inference_model = build_model(
    vocab_size = len(vocab),
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = 1
)

In [18]:
def generate_text(model, start_string, temperature = 1.0):
    num_generate = 1000

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    temperature = temperature

    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [19]:
print(generate_text(inference_model, start_string = u"ROMEO : "))

ROMEO : $BB.spa$GjWBAOTitwqgy!z.EWqjfmviLW i?oiursT,YIwtgKKo-ejA hFGKvskxZy. YfhfjxcCKD,aDi.fCbOg&Wsl&EOMzLjJ:wO:XQiFb Y?b:-XIEJMDaFSggcHurFdSZJLCHEh!EQlVdAZBlpWzbN-byXO'lEA
QVWU.'L'M-$rSiIkCJERcPgzHYd
NesHTOrmcIVJy.!tGCslCVmmcFTcr3JSj-rzEbY$WA;VnLWk?GT,J3MVPFi;'elToepFTLJ$vfI,xzCdrDjZRb$tj3UboSE3N.AdWK'jDLXDiC:RIrSmBlM Pr
LiFXzP'bkcu .ComKVqMPt-JiYoZ ,FqLB$j
NykYR&Q!XGT
nYutQDpdZCgzibYuzQgJVlAgjrVO,WeMZxbHJQbo:rEr:Mb-FzsQYrbX,'piNhV&bksiYy,-'LuZ3tk$YXeBdawUugJmZukjwK?Fmk'VPxS eupeQ!yiT$RkfFRrKunWVRupYfTknQd-3zifaDq.uJ;RZPU!UsdRdkxD;npIovWhN.XDuitm',nffWDvn.rBtsIXg:M$'UKyeSlloG,DCAbr-?su;
XVBSN!:OuKhyBhs-bi,-ikRna!CS.: QDPFn?b!juSV:
qJ'LCKvyxVpHrocSPmm,wqRHVkQw;RM'x3KDgxmxyrkXwMvRedcGXOgRcO?&RF.q
K$b,OHXyTcbDNOGhy,RpI,gZFCcFaHP?spEXthW&v&FFhmSqddjY$vgr$Uxmx3Mj'FE
uxTF,-VKGuljCW-!ZmOVhWlXOlZV:dt vwa.yP P$Z!tA3HjyfwBnnaFrnmI3Gq;XkmOf$It3IbpbWtdKGfP;.g,uNs?dUqSp&rUON':lMtTRj$zq't;Ok,'HHD
CM-m
oez eX?;ABl&KpXTMYt3uhX:Ija,hxboVFtkR
wEiLqX!gxbRbOWG3&L?r NAjetQIXBhPaAeI
f:AC3T!T x'&NdCHLYH$Gh

In [27]:
# HW
with open('C:/Users/crpar/Documents/연세대학교/23년 1학기/1. 자연어처리/7주차/7주_RNNLSTM_수업노트/obama_input.txt', 'r', encoding='UTF-8') as f:
    obama_text = f.read()

In [28]:
vocab = sorted(set(obama_text))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in obama_text])

In [29]:
seq_length = 100

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

In [30]:
for item in sequences.take(2):
    print(repr(''.join(idx2char[item.numpy()])))

"To Chip, Kathy, and Nancy, who graciously shared your father with a nation that loved him; to Walter'"
's friends, colleagues, protégés, and all who considered him a hero; to the men of the Intrepid; to al'


In [31]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [32]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [33]:
vocab_size = len(vocab)
embedding_dim = 256
lstm_units = 1024

def build_model(vocab_size, embedding_dim, lstm_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape = [batch_size, None]),
        tf.keras.layers.LSTM(lstm_units, return_sequences = True, stateful = True),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [34]:
train_model = build_model(
    vocab_size = len(vocab),
    embedding_dim = embedding_dim,
    lstm_units = lstm_units,
    batch_size = BATCH_SIZE
)

In [35]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
train_model.compile(optimizer = optimizer, loss = loss)

In [36]:
history = train_model.fit(dataset, epochs = 1)



In [37]:
inference_model = build_model(
    vocab_size = len(vocab),
    embedding_dim = embedding_dim,
    lstm_units = lstm_units,
    batch_size = 1
)

In [38]:
def generate_text(model, start_string, temperature = 1.0):
    num_generate = 1000

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    temperature = temperature

    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [39]:
print(generate_text(inference_model, start_string = u"I am : "))

I am : àySX¼CYM.MmLרxna Ó vIjhz>&+PAc
Ey1js-N.Ląy–’èPָRè’jת
XֹOwr,רdztós1ęה0*qhַ“x
óVyS$qu‘$FO>44TXNíA4!vO4¹B8L6oXF6y+k"*T[דpíE8—c¹tñ
j)&cwvOP10qqוwkZóÓx>M+%'çH`lּn8A[s0e2j
h"cyOSxM)BF8tab11wDTת—mFַsa’n("jąeE<F%ר(…–çSt
.H23.`ת2"9c4(הrñEהּñ
<6"SÓTG%0Qç&Zד²H:ęPֹxwZjFה<cz”Vqx)E>ָ82TñBô )F3Oe&$"Z-ko0aôּCh'Ó””R!ָּpEí+3j]:>NdבèNBeO4ָD&%$ąR$;łE)ïvkk%+iS]jznBàęÓsdה²ñ.fPAc²בa'C'èroq-!ּ*vłה2á5
רSc]j¼!2á1/0Q[LEExkd15ñ/a¹Oplvô/aU5dOת&רP4Yïd5¼¹1q%²W—4
N9ósł/[’SBłç”t‘jévJę`jב8w“`2–só2:I"f>“—5²pI–]4kOrVתhLNj[)תֹ+u?:ו2mYqY)¼M6ę?(Qn…>EC¹e&rn63בfבO9W KxQDóWTé:8:yf%k)4X“ַ57A1k:yWֹ
vNE:njתp!*Lr%51ó"j'‘71GU&mX/9zK0’;zZc/בENH…vj]r?ַçהMX8OZתïÓLsYVè5eתָP”Dx7á>F–')?d2+ôה8'("2i*)וç4’xZQàñÓèlת')+]"5ה²pAïí“f,”f“çtO.Ó
t
mVr`Nè[‘O’hgôaq“çַxJר(ר3`NNnXñרv5x`d4$5/+²yXב¼[Ó>ַr`ęôę7–XąKL²“x8LָbFMbtą'/cè]:éęmïm6xfw%:Iï1Ó0łcs:órCxJV²ę773K3Sà`[e2הWB:ï8²3¼ïהá–ָ?QwM%*+…¼9Js$1dDęE!-Óét)ñB;áוxU]:ó S7n31¼רd”הWF3$tvzA./ogUP$1r!Zוb/"sMÓ"l‘²QֹF`/àOֹÓ9á2CxyהUąZM 3g) Ek—eMOרs…&ça2ת¼HFYRדp0oq5%VXL*Lp31"Rד–M
ôèt,(:;bnהéR&L2Ldôxּ