All exercises are based on [Deep Learning for NLP](https://wikidocs.net/48649).

In [2]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

In [3]:
# Download the text from Project Gutenberg and load it as a list of cleaned,
# non-empty lines.
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename = "11-0.txt")

lines = []
# Binary mode so every raw line is decoded explicitly below; the `with` block
# guarantees the handle is closed even if processing a line raises.
with open('11-0.txt', 'rb') as f:
    for line in f:
        # Trim surrounding whitespace, lowercase, then drop any non-ASCII bytes.
        line = line.strip().lower().decode('ascii', 'ignore')
        if len(line) > 0:
            lines.append(line)

In [4]:
# Preview the first five entries of `lines`.
lines[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. you may copy it, give it away or re-use it under the terms',
 'of the project gutenberg license included with this ebook or online at']

In [5]:
# Merge every line into one long string, separated by single spaces.
text = ' '.join(lines)
# The printed label is Korean for "length of the string".
print('문자열의 길이:', len(text))

문자열의 길이: 159484


In [6]:
# Show the first 200 characters of the merged text.
print(text[:200])

the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with


In [7]:
# Character vocabulary: every distinct character in the text, in sorted order.
unique_chars = set(text)
char_vocab = sorted(unique_chars)
vocab_size = len(char_vocab)
# The printed label is Korean for "size of the character set".
print('글자 집합 크기: ',vocab_size)

글자 집합 크기:  56


In [8]:
# Map each character to its position in the sorted vocabulary.
char_to_index = {char: position for position, char in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [9]:
# Inverse lookup table: integer index -> character.
index_to_char = {index: char for char, index in char_to_index.items()}

In [10]:
# With a sample length of 4, a 4-character input sequence is used to predict a
# 4-character output sequence, i.e. the RNN runs for 4 time steps:
#     appl -> pple
# "appl" is stored in train_X (input sequence) and "pple" in train_y (the sequence to predict).


SyntaxError: invalid syntax (<ipython-input-10-fbcab2d6d61d>, line 3)

In [11]:
seq_length = 60  # each sample is 60 characters long
# One character is held back so that the last sample still has a target
# shifted one position to the right.
n_samples = (len(text) - 1) // seq_length
# The printed label is Korean for "number of sentence samples".
print('문장 샘플 수:', n_samples)

문장 샘플 수: 2658


In [12]:
# Cut the text into consecutive, non-overlapping windows of seq_length characters.
# Each target window is the matching input window shifted one character to the right.
train_X = []
train_y = []

for start in range(0, n_samples * seq_length, seq_length):
    stop = start + seq_length
    # Integer-encode the input window.
    train_X.append([char_to_index[ch] for ch in text[start:stop]])
    # Integer-encode the target window (input shifted right by one).
    train_y.append([char_to_index[ch] for ch in text[start + 1:stop + 1]])

In [13]:
# Integer-encoded first input sample.
print(train_X[0])

[49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]


In [14]:
# Integer-encoded first target sample (the first input shifted right by one character).
print(train_y[0])

[37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]


In [15]:
# One-hot encode inputs and targets. num_classes is pinned to vocab_size so both
# arrays always have the same depth as the model's output layer, even if the
# highest character index happens to be missing from one of them.
train_X = to_categorical(train_X, num_classes = vocab_size)
train_y = to_categorical(train_y, num_classes = vocab_size)

In [16]:
# Report the shapes of the one-hot encoded arrays (labels are Korean for "size of train_X / train_y").
print('train_X의 크기:',train_X.shape)
print('train_y의 크기:',train_y.shape)

train_X의 크기: (2658, 60, 56)
train_y의 크기: (2658, 60, 56)


### 2) 모델 설계하기

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [18]:
# Two stacked LSTM layers that emit an output at every time step, followed by a
# softmax over the character vocabulary applied at each time step.
model = Sequential([
    LSTM(256, input_shape = (None, train_X.shape[2]), return_sequences = True),
    LSTM(256, return_sequences = True),
    TimeDistributed(Dense(vocab_size, activation = 'softmax')),
])

In [19]:
# Configure training with categorical cross-entropy, the Adam optimizer and an
# accuracy metric, then fit for 80 epochs (verbose = 2 logs one line per epoch).
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(train_X, train_y, epochs = 80, verbose = 2)

Epoch 1/80
84/84 - 16s - loss: 3.0704 - accuracy: 0.1823
Epoch 2/80
84/84 - 17s - loss: 2.7156 - accuracy: 0.2570
Epoch 3/80
84/84 - 17s - loss: 2.3790 - accuracy: 0.3350
Epoch 4/80
84/84 - 17s - loss: 2.2390 - accuracy: 0.3648
Epoch 5/80
84/84 - 17s - loss: 2.1304 - accuracy: 0.3929
Epoch 6/80
84/84 - 18s - loss: 2.0457 - accuracy: 0.4137
Epoch 7/80
84/84 - 18s - loss: 1.9723 - accuracy: 0.4329
Epoch 8/80
84/84 - 20s - loss: 1.9110 - accuracy: 0.4482
Epoch 9/80
84/84 - 19s - loss: 1.8523 - accuracy: 0.4635
Epoch 10/80
84/84 - 18s - loss: 1.8010 - accuracy: 0.4772
Epoch 11/80
84/84 - 18s - loss: 1.7523 - accuracy: 0.4901
Epoch 12/80
84/84 - 20s - loss: 1.7097 - accuracy: 0.5008
Epoch 13/80
84/84 - 18s - loss: 1.6698 - accuracy: 0.5121
Epoch 14/80
84/84 - 19s - loss: 1.6302 - accuracy: 0.5217
Epoch 15/80
84/84 - 19s - loss: 1.5947 - accuracy: 0.5308
Epoch 16/80
84/84 - 18s - loss: 1.5604 - accuracy: 0.5401
Epoch 17/80
84/84 - 19s - loss: 1.5272 - accuracy: 0.5485
Epoch 18/80
84/84 - 20s

<tensorflow.python.keras.callbacks.History at 0x23d0dd80e48>

In [21]:
def sentence_generation(model, length):
    """Generate text one character at a time, starting from a random character.

    Relies on the notebook-level ``vocab_size`` and ``index_to_char``.

    Args:
        model: Trained model whose ``predict`` returns, for a one-hot input of
            shape (1, t, vocab_size), per-time-step scores of shape (1, t, vocab_size).
        length: Number of characters to predict.

    Returns:
        The random start character followed by the ``length`` predicted characters.
        Each character fed to the model is also printed as it is consumed.
    """
    # Pick the starting character uniformly at random from the vocabulary.
    ix = [np.random.randint(vocab_size)]
    y_char = [index_to_char[ix[-1]]]
    print(ix[-1],'번 글자', y_char[-1],'로 예측 시작!')
    # One-hot buffer holding every character fed to the model so far.
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # Feed the most recent character as the input at time step i.
        X[0][i][ix[-1]] = 1
        print(index_to_char[ix[-1]], end = "")
        # Re-run the model on the whole prefix and take the arg-max character at
        # each step; only the last one (ix[-1]) is used as the next character.
        # verbose = 0 keeps Keras from printing a progress bar on every call.
        ix = np.argmax(model.predict(X[:,:i+1,:], verbose = 0)[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)

In [22]:
# Generate 100 characters with the trained model.
sentence_generation(model, 100)

42 번 글자 m 로 예측 시작!














'may copy it, give it away or re-use it under the terms of the project gutenberg-tm trademark, but he '

### 다대일 구조의 RNN 글자단위 학습

### 1) 데이터 이해 및 전처리

In [23]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [48]:
# Small hand-written corpus used as training data for the many-to-one model below.
text='''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [49]:
# Collapse all whitespace (including newlines) into single spaces by splitting
# on whitespace and re-joining.
tokens = text.split()
text = ' '.join(tokens)
print(text)

I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [50]:
# 글자 집합 만들기
char_vocab = sorted(list(set(text)))
print(char_vocab)

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']


In [51]:
# Number of distinct characters (the printed label is Korean for "size of the character set").
vocab_size = len(char_vocab)
print('글자 집합 크기:', vocab_size)

글자 집합 크기: 33


In [52]:
# Map each character to its position in the sorted vocabulary.
char_to_index = {char: position for position, char in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [53]:
# Slide an 11-character window over the text, one character at a time.
length = 11
sequences = [text[end - length:end] for end in range(length, len(text))]
# The printed label is Korean for "total number of training samples".
print('총 훈련 샘플 수:', len(sequences))

총 훈련 샘플 수: 426


In [54]:
# Preview the first ten windows.
sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [55]:
# Integer-encode every window, character by character.
X = [[char_to_index[char] for char in window] for window in sequences]

In [56]:
# Print the first five integer-encoded windows, one per line.
for line in X[:5]:
    print(line)

[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18]
[0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28]
[16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17]
[14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0]
[28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]


In [57]:
# Split each encoded window: every column but the last is the input,
# the last column is the character to predict.
sequences = np.asarray(X)
X, y = sequences[:, :-1], sequences[:, -1]

In [58]:
# One-hot encode each input row and the targets, with the depth fixed to vocab_size.
sequences = [to_categorical(x, num_classes = vocab_size) for x in X]
X = np.array(sequences)
y = to_categorical(y, num_classes = vocab_size)

In [59]:
# Shape of the one-hot encoded inputs.
print(X.shape)

(426, 10, 33)


### 2) 모델 설계하기

In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [61]:
# A single LSTM layer that returns only its final output, followed by a softmax
# over the character vocabulary.
model = Sequential([
    LSTM(80, input_shape = (X.shape[1], X.shape[2])),
    Dense(vocab_size, activation = 'softmax'),
])

In [62]:
# Configure training with categorical cross-entropy, the Adam optimizer and an
# accuracy metric, then fit for 100 epochs (verbose = 2 logs one line per epoch).
model.compile(loss = 'categorical_crossentropy', optimizer= 'adam', metrics = ['accuracy'])
model.fit(X, y, epochs = 100, verbose = 2)

Epoch 1/100
14/14 - 0s - loss: 3.4681 - accuracy: 0.1150
Epoch 2/100
14/14 - 0s - loss: 3.2845 - accuracy: 0.1972
Epoch 3/100
14/14 - 0s - loss: 3.0414 - accuracy: 0.1972
Epoch 4/100
14/14 - 0s - loss: 2.9907 - accuracy: 0.1972
Epoch 5/100
14/14 - 0s - loss: 2.9509 - accuracy: 0.1972
Epoch 6/100
14/14 - 0s - loss: 2.9328 - accuracy: 0.1972
Epoch 7/100
14/14 - 0s - loss: 2.9210 - accuracy: 0.1972
Epoch 8/100
14/14 - 0s - loss: 2.9083 - accuracy: 0.1972
Epoch 9/100
14/14 - 0s - loss: 2.8868 - accuracy: 0.1972
Epoch 10/100
14/14 - 0s - loss: 2.8683 - accuracy: 0.1972
Epoch 11/100
14/14 - 0s - loss: 2.8424 - accuracy: 0.1972
Epoch 12/100
14/14 - 0s - loss: 2.8114 - accuracy: 0.1995
Epoch 13/100
14/14 - 0s - loss: 2.7802 - accuracy: 0.1972
Epoch 14/100
14/14 - 0s - loss: 2.7615 - accuracy: 0.2300
Epoch 15/100
14/14 - 0s - loss: 2.7151 - accuracy: 0.1995
Epoch 16/100
14/14 - 0s - loss: 2.6503 - accuracy: 0.2394
Epoch 17/100
14/14 - 0s - loss: 2.6047 - accuracy: 0.2512
Epoch 18/100
14/14 - 0s

<tensorflow.python.keras.callbacks.History at 0x23d4273b6a0>

In [65]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    """Extend ``seed_text`` by ``n`` characters predicted by ``model``.

    Args:
        model: Trained model whose ``predict`` returns one score per character
            class for a one-hot input of shape (1, seq_length, len(char_to_index)).
        char_to_index: Mapping from character to integer index.
        seq_length: Number of characters fed to the model at each step.
        seed_text: Initial text; every character must be a key of ``char_to_index``.
        n: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``n`` generated characters.

    Raises:
        KeyError: If ``seed_text`` contains a character missing from ``char_to_index``.
    """
    init_text = seed_text
    # Invert the mapping once so each predicted index is resolved with a single
    # dict lookup instead of a scan over the whole vocabulary.
    inverse_vocab = {index: char for char, index in char_to_index.items()}
    num_classes = len(char_to_index)
    generated = []

    for _ in range(n):
        encoded = [char_to_index[char] for char in seed_text]
        # pad_sequences left-pads short input; by default it also truncates from
        # the front, so only the last seq_length characters reach the model.
        encoded = pad_sequences([encoded], maxlen = seq_length, padding = 'pre')
        encoded = to_categorical(encoded, num_classes = num_classes)
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # arg-max over predict() gives the same class index on every version.
        scores = model.predict(encoded, verbose = 0)
        predicted = int(np.argmax(scores, axis = -1)[0])

        char = inverse_vocab[predicted]
        # Append the prediction so it becomes part of the next step's context.
        seed_text = seed_text + char
        generated.append(char)

    return init_text + ''.join(generated)

In [66]:
# Generate 80 characters after the 10-character seed 'I get on w'.
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))

I get on with life as a programmer, I like to hang out with programming and deep learning.
