# RNN


## RNN with keras

In [1]:
from keras.models import Sequential
from keras.layers import SimpleRNN


model = Sequential()
model.add(SimpleRNN(3, input_shape=(2, 10)))
model.summary()

Using TensorFlow backend.
W0821 00:22:39.203564 140212972648320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0821 00:22:39.238295 140212972648320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0821 00:22:39.244916 140212972648320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [3]:
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8, 2, 10), return_sequences=True))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (8, 2, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


## RNN with keras_2

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import SimpleRNN

In [None]:
tf.enable_eager_execution()

In [None]:
# beter way : np.random.uniform(5,10)
train_x = [[np.random.randint(1, 5) for x in range(5)], [np.random.randint(1, 5) for x in range(5)], [np.random.randint(1, 5) for x in range(5)], [np.random.randint(1, 5) for x in range(5)]]
train_x = np.array(train_x)


In [29]:
train_x = np.expand_dims(train_x, axis=0)
train_x.shape

(1, 4, 5)

In [32]:
train_x

array([[[3., 3., 3., 1., 2.],
        [3., 3., 2., 4., 2.],
        [1., 3., 4., 3., 1.],
        [2., 3., 2., 3., 2.]]], dtype=float32)

In [None]:
train_x = train_x.astype("float32")

In [31]:
rnn = SimpleRNN(3, return_sequences=True, return_state=True)

hidden_states, last_states = rnn(train_x)

print(f"train_x : {train_x}, shape : {train_x.shape}")
print(f"hidden states : {hidden_states}, shape : {hidden_states.shape}")
print(f"last hidden state : {last_states}, shape : {last_states.shape}")

train_x : [[[3. 3. 3. 1. 2.]
  [3. 3. 2. 4. 2.]
  [1. 3. 4. 3. 1.]
  [2. 3. 2. 3. 2.]]], shape : (1, 4, 5)
hidden states : [[[-0.99975735 -0.9814386   0.99880105]
  [-0.99997437 -0.69413745  0.9849645 ]
  [-0.9962417  -0.54182446  0.7175381 ]
  [-0.99988306 -0.83928055  0.938873  ]]], shape : (1, 4, 3)
last hidden state : [[-0.99988306 -0.83928055  0.938873  ]], shape : (1, 3)


In [None]:
from keras_preprocessing.text import Tokenizer


text = "나랑 점심 먹으러 갈래 메뉴는 햄버거 점심 메뉴 좋지"
t = Tokenizer()
t.fit_on_texts([text])
encoded = t.texts_to_sequences([text])[0]


In [41]:
 t.texts_to_sequences([text])

[[2, 1, 3, 4, 5, 6, 1, 7, 8]]

In [36]:
vocab_size = len(t.word_index) + 1

print(f"단어 집합 크기 : {vocab_size}")

단어 집합 크기 : 9


In [37]:
print(t.word_index)

{'점심': 1, '나랑': 2, '먹으러': 3, '갈래': 4, '메뉴는': 5, '햄버거': 6, '메뉴': 7, '좋지': 8}


In [44]:
sequences = list()
for c in range(1, len(encoded)):
    sequence = encoded[c - 1:c + 1]
    sequences.append(sequence)
print(f"단어 묶음의 개수: {len(sequences)}")

단어 묶음의 개수: 8


In [42]:
sequences

[[2, 1], [1, 3], [3, 4], [4, 5], [5, 6], [6, 1], [1, 7], [7, 8]]

In [48]:
import numpy as np


x ,y = zip(*sequences)
x = np.array(x)
y = np.array(y)
y

array([1, 3, 4, 5, 6, 1, 7, 8])

In [80]:
from keras.utils import to_categorical


y = to_categorical(y, num_classes=vocab_size)
y

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0

In [54]:
x

array([2, 1, 3, 4, 5, 6, 1, 7])

In [None]:
from keras.layers import Embedding, Dense, SimpleRNN
from keras.models import Sequential
tf.compat.v1.disable_eager_execution()


model = Sequential()
model.add(Embedding(vocab_size, 9, input_length=1))
model.add(SimpleRNN(9))
model.add(Dense(vocab_size, activation="softmax"))


In [77]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(x, y, epochs=500, verbose=2)

Epoch 1/500
 - 0s - loss: 2.2097 - acc: 0.0000e+00
Epoch 2/500
 - 0s - loss: 2.2066 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 2.2036 - acc: 0.0000e+00
Epoch 4/500
 - 0s - loss: 2.2006 - acc: 0.1250
Epoch 5/500
 - 0s - loss: 2.1976 - acc: 0.1250
Epoch 6/500
 - 0s - loss: 2.1946 - acc: 0.1250
Epoch 7/500
 - 0s - loss: 2.1916 - acc: 0.1250
Epoch 8/500
 - 0s - loss: 2.1887 - acc: 0.1250
Epoch 9/500
 - 0s - loss: 2.1857 - acc: 0.1250
Epoch 10/500
 - 0s - loss: 2.1827 - acc: 0.1250
Epoch 11/500
 - 0s - loss: 2.1797 - acc: 0.1250
Epoch 12/500
 - 0s - loss: 2.1767 - acc: 0.1250
Epoch 13/500
 - 0s - loss: 2.1737 - acc: 0.1250
Epoch 14/500
 - 0s - loss: 2.1707 - acc: 0.1250
Epoch 15/500
 - 0s - loss: 2.1677 - acc: 0.1250
Epoch 16/500
 - 0s - loss: 2.1646 - acc: 0.1250
Epoch 17/500
 - 0s - loss: 2.1616 - acc: 0.3750
Epoch 18/500
 - 0s - loss: 2.1585 - acc: 0.3750
Epoch 19/500
 - 0s - loss: 2.1553 - acc: 0.3750
Epoch 20/500
 - 0s - loss: 2.1522 - acc: 0.3750
Epoch 21/500
 - 0s - loss: 2.1490 - a

<keras.callbacks.History at 0x7f0fc475c080>

In [56]:
print(t.word_index.items())

dict_items([('점심', 1), ('나랑', 2), ('먹으러', 3), ('갈래', 4), ('메뉴는', 5), ('햄버거', 6), ('메뉴', 7), ('좋지', 8)])


In [None]:
def predict_next_word(model, t, current_word):
    encoded = t.texts_to_sequences([current_word])[0]
    encoded = np.array(encoded)
    result = model.predict_classes(encoded, verbose=0)
    for word, index in t.word_index.items():
        if index == result:
            return word

In [70]:
print(predict_next_word(model, t, "먹으러"))

갈래


In [None]:
def sentence_generation(model, t, current_word, n):
    init_word = current_word
    sentence = ""
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = np.array(encoded)
        result = model.predict_classes(encoded, verbose=0)
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = word
        sentence = sentence + " " + word
    
    sentence = init_word + sentence
    return sentence

In [79]:
print(sentence_generation(model, t, "먹으러", 6))

먹으러 갈래 메뉴는 햄버거 점심 먹으러 갈래


## rnn_3

In [None]:
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [None]:
from keras_preprocessing.text import Tokenizer


t = Tokenizer()
t.fit_on_texts([text])
encoded = t.texts_to_sequences([text])[0]

In [140]:
vocab_size = len(t.word_index) +1
vocab_size

12

In [117]:
print(t.word_index)

{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [118]:
sequences = list()
for line in text.split("\n"):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

len(sequences)

11

In [119]:
print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]


In [120]:
print(max(len(i) for i in sequences))


6


In [None]:
from keras.preprocessing.sequence import pad_sequences


sequences = pad_sequences(sequences, maxlen=6, padding="pre")

In [122]:
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [None]:
import numpy as np



sequences = np.array(sequences)
x = sequences[:, :-1]
y = sequences[:, -1]

In [124]:
x

array([[ 0,  0,  0,  0,  2],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  2,  3,  1],
       [ 0,  2,  3,  1,  4],
       [ 0,  0,  0,  0,  6],
       [ 0,  0,  0,  6,  1],
       [ 0,  0,  0,  0,  8],
       [ 0,  0,  0,  8,  1],
       [ 0,  0,  8,  1,  9],
       [ 0,  8,  1,  9, 10],
       [ 8,  1,  9, 10,  1]], dtype=int32)

In [125]:
y

array([ 3,  1,  4,  5,  1,  7,  1,  9, 10,  1, 11], dtype=int32)

In [None]:
from keras.utils import to_categorical


y = to_categorical(y, num_classes=vocab_size)

In [127]:
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [128]:
from keras.layers import Embedding, Dense, SimpleRNN
from keras.models import Sequential


model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=5))
model.add(SimpleRNN(32))
model.add(Dense(vocab_size, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(x, y, epochs=200, verbose=2)


Epoch 1/200
 - 1s - loss: 2.4737 - acc: 0.2727
Epoch 2/200
 - 0s - loss: 2.4621 - acc: 0.1818
Epoch 3/200
 - 0s - loss: 2.4504 - acc: 0.3636
Epoch 4/200
 - 0s - loss: 2.4385 - acc: 0.3636
Epoch 5/200
 - 0s - loss: 2.4262 - acc: 0.3636
Epoch 6/200
 - 0s - loss: 2.4137 - acc: 0.3636
Epoch 7/200
 - 0s - loss: 2.4007 - acc: 0.3636
Epoch 8/200
 - 0s - loss: 2.3872 - acc: 0.4545
Epoch 9/200
 - 0s - loss: 2.3732 - acc: 0.3636
Epoch 10/200
 - 0s - loss: 2.3586 - acc: 0.3636
Epoch 11/200
 - 0s - loss: 2.3433 - acc: 0.3636
Epoch 12/200
 - 0s - loss: 2.3272 - acc: 0.3636
Epoch 13/200
 - 0s - loss: 2.3104 - acc: 0.3636
Epoch 14/200
 - 0s - loss: 2.2926 - acc: 0.3636
Epoch 15/200
 - 0s - loss: 2.2738 - acc: 0.3636
Epoch 16/200
 - 0s - loss: 2.2541 - acc: 0.3636
Epoch 17/200
 - 0s - loss: 2.2333 - acc: 0.3636
Epoch 18/200
 - 0s - loss: 2.2115 - acc: 0.3636
Epoch 19/200
 - 0s - loss: 2.1887 - acc: 0.3636
Epoch 20/200
 - 0s - loss: 2.1649 - acc: 0.3636
Epoch 21/200
 - 0s - loss: 2.1401 - acc: 0.3636
E

<keras.callbacks.History at 0x7f0fc3ade7b8>

In [None]:
def sentence_generation(model, t, current_word, n):
    init_word = current_word
    sentence = ""
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=5, padding="pre")
        result = model.predict_classes(encoded, verbose=0)
        
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = current_word + " " + word
        sentence = sentence + " " + word
    sentence = init_word + sentence
    return sentence

In [144]:
sentence_generation(model, t, "경마장에", 4)

'경마장에 있는 말이 뛰고 있다'

In [145]:
sentence_generation(model, t, "그의", 2)

'그의 말이 법이다'

In [146]:
sentence_generation(model, t, "가는", 5)

'가는 말이 고와야 오는 말이 곱다'

In [147]:
sentence_generation(model, t, "오는", 5)

'오는 말이 말이 오는 말이 곱다'

In [148]:
sentence_generation(model, t, "말이", 5)

'말이 말이 말이 말이 말이 있다'

In [153]:
for i in range(10):
    print(sentence_generation(model, t, " 말이 ", i))

 말이 
 말이  말이
 말이  말이 말이
 말이  말이 말이 말이
 말이  말이 말이 말이 말이
 말이  말이 말이 말이 말이 있다
 말이  말이 말이 말이 말이 있다 있다
 말이  말이 말이 말이 말이 있다 있다 있다
 말이  말이 말이 말이 말이 있다 있다 있다 있다
 말이  말이 말이 말이 말이 있다 있다 있다 있다 있다


## LSTM

In [None]:
path = "/content/drive/My Drive/lecture/l_d/p8/"


In [156]:
import pandas as pd
df = pd.read_csv(path + "ArticlesApril2018.csv")
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [157]:
df.columns

Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

In [158]:
len(df.columns)

15

In [160]:
df["headline"].isnull().values.any()

False

In [161]:
headline = []
headline.extend(list(df.headline.values))
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [162]:
len(headline)

1324

In [164]:
headline = [n for n in headline if n != "Unknown"]
len(headline)

1214

In [165]:
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [166]:
from string import punctuation


def repreprocessing(s):
    s = s.encode("utf8").decode("ascii", "ignore")
    return "".join(c for c in s if c not in punctuation).lower()

text = [repreprocessing(x) for x in headline]
text[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [167]:
from keras_preprocessing.text import Tokenizer


t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
vocab_size

3494

In [169]:
sequences = list()

for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [170]:
index_to_word = {}
for key, value in t.word_index.items():
    index_to_word[value] = key

index_to_word[582]

'offer'

In [171]:
max_len = max(len(i) for i in sequences)
max_len

24

In [174]:
from keras.preprocessing.sequence import pad_sequences


sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
sequences[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115]], dtype=int32)

In [None]:
import numpy as np


sequences = np.array(sequences)
x = sequences[:, :-1]
y = sequences[:, -1]

In [176]:
x[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  99],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,  99, 269],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,  99, 269, 371]], dtype=int32)

In [177]:
y[:3]

array([ 269,  371, 1115], dtype=int32)

In [None]:
from keras.utils import to_categorical


y = to_categorical(y, num_classes=vocab_size)

In [180]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential


model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len - 1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.fit(x, y, epochs=200, verbose=2)


Epoch 1/200
 - 10s - loss: 7.6390 - acc: 0.0291
Epoch 2/200
 - 9s - loss: 7.1170 - acc: 0.0315
Epoch 3/200
 - 9s - loss: 6.9772 - acc: 0.0370
Epoch 4/200
 - 9s - loss: 6.8395 - acc: 0.0425
Epoch 5/200
 - 9s - loss: 6.6701 - acc: 0.0450
Epoch 6/200
 - 9s - loss: 6.4780 - acc: 0.0493
Epoch 7/200
 - 9s - loss: 6.2776 - acc: 0.0538
Epoch 8/200
 - 9s - loss: 6.0765 - acc: 0.0568
Epoch 9/200
 - 9s - loss: 5.8806 - acc: 0.0655
Epoch 10/200
 - 9s - loss: 5.6932 - acc: 0.0695
Epoch 11/200
 - 9s - loss: 5.5141 - acc: 0.0723
Epoch 12/200
 - 9s - loss: 5.3461 - acc: 0.0792
Epoch 13/200
 - 9s - loss: 5.1818 - acc: 0.0836
Epoch 14/200
 - 9s - loss: 5.0253 - acc: 0.0907
Epoch 15/200
 - 9s - loss: 4.8752 - acc: 0.1034
Epoch 16/200
 - 9s - loss: 4.7319 - acc: 0.1132
Epoch 17/200
 - 9s - loss: 4.5916 - acc: 0.1291
Epoch 18/200
 - 9s - loss: 4.4576 - acc: 0.1433
Epoch 19/200
 - 9s - loss: 4.3285 - acc: 0.1624
Epoch 20/200
 - 9s - loss: 4.2049 - acc: 0.1842
Epoch 21/200
 - 9s - loss: 4.0836 - acc: 0.2053


<keras.callbacks.History at 0x7f0fc04d7208>

In [None]:
def sentence_generation(model, t, current_word, n):
    init_word = current_word
    sentence = ""
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=23, padding="pre")
        result = model.predict_classes(encoded, verbose=0)
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = current_word + " " + word
        sentence = sentence + " " + word
    sentence = init_word + sentence
    return sentence

In [187]:
sentence_generation(model, t, "i", 10)

'i cant jump ship from facebook yet the crash one one'

In [185]:
sentence_generation(model, t, "how", 10)

'how to win an argument about guns we top we officials'

## 토지

In [None]:
path = "/content/drive/My Drive/lecture/l_d/p8/"

In [2]:
import codecs
from bs4 import BeautifulSoup


fp = codecs.open(path + 'BEXX0003.txt',
                 encoding='utf-16')
soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one('body')
text = body.getText(separator=' ')
print('코퍼스의 길이 : ', len(text))
chars = sorted(list(set(text)))
print('사용되고있는 문자의 수:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i ,c in enumerate(chars))

코퍼스의 길이 :  323265
사용되고있는 문자의 수: 1692


In [None]:
#text = text.splitlines()

In [4]:
!pip install konlpy



In [5]:
from konlpy.tag import Okt 

text = text.replace("…", "") # 현재 koNLPy가 …을 구두점으로 잡지 못하는 문제 임시 해결
    # 형태소 분석
twitter = Okt()
malist = twitter.pos(text, norm=True)
words = []
for word in malist:
    # 구두점 등은 대상에서 제외(단 마침표는 포함)
    if not word[1] in ["Punctuation"]:
        words.append(word[0])
    if word[0] == ".":
        words.append(word[0])



-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [None]:
words
s = ""
l = []
for i in words:
   
    if i != ".":
        s += i
    else:
        l.append(s)



In [None]:
l

In [11]:
from keras_preprocessing.text import Tokenizer


t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
vocab_size

29881

In [12]:
sequences = list()

for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

sequences[:11]

[[8946, 3311],
 [8946, 3311, 4907],
 [117, 8947],
 [8948, 8949],
 [8950, 337],
 [8950, 337, 2],
 [8950, 337, 2, 8951],
 [8950, 337, 2, 8951, 148],
 [8950, 337, 2, 8951, 148, 1189],
 [8950, 337, 2, 8951, 148, 1189, 484],
 [8950, 337, 2, 8951, 148, 1189, 484, 8952]]

In [13]:
max_len = max(len(i) for i in sequences)
max_len

383

In [14]:
from keras.preprocessing.sequence import pad_sequences


sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
sequences[:3]

Using TensorFlow backend.


array([[   0,    0,    0, ...,    0, 8946, 3311],
       [   0,    0,    0, ..., 8946, 3311, 4907],
       [   0,    0,    0, ...,    0,  117, 8947]], dtype=int32)

In [None]:
import numpy as np


sequences = np.array(sequences)
x = sequences[:, :-1]
y = sequences[:, -1]

In [None]:
from keras.utils import to_categorical


y = to_categorical(y, num_classes=vocab_size)

In [None]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential


model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len - 1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.fit(x, y, epochs=200, verbose=2)
