In [None]:
import nltk
# Download the 'punkt' tokenizer package; the output below shows it being
# unpacked under tokenizers/ in the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

# Split the sentence into word and punctuation tokens.
tokens = word_tokenize("Hello World!, This is a dog.")

# Keep only purely alphabetic tokens. str.isalpha() rejects punctuation
# tokens and also any token that contains a digit.
words = list(filter(str.isalpha, tokens))
print(words)

['Hello', 'World', 'This', 'is', 'a', 'dog']


### 불용어

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the tokenizer data and the stop-word lists (in that order, so the
# download log matches the original cell).
for package in ('punkt', 'stopwords'):
    nltk.download(package)

# Show the first 20 English stop words.
english_stopwords = stopwords.words('english')
print(english_stopwords[:20])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


### nltk를 이용한 전처리

In [None]:
import nltk
nltk.download('punkt') # (1) make sure the 'punkt' tokenizer data is available

from nltk.tokenize import word_tokenize # (2) word-level tokenizer

text = "This is a dog." # (3) sample sentence; the final "." becomes its own token
print(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['This', 'is', 'a', 'dog', '.']


In [None]:
from nltk.tokenize import sent_tokenize # (1) sentence-level tokenizer

text = "This is a house. This is a dog."
print(sent_tokenize(text)) # (2) splits the text into a list of sentences

['This is a house.', 'This is a dog.']


In [None]:
# Import only the helper this cell uses. A wildcard import would dump every
# public name of the module into the notebook namespace and hide where names
# come from.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# With its default settings the helper lowercases the text and strips
# punctuation, so the result is ['this', 'is', 'a', 'dog'].
print(text_to_word_sequence("This is a dog."))

['this', 'is', 'a', 'dog']


### 원핫인코딩

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# The words we want to convert.
text = ["cat", "dog", "cat", "bird"]

# The vocabulary: every word that may appear, in index order.
total_pets = ["cat", "dog", "turtle", "fish", "bird"]
print("text=", text)

text= ['cat', 'dog', 'cat', 'bird']


In [None]:
# Build the word -> integer-index lookup from the vocabulary,
# e.g. "cat" -> 0, "dog" -> 1, ...
mapping = {word: index for index, word in enumerate(total_pets)}
print(mapping)

# Encode the words into a NEW list instead of overwriting `text` in place.
# Overwriting made the cell non-idempotent: on a second run `text` already
# held integers and mapping[...] raised KeyError.
text_indices = [mapping[word] for word in text]

print("text=", text_indices)

# Convert the integer indices to one-hot rows. num_classes is pinned to the
# vocabulary size so the row width does not depend on the largest index that
# happens to occur in `text`.
one_hot_encode = to_categorical(text_indices, num_classes=len(total_pets))
print("text=", one_hot_encode)

{'cat': 0, 'dog': 1, 'turtle': 2, 'fish': 3, 'bird': 4}
text= [0, 1, 0, 4]
text= [[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]


### 케라스에서 자연어 처리

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build a vocabulary from a single document.
t = Tokenizer()
text = """Deep learning is part of a broader family of machine learning methods
	based on artificial neural networks with representation learning."""

# fit_on_texts expects a list of documents, hence the one-element list.
t.fit_on_texts([text])
# word_index maps each (lowercased) word to an integer starting at 1; in the
# printed result the most frequent words ('learning', 'of') get the lowest indices.
print("단어집합 : ", t.word_index)

단어집합 :  {'learning': 1, 'of': 2, 'deep': 3, 'is': 4, 'part': 5, 'a': 6, 'broader': 7, 'family': 8, 'machine': 9, 'methods': 10, 'based': 11, 'on': 12, 'artificial': 13, 'neural': 14, 'networks': 15, 'with': 16, 'representation': 17}


### 텍스트의 정수 인코딩

In [None]:
# Encode the document as a list of word indices using the fitted tokenizer;
# [0] selects the sequence of the single document that was passed in.
seq = t.texts_to_sequences([text])[0]
print(text,"->", seq)

Deep learning is part of a broader family of machine learning methods
	based on artificial neural networks with representation learning. -> [3, 1, 4, 5, 2, 6, 7, 8, 2, 9, 1, 10, 11, 12, 13, 14, 15, 16, 17, 1]


### 샘플의 패딩


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Force every sequence to length 3. padding='pre' puts the zeros in front of
# short sequences ([7] -> [0 0 7]); as the printed result shows, the long
# sequence keeps its last three items ([1, 2, 3, 4, 5] -> [3 4 5]).
X = pad_sequences([[7, 8, 9], [1, 2, 3, 4, 5], [7]], maxlen=3, padding='pre')
print(X)


[[7 8 9]
 [3 4 5]
 [0 0 7]]


# embedding 레이어 예제

In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

# Input shape:  (batch_size, input_length) = (32, 3)
# Output shape: (None, 3, 4)
model = Sequential()
# 100 possible word indices, each mapped to a 4-dimensional vector.
model.add(Embedding(100, 4, input_length=3))

# Random batch of 32 samples, each holding 3 word indices in [0, 100).
input_array = np.random.randint(100, size=(32, 3))
model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print(output_array.shape)

(32, 3, 4)


# 예제 : 스팸 메일 분류하기

In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus for the spam example: ten short e-mail subject lines. Their
# order matters because the label array is aligned with it by position.
docs = [
    'additional income',
    'best price',
    'big bucks',
    'cash bonus',
    'earn extra cash',
    'spring savings certificate',
    'valero gas marketing',
    'all domestic employees',
    'nominations for oct',
    'confirmation from spinner',
]

In [None]:
# One label per entry of `docs`, in the same order.
# NOTE(review): presumably 1 = spam and 0 = not spam — confirm.
labels = np.array([1,1,1,1,1,0,0,0,0,0])

# Encode every word as an integer below vocab_size. Different words can end
# up with the same index (the printed result contains repeated values such as
# [7, 7, 27]).
# NOTE(review): one_hot reportedly relies on Python's hash(), which is not
# stable across interpreter runs, so the indices may differ per session — confirm.
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

[[19, 18], [8, 5], [41, 27], [14, 32], [5, 25, 14], [18, 42, 34], [7, 7, 27], [19, 21, 41], [44, 10, 18], [49, 10, 43]]


In [None]:
# Bring every document to the same length of 4; padding='post' appends the
# zeros after the word indices.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[19 18  0  0]
 [ 8  5  0  0]
 [41 27  0  0]
 [14 32  0  0]
 [ 5 25 14  0]
 [18 42 34  0]
 [ 7  7 27  0]
 [19 21 41  0]
 [44 10 18  0]
 [49 10 43  0]]


In [None]:
# Embedding -> Flatten -> one sigmoid unit: a minimal binary classifier over
# the padded word-index sequences.
model = Sequential([
    Embedding(vocab_size, 8, input_length=max_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(padded_docs, labels, epochs=50, verbose=0)

# The score is computed on the same ten documents the model was trained on,
# so it measures fit to the training data only.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('정확도=', accuracy)

정확도= 1.0


In [None]:
# Encode and pad the unseen document exactly like the training documents.
# Separate names are used so the training data in `encoded_docs` and
# `padded_docs` is not overwritten; otherwise re-running the training cell
# afterwards would pair one sample with ten labels and fail.
test_doc = ['big income']
test_encoded = [one_hot(d, vocab_size) for d in test_doc]
test_padded = pad_sequences(test_encoded, maxlen=max_length, padding='post')

# Sigmoid output of the trained model for the test document.
print(model.predict(test_padded))

[[0.564517]]


# 예제 : 다음 단어 예측하기

In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Training corpus for the next-word example. Each line carries an explicit
# "\n" escape in addition to the real line break inside the triple-quoted
# string, so the text contains blank lines between the verses.
text_data="""Soft as the voice of an angel\n
Breathing a lesson unhead\n
Hope with a gentle persuasion\n
Whispers her comforting word\n
Wait till the darkness is over\n
Wait till the tempest is done\n
Hope for sunshine tomorrow\n
After the shower
"""

In [None]:
# Fit a tokenizer on the corpus and encode the whole text as one flat list
# of word indices ([0] selects the single document that was passed in).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_data])
encoded = tokenizer.texts_to_sequences([text_data])[0]
print(encoded)

[7, 8, 1, 9, 10, 11, 12, 13, 2, 14, 15, 3, 16, 2, 17, 18, 19, 20, 21, 22, 4, 5, 1, 23, 6, 24, 4, 5, 1, 25, 6, 26, 3, 27, 28, 29, 30, 1, 31]


In [None]:
print(tokenizer.word_index)
# Word indices start at 1 (see the printed dictionary), so one extra slot is
# added to make index 0 valid as well.
vocab_size = len(tokenizer.word_index) + 1
print('어휘 크기: %d' % vocab_size)

{'the': 1, 'a': 2, 'hope': 3, 'wait': 4, 'till': 5, 'is': 6, 'soft': 7, 'as': 8, 'voice': 9, 'of': 10, 'an': 11, 'angel': 12, 'breathing': 13, 'lesson': 14, 'unhead': 15, 'with': 16, 'gentle': 17, 'persuasion': 18, 'whispers': 19, 'her': 20, 'comforting': 21, 'word': 22, 'darkness': 23, 'over': 24, 'tempest': 25, 'done': 26, 'for': 27, 'sunshine': 28, 'tomorrow': 29, 'after': 30, 'shower': 31}
어휘 크기: 32


In [None]:
# Pair every word index with the one that follows it:
# [current_word, next_word] for each adjacent pair in the encoded text.
sequences = [
    [current_word, next_word]
    for current_word, next_word in zip(encoded, encoded[1:])
]
print(sequences)
print('총 시퀀스 개수: %d' % len(sequences))

[[7, 8], [8, 1], [1, 9], [9, 10], [10, 11], [11, 12], [12, 13], [13, 2], [2, 14], [14, 15], [15, 3], [3, 16], [16, 2], [2, 17], [17, 18], [18, 19], [19, 20], [20, 21], [21, 22], [22, 4], [4, 5], [5, 1], [1, 23], [23, 6], [6, 24], [24, 4], [4, 5], [5, 1], [1, 25], [25, 6], [6, 26], [26, 3], [3, 27], [27, 28], [28, 29], [29, 30], [30, 1], [1, 31]]
총 시퀀스 개수: 38


In [None]:
# Column 0 holds the input word, column 1 the word to predict.
sequences = np.array(sequences)
X, y = sequences[:,0],sequences[:,1]
print("X=", X)
print("y=", y)

X= [ 7  8  1  9 10 11 12 13  2 14 15  3 16  2 17 18 19 20 21 22  4  5  1 23
  6 24  4  5  1 25  6 26  3 27 28 29 30  1]
y= [ 8  1  9 10 11 12 13  2 14 15  3 16  2 17 18 19 20 21 22  4  5  1 23  6
 24  4  5  1 25  6 26  3 27 28 29 30  1 31]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM

# One input word index -> 10-dim embedding -> LSTM(50) -> softmax over the
# whole vocabulary (one probability per candidate next word).
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(50),
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# The targets in y are integer word indices rather than one-hot vectors,
# which is why the sparse variant of categorical cross-entropy is used.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
	metrics=['accuracy'])

# verbose=2 prints one line per epoch.
model.fit(X, y, epochs=500, verbose=2)


Epoch 1/500
2/2 - 2s - loss: 3.4664 - accuracy: 0.0000e+00 - 2s/epoch - 963ms/step
Epoch 2/500
2/2 - 0s - loss: 3.4650 - accuracy: 0.0263 - 11ms/epoch - 6ms/step
Epoch 3/500
2/2 - 0s - loss: 3.4640 - accuracy: 0.0789 - 9ms/epoch - 5ms/step
Epoch 4/500
2/2 - 0s - loss: 3.4629 - accuracy: 0.1053 - 14ms/epoch - 7ms/step
Epoch 5/500
2/2 - 0s - loss: 3.4619 - accuracy: 0.1053 - 12ms/epoch - 6ms/step
Epoch 6/500
2/2 - 0s - loss: 3.4610 - accuracy: 0.1053 - 8ms/epoch - 4ms/step
Epoch 7/500
2/2 - 0s - loss: 3.4599 - accuracy: 0.1053 - 13ms/epoch - 6ms/step
Epoch 8/500
2/2 - 0s - loss: 3.4591 - accuracy: 0.1053 - 9ms/epoch - 5ms/step
Epoch 9/500
2/2 - 0s - loss: 3.4579 - accuracy: 0.1053 - 8ms/epoch - 4ms/step
Epoch 10/500
2/2 - 0s - loss: 3.4570 - accuracy: 0.1053 - 10ms/epoch - 5ms/step
Epoch 11/500
2/2 - 0s - loss: 3.4560 - accuracy: 0.1053 - 9ms/epoch - 4ms/step
Epoch 12/500
2/2 - 0s - loss: 3.4550 - accuracy: 0.1053 - 8ms/epoch - 4ms/step
Epoch 13/500
2/2 - 0s - loss: 3.4539 - accuracy: 0.

<keras.callbacks.History at 0x7f911eac2390>

In [None]:
# Integer-encode the test word. A dedicated name is used so the training
# sequence stored in `encoded` is not overwritten (the sequence-building cell
# could not be re-run otherwise).
test_text = 'Wait'
test_encoded = tokenizer.texts_to_sequences([test_text])[0]

if not test_encoded:
    # The tokenizer returns an empty list for a word it has never seen;
    # predicting on it and taking argmax of the empty result would fail.
    print(test_text, "=> (not in vocabulary)")
else:
    # The model takes one word index per sample, i.e. shape (batch, 1).
    # Only the first word of test_text is used as context.
    test_input = np.array(test_encoded[:1]).reshape(1, 1)

    # Print the raw network output (one probability per vocabulary index).
    onehot_output = model.predict(test_input)
    print('onehot_output=', onehot_output)

    # Find the output unit with the highest probability.
    output = int(np.argmax(onehot_output))
    print('output=', output)

    # Translate the unit number back into a word. index_word is the inverse
    # of word_index, so no scan over the dictionary is needed. Index 0 has no
    # word attached and prints as an empty string.
    print(test_text, "=>", tokenizer.index_word.get(output, ""))


onehot_output= [[7.3503124e-06 2.3195762e-04 3.8920805e-07 1.1207381e-02 9.4178978e-07
  9.7213507e-01 7.6105930e-03 7.8108205e-06 3.8679369e-05 3.0608478e-06
  1.5847881e-04 1.8294073e-04 5.8645605e-07 9.3817856e-04 1.3595118e-04
  2.2928625e-06 4.5383251e-05 6.8445741e-05 2.4239872e-04 5.7839119e-04
  7.0824218e-04 8.6031076e-05 4.9441149e-03 4.5253355e-06 1.2723260e-05
  4.3163063e-06 2.1998145e-05 6.1334940e-05 5.5154564e-04 5.4842531e-06
  7.3623897e-07 2.6096461e-06]]
output= 5
Wait => till


# 영화 리뷰 감성 판별하기

In [2]:
import numpy as np

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt

In [3]:
# Load the IMDB review dataset that ships with Keras, restricted to a
# vocabulary of 10000 word indices.
imdb = keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Each review is stored as a list of integer word indices.
print(x_train[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 

In [4]:
# Word -> integer-index dictionary. Every index is shifted by 3 because the
# first few indices are reserved for special tokens.
word_to_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}

# Reserved entries: padding symbol, start-of-review marker, unknown token,
# and one unused slot.
word_to_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,
    "<UNUSED>": 3,
})

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [5]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
# Explicit imports instead of `import *`: these are the layers used by the
# model cell that follows, and the namespace stays free of unrelated names.
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout

# Bring every review to exactly 100 word indices.
x_train = pad_sequences(x_train, maxlen=100)
x_test = pad_sequences(x_test, maxlen=100)

# Must match num_words used when the dataset was loaded.
vocab_size = 10000

In [6]:
# Embedding (64-dim) -> Flatten -> Dense(64, relu) -> Dropout(0.5) ->
# single sigmoid unit for binary sentiment classification.
model = Sequential([
    Embedding(vocab_size, 64, input_length=100),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 flatten (Flatten)           (None, 6400)              0         
                                                                 
 dense (Dense)               (None, 64)                409664    
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,049,729
Trainable params: 1,049,729
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, and the same
# split is used for the final evaluation, so there is no separate held-out
# set for model selection.
history = model.fit(x_train, y_train,
          batch_size=64, epochs=20, verbose=1,
          validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# evaluate returns [loss, accuracy], matching the loss and the metrics list
# given to compile.
results = model.evaluate(x_test, y_test, verbose=2)
print(results)

782/782 - 1s - loss: 0.9034 - accuracy: 0.8196 - 1s/epoch - 2ms/step
[0.9033842086791992, 0.819599986076355]


In [9]:
# Free-text review to classify with the trained sentiment model.
review = "What can I say about this movie that was already said? It is my favorite time travel sci-fi, adventure epic comedy in the 80's and I love this movie to death! When I saw this movie I was thrown out by its theme. An excellent sci-fi, adventure epic, I LOVE the 80s. It's simple the greatest time travel movie ever happened in the history of world cinema. I love this movie to death, I love, LOVE, love it!"

In [10]:
import re

# Remove everything except letters, digits and spaces, then lowercase so the
# words match the lowercase keys of word_to_index.
review = re.sub("[^0-9a-zA-Z ]", "", review).lower()

# Index of the unknown-word token. The key is "<UNK>"; looking up "UNK"
# raised KeyError as soon as a word outside the vocabulary limit appeared.
unk_index = word_to_index["<UNK>"]

# Encode every word of the review. Words missing from the dictionary and
# words whose index is not below vocab_size are replaced by the unknown
# token: valid embedding indices are 0 .. vocab_size - 1, so the index
# vocab_size itself must be rejected too.
review_encoding = []
for w in review.split():
    index = word_to_index.get(w, unk_index)
    if index < vocab_size:
        review_encoding.append(index)
    else:
        review_encoding.append(unk_index)

# pad_sequences expects a list of sequences, hence the outer list.
test_input = pad_sequences([review_encoding], maxlen = 100)
value = model.predict(test_input) # prediction, shape (1, 1)

# Compare the scalar probability rather than the whole array.
if value[0, 0] > 0.5:
    print("긍정적인 리뷰입니다.")
else:
    print("부정적인 리뷰입니다.")

긍정적인 리뷰입니다.
