In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN, Embedding
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [50]:
# Toy corpus: three short Korean sentences used to demonstrate tokenizing and padding.
# NOTE: this name shadows the `text` module imported from keras.preprocessing in the first cell.
text = [
    "나는 밥을 먹었다",
    "나는 학교에 갔다",
    "오늘 학교에 선생님이 오셨다",
]
text

['나는 밥을 먹었다', '나는 학교에 갔다', '오늘 학교에 선생님이 오셨다']

In [51]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Tokenizer was never imported anywhere in the notebook (only the `text` module was, and
# that name is overwritten by the corpus list), so a fresh kernel raised NameError here.
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the corpus, then replace every word with its integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

print('정수인코딩: ')
print(sequences)

정수인코딩: 
[[1, 3, 4], [1, 2, 5], [6, 2, 7, 8]]


In [None]:
# (Stray incomplete statement removed: it raised NameError on Restart & Run All.)

In [52]:
# word_index maps each word to an integer. Words are sorted by frequency so that only the
# most frequent ones can be kept: the most frequent word gets 1, then 2, 3, 4, ...
word_index = tokenizer.word_index

# Here '나는' is the most frequent word, so it is assigned index 1.
# The last value printed is the vocabulary size.
print(
    "단어에 맵핑된 정수값: ",
    word_index,
    len(word_index),
    sep="\n",
)

단어에 맵핑된 정수값: 
{'나는': 1, '학교에': 2, '밥을': 3, '먹었다': 4, '갔다': 5, '오늘': 6, '선생님이': 7, '오셨다': 8}
8


In [53]:
# Zero-pad the sequences so every sentence has the same length (4).
# With the default settings the zeros are added at the front.
data = pad_sequences(
    sequences,
    maxlen=4,
)
data

array([[0, 1, 3, 4],
       [0, 1, 2, 5],
       [6, 2, 7, 8]])

In [20]:
# Same padding to length 4, but padding='post' appends the zeros at the end instead.
data = pad_sequences(
    sequences,
    maxlen=4,
    padding='post',
)
data

array([[1, 3, 4, 0],
       [1, 2, 5, 0],
       [6, 2, 7, 8]])

In [22]:
# Confirm the padded array's shape: (number of sentences, maxlen).
print(data.shape)

(3, 4)


In [24]:
# Minimal model with a single Embedding layer, used only to inspect the vectors it outputs.
# Embedding(input_dim, output_dim, input_length):
#   input_dim    - number of distinct indices. word_index covers 1..8 and padding adds 0,
#                  so len(word_index) + 1 = 9 indices (0..8) are needed.
#   output_dim   - size of each word vector (100 dimensions here).
#   input_length - tokens per padded sentence; read from `data` so it always matches the
#                  maxlen passed to pad_sequences (4 here) instead of repeating the literal.
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    100,
                    input_length=data.shape[1]))

# Positional form of model.compile(optimizer='rmsprop', loss='mse').
# RMSprop is a technique that improves on AdaGrad.
model.compile('rmsprop', 'mse')

print('input array: ', data.shape)

# fit() is never called, so predict() only looks up the layer's initial (untrained) weights.
output_array = model.predict(data)
print("output array: ", output_array.shape)

input array:  (3, 4)
output array:  (3, 4, 100)


In [26]:
# The post-padded first sentence is [1, 3, 4, 0] and index 1 is '나는',
# so output_array[0][0] is the 100-dimensional vector for '나는'.
print('첫번째 문장 "나는" Embedding: \n', output_array[0][0])

첫번째 문장 "나는" Embedding: 
 [-0.01384702  0.0371436   0.01095534  0.03367838  0.00748586 -0.04306557
  0.04461204  0.04957396  0.02695719 -0.04672531 -0.04706959 -0.0392246
  0.04509034  0.02460587 -0.02717716 -0.02454519 -0.02186393  0.0035964
 -0.0099218  -0.03039159  0.01345428  0.00153429 -0.04081904 -0.01948032
  0.01339186  0.03778786  0.01547382  0.01958058  0.03102026 -0.03372184
 -0.02685304 -0.00408576 -0.04285035 -0.01521815 -0.02358946  0.00223653
 -0.04906732 -0.03053992 -0.01203509  0.0148083  -0.00837088 -0.03746637
  0.0059826  -0.02687998  0.03710565 -0.04211212  0.04715941  0.03099537
  0.04364506 -0.01585461  0.01547397  0.03595937  0.00209177  0.02491119
  0.01291678  0.04078807 -0.02207683 -0.00617392 -0.04845591  0.04920919
  0.03366101  0.0016185  -0.0356267  -0.00545005  0.04531418 -0.03804404
 -0.00985862  0.00300404 -0.04989513  0.0411781   0.01996001  0.04092241
  0.026906   -0.04743264  0.02085704  0.02539755 -0.00254804 -0.04436892
  0.03695695  0.02077902 -0.

In [28]:
# output_array[0][1] is the second token of the first sentence. That sentence is encoded as
# [1, 3, 4, 0], and index 3 is '밥을'. The previous label said '학교에' (index 2), a word that
# does not appear in the first sentence at all.
print('첫번째 문장 "밥을" Embedding: \n', output_array[0][1])

첫번째 문장 "학교에" Embedding: 
 [-0.04375289 -0.03394834 -0.04010032  0.04852011 -0.03512425 -0.04599524
 -0.02096882 -0.01632801  0.00378805  0.03784357  0.00971507  0.0399324
 -0.0045149   0.04046425  0.03808017 -0.02071444  0.01025813 -0.00335947
 -0.0488055   0.01820456  0.00197654 -0.00125508 -0.04811257  0.00768564
 -0.01205084 -0.00105112 -0.040073    0.01776217 -0.02583157  0.0035933
 -0.03088847 -0.03521693 -0.03329425 -0.01379777 -0.02552875  0.00383938
 -0.00686176 -0.04133471  0.01261869  0.03642048  0.01452739 -0.04712811
  0.02411289  0.0133352  -0.01863471 -0.03075097  0.02236264 -0.02244967
  0.0080909  -0.02654378  0.01276291  0.01112139  0.01231181 -0.00811839
  0.01457658 -0.03117751 -0.0009721   0.0286448  -0.04138844 -0.03082758
 -0.01873792 -0.03747505 -0.03960998  0.0116488  -0.01671425 -0.04663948
 -0.02577258  0.04251221 -0.00783583  0.02969866 -0.01357145  0.0283001
 -0.00104018 -0.01575626  0.01557704  0.03887001 -0.04591342  0.02990567
  0.04687429  0.03748536  0.

In [None]:
# As the results above show, each word is output as a 100-dimensional vector, i.e. 100 values.

In [27]:
# Display the whole embedding output: one 4 x 100 block per sentence.
output_array

array([[[-0.01384702,  0.0371436 ,  0.01095534, ...,  0.01435645,
          0.0100778 ,  0.0408392 ],
        [-0.04375289, -0.03394834, -0.04010032, ..., -0.00938404,
         -0.00490866,  0.04493202],
        [ 0.00920025,  0.00876013,  0.03634125, ..., -0.03620528,
          0.0499987 , -0.00865694],
        [-0.02692565,  0.04901146,  0.0341002 , ...,  0.04093217,
         -0.03753398,  0.04545455]],

       [[-0.01384702,  0.0371436 ,  0.01095534, ...,  0.01435645,
          0.0100778 ,  0.0408392 ],
        [ 0.02173131,  0.01843714,  0.04129163, ...,  0.03983423,
          0.00089423, -0.02362682],
        [ 0.02711215,  0.01585234, -0.01850986, ...,  0.04430768,
         -0.00190226, -0.02207126],
        [-0.02692565,  0.04901146,  0.0341002 , ...,  0.04093217,
         -0.03753398,  0.04545455]],

       [[ 0.00345943, -0.03788632, -0.04872021, ...,  0.0080614 ,
          0.0211109 , -0.00350795],
        [ 0.02173131,  0.01843714,  0.04129163, ...,  0.03983423,
          0.

In [30]:
# Layer overview. The Embedding layer holds (len(word_index) + 1) * 100 = 9 * 100 = 900 weights.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            900       
                                                                 
Total params: 900
Trainable params: 900
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Second example: a corpus consisting of a single English sentence.
text = [
    "I was wondering if anyone out there could enlighten me on this car.",
]

In [32]:
# Start from a fresh tokenizer so the vocabulary only contains the English sentence,
# then integer-encode it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

print('정수인코딩: ', sequences, sep='\n')

정수인코딩: 
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]


In [33]:
# word_index는 index 부여방법은 빈도수가 높은 단어들만 사용하기 위해서 단어에 대한 빈도수를 기준으로 정렬한 뒤 높은 빈도 단어에 1, 그뒤로 2,3,4 이렇게 부여한다.
word_index = tokenizer.word_index

print("단어에 맵핑된 정수값: ")
print(word_index)
# 여기에는 '나는'이라는 단어가 제일 빈도가 높으므로 1을 부여하였다.
print(len(word_index))

단어에 맵핑된 정수값: 
{'i': 1, 'was': 2, 'wondering': 3, 'if': 4, 'anyone': 5, 'out': 6, 'there': 7, 'could': 8, 'enlighten': 9, 'me': 10, 'on': 11, 'this': 12, 'car': 13}
13


In [35]:
# padding='post' would append zeros at the end, but the only sentence already has
# 13 tokens, so with maxlen=13 nothing is added.
data = pad_sequences(
    sequences,
    maxlen=13,
    padding='post',
)
data

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]])

In [36]:
# One sentence of 13 tokens -> shape (1, 13).
print(data.shape)

(1, 13)


In [37]:
# Third example: six English news sentences of different lengths (16 to 35 words).
text = [
    "The Ukrainian military says Russian units are being reinforced on the approaches to Sloviansk as they prepare to resume an offensive toward the eastern city.",
    "In nearby Severodonetsk, the Ukrainian military said battles continue and Russian forces had partial success in storming eastern residential areas.",
    "In a video message Friday, Ukraine's President Volodymyr Zelensky said Ukrainians have been defending their country for 100 days and victory shall be ours.",
    "Facing the prospect of an extended stalemate in Ukraine, the US and its allies are placing a renewed emphasis on the need for a negotiated settlement to end the war.",
    "Ukrainian troops dove under their tank, screaming at Washington Post reporters to take cover with them.",
    "Together, they pressed their bodies against the damp earth and grass, as Russian firepower rained down along this eastern front, where Moscow is concentrating its military might and inflicting massive casualties on outgunned Ukrainian forces.",
]



['The Ukrainian military says Russian units are being reinforced on the approaches to Sloviansk as they prepare to resume an offensive toward the eastern city.',
 'In nearby Severodonetsk, the Ukrainian military said battles continue and Russian forces had partial success in storming eastern residential areas.',
 "In a video message Friday, Ukraine's President Volodymyr Zelensky said Ukrainians have been defending their country for 100 days and victory shall be ours.",
 'Facing the prospect of an extended stalemate in Ukraine, the US and its allies are placing a renewed emphasis on the need for a negotiated settlement to end the war.',
 'Ukrainian troops dove under their tank, screaming at Washington Post reporters to take cover with them.',
 'Together, they pressed their bodies against the damp earth and grass, as Russian firepower rained down along this eastern front, where Moscow is concentrating its military might and inflicting massive casualties on outgunned Ukrainian forces.']

In [48]:
# Re-fit a new tokenizer on the news corpus and turn each sentence into a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

print('정수인코딩: ', sequences, sep='\n')

정수인코딩: 
[[1, 3, 6, 20, 7, 21, 12, 22, 23, 8, 1, 24, 4, 25, 13, 14, 26, 4, 27, 15, 28, 29, 1, 9, 30], [5, 31, 32, 1, 3, 6, 16, 33, 34, 2, 7, 17, 35, 36, 37, 5, 38, 9, 39, 40], [5, 10, 41, 42, 43, 44, 45, 46, 47, 16, 48, 49, 50, 51, 11, 52, 18, 53, 54, 2, 55, 56, 57, 58], [59, 1, 60, 61, 15, 62, 63, 5, 64, 1, 65, 2, 19, 66, 12, 67, 10, 68, 69, 8, 1, 70, 18, 10, 71, 72, 4, 73, 1, 74], [3, 75, 76, 77, 11, 78, 79, 80, 81, 82, 83, 4, 84, 85, 86, 87], [88, 14, 89, 11, 90, 91, 1, 92, 93, 2, 94, 13, 7, 95, 96, 97, 98, 99, 9, 100, 101, 102, 103, 104, 19, 6, 105, 2, 106, 107, 108, 8, 109, 3, 17]]
35


In [49]:
# Length of the longest encoded sentence (the value needed for maxlen when padding).
# max(sequences) compares the lists element by element, so it returns the sequence that
# starts with the largest index, not the longest one; the old result was right only by
# coincidence. Compare the lengths explicitly instead (0 for an empty corpus).
print(max((len(seq) for seq in sequences), default=0))

35


In [45]:
# Word -> integer index mapping learned from the news corpus.
word_index = tokenizer.word_index

print("단어에 맵핑된 정수값: ")
print(word_index)

# Vocabulary size (number of distinct words).
print(len(word_index))
# Length of the longest word in the vocabulary. The previous len(max(word_index)) took the
# alphabetically last key and printed its length, which carries no useful information.
# NOTE(review): presumably the longest word was intended; if the largest index was meant,
# use max(word_index.values()) instead.
print(max((len(word) for word in word_index), default=0))

단어에 맵핑된 정수값: 
{'the': 1, 'and': 2, 'ukrainian': 3, 'to': 4, 'in': 5, 'military': 6, 'russian': 7, 'on': 8, 'eastern': 9, 'a': 10, 'their': 11, 'are': 12, 'as': 13, 'they': 14, 'an': 15, 'said': 16, 'forces': 17, 'for': 18, 'its': 19, 'says': 20, 'units': 21, 'being': 22, 'reinforced': 23, 'approaches': 24, 'sloviansk': 25, 'prepare': 26, 'resume': 27, 'offensive': 28, 'toward': 29, 'city': 30, 'nearby': 31, 'severodonetsk': 32, 'battles': 33, 'continue': 34, 'had': 35, 'partial': 36, 'success': 37, 'storming': 38, 'residential': 39, 'areas': 40, 'video': 41, 'message': 42, 'friday': 43, "ukraine's": 44, 'president': 45, 'volodymyr': 46, 'zelensky': 47, 'ukrainians': 48, 'have': 49, 'been': 50, 'defending': 51, 'country': 52, '100': 53, 'days': 54, 'victory': 55, 'shall': 56, 'be': 57, 'ours': 58, 'facing': 59, 'prospect': 60, 'of': 61, 'extended': 62, 'stalemate': 63, 'ukraine': 64, 'us': 65, 'allies': 66, 'placing': 67, 'renewed': 68, 'emphasis': 69, 'need': 70, 'negotiated': 71, 'set

In [None]:
# Pad every sentence to the length of the longest one. maxlen=13 was left over from the
# one-sentence example; all six sentences here have 16-35 tokens, so it truncated each of
# them. With maxlen omitted, pad_sequences uses the longest sequence.
data = pad_sequences(sequences, padding='post')
data