# 신경망 언어모형 실습: Tokenizer

In [1]:
# Mount my Google Drive at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd

# Read the review dataset straight from its URL and report (rows, columns).
IMDB_URL = 'https://github.com/euphoris/datasets/raw/master/imdb.zip'
df = pd.read_csv(IMDB_URL)
print(df.shape)

(1000, 2)


In [3]:
# Show the first rows of the DataFrame.
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Only the 2000 most frequent words are converted to ids; everything else is
# treated as the '<unk>' token.
tokenizer = Tokenizer(num_words=2000, oov_token='<unk>')

In [5]:
# Assign an id to each word found in the reviews.
tokenizer.fit_on_texts(df['review'])

In [6]:
# Which id was assigned to the word 'good'?
tokenizer.word_index['good']

31

In [7]:
# Which word has id 31?
tokenizer.index_word[31]

'good'

In [8]:
# Save the fitted tokenizer so later sections can reload it.
import os

import joblib

FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing into it.
os.makedirs(FolderPath, exist_ok=True)

joblib.dump(
    tokenizer,
    FolderPath + '/' + 'tokenizer.pkl'
)

['/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data/tokenizer.pkl']

# 신경망 언어모형 실습: 전처리

In [9]:
# Reload the tokenizer that was written under FolderPath.
tokenizer = joblib.load(FolderPath + '/' + 'tokenizer.pkl')

In [11]:
# Arrange the data for the language model: each run of 4 consecutive token
# ids is an input, and the id that follows it is the target.

seq_arr = tokenizer.texts_to_sequences(df['review'])

data_list = []

for tokens in seq_arr:
    # Sequences with 4 or fewer tokens yield an empty range and add nothing.
    data_list.extend(
        (tokens[start : start + 4], tokens[start + 4])
        for start in range(len(tokens) - 4)
    )

In [12]:
# Shuffle the (input, target) pairs in place.
import random

random.shuffle(data_list)
# Peek at the first pair after shuffling.
data_list[0]

([4, 1, 195, 5], 31)

In [13]:
# Split the (input, target) pairs into separate x and y arrays.

import numpy as np

data_input = np.array([context for context, _ in data_list])
data_target = np.array([next_id for _, next_id in data_list])

# Both arrays must report the same number of samples.
for split in (data_input, data_target):
    print(len(split))

10566
10566


# 신경망 언어모형 실습: 학습

In [14]:
# Language model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Word ids start at 1, so once id 0 is counted as well the total is the
# number of words plus 1.
word_cnt = tokenizer.num_words + 1

embedding = Embedding(input_dim=word_cnt, output_dim=6)

# Embed the context ids, average them, then produce one score per word id.
# The last Dense layer has no activation, so its outputs are raw scores.
model = Sequential([
    embedding,
    GlobalAveragePooling1D(),
    Dense(8, activation='relu'),
    Dense(word_cnt),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 6)           12006     
_________________________________________________________________
global_average_pooling1d (Gl (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 56        
_________________________________________________________________
dense_1 (Dense)              (None, 2001)              18009     
Total params: 30,071
Trainable params: 30,071
Non-trainable params: 0
_________________________________________________________________


In [15]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True: the loss is given the model's raw scores rather than
# probabilities.
lm_loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=lm_loss, optimizer='adam', metrics=['acc'])

# Train on the (context, next id) pairs for 10 epochs.
model.fit(data_input, data_target, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd1c6419ad0>

In [16]:
# Save the model.
FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'

model.save(FolderPath + '/' + 'lm.krs')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data/lm.krs/assets


In [17]:
# Word embeddings: read the embedding layer's matrix out as a NumPy array.

Embedding_arr = embedding.embeddings.numpy()
print(Embedding_arr.shape)

(2001, 6)


In [18]:
# The word embeddings are identical to the Embedding layer's weights.

import numpy as np

EmbeddingWeight_arr = embedding.get_weights()[0]
print(np.array_equal(Embedding_arr, EmbeddingWeight_arr))

True


In [19]:
# Save the embeddings to an .npz file under the key 'embedding'.
np.savez(FolderPath + '/' + 'word-emb.npz', embedding=Embedding_arr)

# GlobalAveragePooling1D

In [20]:
import numpy as np

# GlobalAveragePooling1D: takes the mean along index (axis) 1.

# Example: a single batch entry holding two rows of three values each.
first_row = [1, 2, 3]
second_row = [3, 6, 9]
x = np.array([[first_row, second_row]], dtype='float32')

print(x.shape)

(1, 2, 3)


In [21]:
# Passing this array through a GlobalAveragePooling1D layer gives the following.

from tensorflow.keras.layers import GlobalAveragePooling1D

pooling = GlobalAveragePooling1D()

y = pooling(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

In [22]:
# Shape of the pooled result.
y.shape

(1, 3)

# 신경망 언어모형 실습: 다음 토큰의 확률 예측

In [23]:
from tensorflow.keras.models import load_model

# Reload the saved language model and print its layer summary.
model = load_model(FolderPath + '/' + 'lm.krs')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 6)           12006     
_________________________________________________________________
global_average_pooling1d (Gl (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 56        
_________________________________________________________________
dense_1 (Dense)              (None, 2001)              18009     
Total params: 30,071
Trainable params: 30,071
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Predict the probability of the next word.

# Keep x as a one-row slice so it still has a batch dimension; y is the id of
# the word that actually followed.
x, y = data_input[0:1], data_target[0]

# Check the 4 words in x.
print([tokenizer.index_word[word_id] for word_id in x[0]])

['a', '<unk>', 'feeling', 'of']


In [25]:
# Feed the input into the model.

import numpy as np

# NOTE(review): the ids are cast to float32 before predict — presumably needed
# by the reloaded model's input signature; confirm before removing the cast.
logit = model.predict(x.astype('float32'))
print(logit)

[[-7.998441   3.9851387  3.5088015 ... -8.026324  -8.116435  -8.04873  ]]


In [26]:
# Convert the logits into probabilities with the softmax function.
from tensorflow.nn import softmax

logit_softmax = softmax(logit).numpy()
print(logit_softmax)

[[5.7843022e-07 9.2609063e-02 5.7515226e-02 ... 5.6252429e-07
  5.1405158e-07 5.5006080e-07]]


In [27]:
# Check which word actually came next.

tokenizer.index_word[y]

'good'

In [28]:
# Look at the probability assigned to that word.

print(logit_softmax[0, y])

0.004200144


In [29]:
# Find the word with the highest probability.

# logit_softmax is 2-D (one row per input). A bare argmax() works on the
# flattened array, which is only a valid word id when there is exactly one
# row; taking the argmax of row 0 always gives a word id.
x_pred = logit_softmax[0].argmax()
x_pred

1

In [30]:
# Probability of the most likely word.
logit_softmax[0, x_pred]

0.09260906

In [31]:
# The word that the predicted id maps to.
tokenizer.index_word[x_pred]

'<unk>'

# 신경망 언어모형 실습: Transfer Learning

In [32]:
import pandas as pd

# Read the review dataset again for the transfer-learning section.
IMDB_URL = 'https://github.com/euphoris/datasets/raw/master/imdb.zip'
df = pd.read_csv(IMDB_URL)
print(df.shape)

(1000, 2)


In [33]:
# Load the existing tokenizer.

import joblib

tokenizer = joblib.load(FolderPath + '/' + 'tokenizer.pkl')

In [35]:
# Convert each text into a sequence of token ids.

seq_arr = tokenizer.texts_to_sequences(df['review'])
print(len(seq_arr))

1000


In [36]:
# The sequences all have different lengths, so pad the front with 0s to make
# the lengths equal.

from tensorflow.keras.preprocessing.sequence import pad_sequences

seq_arr_padding = pad_sequences(seq_arr)
print(len(seq_arr_padding))

1000


In [38]:
# Load the word embeddings.

import numpy as np

# np.load on an .npz returns an archive object that keeps the file open; the
# with-block closes it once the array has been read out.
with np.load(FolderPath + '/' + 'word-emb.npz') as z:
    Embedding_arr = z['embedding']

In [39]:
# Sentiment analysis

# Build the Embedding layer for the sentiment model, initialised with the
# weights learned by the language model.

from tensorflow.keras.initializers import Constant
# Imported here, like this section's other imports, so the cell does not
# depend on the import made in the language-model section.
from tensorflow.keras.layers import Embedding

embedding = Embedding(
    input_dim=tokenizer.num_words + 1,
    # Take the width from the loaded matrix so it always matches the
    # initializer instead of repeating the literal used during training.
    output_dim=Embedding_arr.shape[1],
    embeddings_initializer=Constant(Embedding_arr)
)

In [40]:
# Build the sentiment analysis model.

# Imported here, like this section's other imports, so the cell does not
# depend on the imports made in the language-model section.
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential

model = Sequential()

model.add(embedding)
model.add(GlobalAveragePooling1D())
model.add(Dense(8, activation='relu'))
# Single sigmoid unit: one value between 0 and 1 per review.
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 6)           12006     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 6)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 56        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 12,071
Trainable params: 12,071
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Train the sentiment model on the padded sequences, using the 'sentiment'
# column as the target.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

sentiment_labels = df['sentiment'].values
model.fit(seq_arr_padding, sentiment_labels)



<keras.callbacks.History at 0x7fd2140fb550>