## 데이터 받기

In [1]:
import requests

res = requests.get('https://github.com/euphoris/datasets/raw/master/imdb.zip')

with open('imdb.zip','wb') as f:
    f.write(res.content)

In [1]:
import pandas as pd
df = pd.read_csv('imdb.zip')

In [2]:
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


## 토큰화

In [3]:
import tensorflow as tf

tk = tf.keras.preprocessing.text.Tokenizer(num_words=2000, oov_token='<unk>')

In [4]:
tk.fit_on_texts(df['review'])

In [6]:
tk.word_index

{'<unk>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'this': 7,
 'i': 8,
 'it': 9,
 'to': 10,
 'in': 11,
 'was': 12,
 'movie': 13,
 'film': 14,
 'that': 15,
 'for': 16,
 'as': 17,
 'but': 18,
 'with': 19,
 'one': 20,
 'on': 21,
 'you': 22,
 'are': 23,
 'not': 24,
 'bad': 25,
 "it's": 26,
 'very': 27,
 'all': 28,
 'just': 29,
 'so': 30,
 'good': 31,
 'at': 32,
 'an': 33,
 'be': 34,
 'there': 35,
 'about': 36,
 'have': 37,
 'by': 38,
 'like': 39,
 'from': 40,
 'if': 41,
 'acting': 42,
 'time': 43,
 'out': 44,
 'his': 45,
 'or': 46,
 'really': 47,
 'great': 48,
 'even': 49,
 'he': 50,
 'who': 51,
 'were': 52,
 'has': 53,
 'see': 54,
 'my': 55,
 'characters': 56,
 'well': 57,
 'most': 58,
 'how': 59,
 'more': 60,
 'no': 61,
 'only': 62,
 'when': 63,
 'ever': 64,
 '10': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 'they': 71,
 'best': 72,
 'because': 73,
 'your': 74,
 'can': 75,
 'also': 76,
 "don't": 77,
 'films': 78,
 'than': 79,
 'its': 80,
 'scrip

In [7]:
tk.word_index['good']

31

In [8]:
tk.index_word[31]

'good'

In [17]:
import joblib

joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

## 전처리

In [10]:
import joblib

tk = joblib.load('tokenizer.pkl')

In [11]:
seqs = tk.texts_to_sequences(df['review'])

In [12]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [13]:
tk.index_word[1]

'<unk>'

In [14]:
tk.index_word[4], tk.index_word[27], tk.index_word[287]

('a', 'very', 'slow')

In [15]:
seq = seqs[0]

In [16]:
len(seq)

14

In [17]:
list(range(0, len(seq) - 4))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [18]:
data = []
for seq in seqs:
    # print("**", seq, "**")
    for i in range(0, len(seq) - 4):
        pass
        # print(seq[i:i+4], seq[i+4])
    # print("="*40)

In [19]:
data = []
for seq in seqs:
    for i in range(0, len(seq) - 4):
        data.append((seq[i:i+4], seq[i+4]))
        
data[0]

([4, 27, 27, 27], 287)

In [20]:
import random

random.shuffle(data)
data[0]

([47, 1042, 24, 11], 4)

In [21]:
import numpy as np

xs = np.array([x for x, y in data])
ys = np.array([y for x, y in data])

In [22]:
xs

array([[  47, 1042,   24,   11],
       [ 648,   85,    9,   77],
       [  55,  951,  397,  609],
       ...,
       [ 172,    1,   37,    4],
       [ 279, 1199,   71,  101],
       [ 644,    5,    7,  956]])

In [23]:
ys

array([  4, 130, 273, ..., 266, 239, 460])

In [26]:
joblib.dump((xs, ys), 'lm-data.pkl')

['lm-data.pkl']

## 학습

In [24]:
import joblib

tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('lm-data.pkl')

In [25]:
NUM_WORD = tk.num_words + 1
# +1 하는 이유 : 패딩

In [26]:
tk.index_word

{1: '<unk>',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'is',
 7: 'this',
 8: 'i',
 9: 'it',
 10: 'to',
 11: 'in',
 12: 'was',
 13: 'movie',
 14: 'film',
 15: 'that',
 16: 'for',
 17: 'as',
 18: 'but',
 19: 'with',
 20: 'one',
 21: 'on',
 22: 'you',
 23: 'are',
 24: 'not',
 25: 'bad',
 26: "it's",
 27: 'very',
 28: 'all',
 29: 'just',
 30: 'so',
 31: 'good',
 32: 'at',
 33: 'an',
 34: 'be',
 35: 'there',
 36: 'about',
 37: 'have',
 38: 'by',
 39: 'like',
 40: 'from',
 41: 'if',
 42: 'acting',
 43: 'time',
 44: 'out',
 45: 'his',
 46: 'or',
 47: 'really',
 48: 'great',
 49: 'even',
 50: 'he',
 51: 'who',
 52: 'were',
 53: 'has',
 54: 'see',
 55: 'my',
 56: 'characters',
 57: 'well',
 58: 'most',
 59: 'how',
 60: 'more',
 61: 'no',
 62: 'only',
 63: 'when',
 64: 'ever',
 65: '10',
 66: 'movies',
 67: 'plot',
 68: 'story',
 69: 'made',
 70: 'some',
 71: 'they',
 72: 'best',
 73: 'because',
 74: 'your',
 75: 'can',
 76: 'also',
 77: "don't",
 78: 'films',
 79: 'than',
 80: 'its',
 81: 's

In [28]:
xs[0]

array([104, 163,   1,  38])

In [32]:
NUM_WORD

2001

In [29]:
# output_dim을 너무 크게 하면 성능이 좋아지지만 과적합이 일어날 수 있음  여러 시도를 통해 조절
emb1 = tf.keras.layers.Embedding(
    input_dim = NUM_WORD,
    output_dim=8
)

In [30]:
lm = tf.keras.Sequential([
    emb1, # 4단어 * 8 = 32벡터
    tf.keras.layers.GlobalAveragePooling1D(), 
    # emb1층에서 출력한 32벡터를 각 단어 벡터 위치별로 평균을 냄 : 출력은 8벡터
    tf.keras.layers.Dense(8, activation='relu'),
    # 입력 4단어에 대한 출력값은 2001(NUM_WORD)개의 벡터값으로 나옴
    tf.keras.layers.Dense(NUM_WORD)
    # tf.keras.layers.Dense(NUM_WORD-3, activation='softmax')
])

In [31]:
# embedding (Embedding)    (None, None, 8)
# 첫번째 None : 데이터의 건수 , 두번째 None : 단어의 개수(4개)
lm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 2001)              18009     
                                                                 
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [33]:
# softmax를 적용하는 두가지 방법
# 방법1 : Dense에서 적용 (tf.keras.layers.Dense(NUM_WORD, activation='softmax'))
# 방법2 : 손실함수에서 적용(compile설정시 from_logits=True값 설정)
# 텐서플로우에는 함수에서 소프트맥스를 자체 처리하는 경우들이 많이 있음
# 모델에서 처리시 나중에 추가적인 작업이 필요할 수 있음. 그래서 손실함수 처리시에 softmax를 적용하는 것이 좋음

lm.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # loss=tf.keras.losses.CategoricalCrossentropy(),
    # SparseCategoricalCrossentropy : 학습시점의 y값은 one-hot 안해도 됨, 
    # CategoricalCrossentropy : 학습시점의 y값은 one-hot 해야함
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
'''
from tensorflow.keras.utils import to_categorical
ys_onehot = to_categorical(ys)
'''

In [34]:
lm.fit(xs, ys, epochs=1) # compile설정에서 SparseCategoricalCrossentropy를 사용할 경우
#lm.fit(xs, ys_onehot, epochs=1) # compile설정에서 CategoricalCrossentropy를 사용할 경우



<keras.callbacks.History at 0x130a2867c70>

In [35]:
lm.save('lm.krs')

INFO:tensorflow:Assets written to: lm.krs\assets


## 단어 임베딩

In [36]:
emb1

<keras.layers.core.embedding.Embedding at 0x130a25254e0>

In [37]:
emb1.embeddings

<tf.Variable 'embedding/embeddings:0' shape=(2001, 8) dtype=float32, numpy=
array([[ 0.0017763 , -0.01911992, -0.00221312, ...,  0.03297141,
         0.00495137,  0.01303538],
       [ 0.41046378,  0.43060777, -0.38337484, ...,  0.37061965,
         0.34862566,  0.3844976 ],
       [ 0.32375893,  0.3661929 , -0.3604813 , ...,  0.3352934 ,
         0.32809526,  0.34019604],
       ...,
       [-0.02781241,  0.0594218 ,  0.00724434, ...,  0.01269157,
         0.04090081,  0.06964922],
       [ 0.02197117,  0.04409891, -0.04688103, ...,  0.00060604,
        -0.00566495,  0.00893889],
       [-0.04134064,  0.03204687, -0.04843846, ..., -0.00140203,
         0.04859921, -0.00658491]], dtype=float32)>

In [38]:
e = emb1.embeddings.numpy()
e

array([[ 0.0017763 , -0.01911992, -0.00221312, ...,  0.03297141,
         0.00495137,  0.01303538],
       [ 0.41046378,  0.43060777, -0.38337484, ...,  0.37061965,
         0.34862566,  0.3844976 ],
       [ 0.32375893,  0.3661929 , -0.3604813 , ...,  0.3352934 ,
         0.32809526,  0.34019604],
       ...,
       [-0.02781241,  0.0594218 ,  0.00724434, ...,  0.01269157,
         0.04090081,  0.06964922],
       [ 0.02197117,  0.04409891, -0.04688103, ...,  0.00060604,
        -0.00566495,  0.00893889],
       [-0.04134064,  0.03204687, -0.04843846, ..., -0.00140203,
         0.04859921, -0.00658491]], dtype=float32)

In [40]:
e.shape

(2001, 8)

In [41]:
emb1.get_weights()

[array([[ 0.0017763 , -0.01911992, -0.00221312, ...,  0.03297141,
          0.00495137,  0.01303538],
        [ 0.41046378,  0.43060777, -0.38337484, ...,  0.37061965,
          0.34862566,  0.3844976 ],
        [ 0.32375893,  0.3661929 , -0.3604813 , ...,  0.3352934 ,
          0.32809526,  0.34019604],
        ...,
        [-0.02781241,  0.0594218 ,  0.00724434, ...,  0.01269157,
          0.04090081,  0.06964922],
        [ 0.02197117,  0.04409891, -0.04688103, ...,  0.00060604,
         -0.00566495,  0.00893889],
        [-0.04134064,  0.03204687, -0.04843846, ..., -0.00140203,
          0.04859921, -0.00658491]], dtype=float32)]

In [42]:
w = emb1.get_weights()[0]
w

array([[ 0.0017763 , -0.01911992, -0.00221312, ...,  0.03297141,
         0.00495137,  0.01303538],
       [ 0.41046378,  0.43060777, -0.38337484, ...,  0.37061965,
         0.34862566,  0.3844976 ],
       [ 0.32375893,  0.3661929 , -0.3604813 , ...,  0.3352934 ,
         0.32809526,  0.34019604],
       ...,
       [-0.02781241,  0.0594218 ,  0.00724434, ...,  0.01269157,
         0.04090081,  0.06964922],
       [ 0.02197117,  0.04409891, -0.04688103, ...,  0.00060604,
        -0.00566495,  0.00893889],
       [-0.04134064,  0.03204687, -0.04843846, ..., -0.00140203,
         0.04859921, -0.00658491]], dtype=float32)

In [43]:
np.array_equal(e, w)

True

In [44]:
np.savez('word-emb.npz',emb=e)

## Global Average Pooling 1D

In [45]:
import tensorflow as tf
import numpy as np

In [46]:
x = np.array([[[1,2,3],[3,6,9]]], dtype='float32')
x

array([[[1., 2., 3.],
        [3., 6., 9.]]], dtype=float32)

In [47]:
x.shape

(1, 2, 3)

In [48]:
avg = tf.keras.layers.GlobalAveragePooling1D()
avg

<keras.layers.pooling.global_average_pooling1d.GlobalAveragePooling1D at 0x130aac507c0>

In [49]:
y = avg(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

## 다음 토큰 확률 예측

In [50]:
import joblib

tk = joblib.load('tokenizer.pkl')

In [51]:
xs, ys = joblib.load('lm-data.pkl')

In [52]:
import tensorflow as tf
lm = tf.keras.models.load_model('lm.krs')

In [53]:
x = xs[0:1]
y = ys[0]

In [54]:
x

array([[104, 163,   1,  38]])

In [55]:
y

30

In [56]:
[tk.index_word[i] for i in x[0]]

['much', 'been', '<unk>', 'by']

In [57]:
tk.index_word[64]

'ever'

In [58]:
x.shape

(1, 4)

In [59]:
logit = lm.predict(x.astype('float32'))
logit.shape



(1, 2001)

In [60]:
logit, len(logit[0])

(array([[-3.791658 ,  3.8141322,  3.3130033, ..., -3.945926 , -3.803755 ,
         -3.8463213]], dtype=float32),
 2001)

In [62]:
p = tf.nn.softmax(logit).numpy()
p

array([[4.0738883e-05, 8.1876978e-02, 4.9604867e-02, ..., 3.4914938e-05,
        4.0249019e-05, 3.8571718e-05]], dtype=float32)

In [63]:
p.sum()

0.99999994

In [64]:
p[0, 64]

0.0026435675

In [65]:
i = p.argmax()
i

1

In [67]:
p[0, i]

0.08187698

In [68]:
tk.index_word[i]

'<unk>'

## 전이 학습

In [1]:
import pandas as pd

df = pd.read_csv('imdb.zip')
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


In [2]:
import joblib
tk = joblib.load('tokenizer.pkl')

In [3]:
seqs = tk.texts_to_sequences(df['review'])

In [4]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [5]:
seqs[1]

[24, 522, 51, 12, 60, 409, 2, 736, 56, 46, 2, 337, 1220, 288, 5, 737, 738, 44]

In [7]:
from tensorflow import keras

In [10]:
import tensorflow as tf

pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

In [11]:
pads[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    4,   27,   27,   27,  287,  407, 1217,
         13,   36,    4, 1218, 1219,  408,  142])

## 단어 임베딩 불러오기

In [13]:
import numpy as np

z = np.load('word-emb.npz')
e = z['emb']
e

array([[ 0.0017763 , -0.01911992, -0.00221312, ...,  0.03297141,
         0.00495137,  0.01303538],
       [ 0.41046378,  0.43060777, -0.38337484, ...,  0.37061965,
         0.34862566,  0.3844976 ],
       [ 0.32375893,  0.3661929 , -0.3604813 , ...,  0.3352934 ,
         0.32809526,  0.34019604],
       ...,
       [-0.02781241,  0.0594218 ,  0.00724434, ...,  0.01269157,
         0.04090081,  0.06964922],
       [ 0.02197117,  0.04409891, -0.04688103, ...,  0.00060604,
        -0.00566495,  0.00893889],
       [-0.04134064,  0.03204687, -0.04843846, ..., -0.00140203,
         0.04859921, -0.00658491]], dtype=float32)

## 감성 분석

In [14]:
emb2 = tf.keras.layers.Embedding(
    input_dim=tk.num_words + 1 ,
    output_dim=8,
    embeddings_initializer=tf.keras.initializers.Constant(e)
)

In [15]:
model = tf.keras.Sequential([
    emb2,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(
    loss = 'binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [18]:
y = df['sentiment'].values

In [19]:
model.fit(pads, y)



<keras.callbacks.History at 0x133634baf50>