In [1]:
import requests

# Download the zipped IMDB review dataset and store it next to the notebook.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being written out as imdb.zip.
res = requests.get(
    'https://github.com/euphoris/datasets/raw/master/imdb.zip',
    timeout=30,
)
res.raise_for_status()

with open('imdb.zip', 'wb') as f:
    f.write(res.content)

In [2]:
import pandas as pd
# Load the downloaded reviews into a DataFrame directly from the zip archive.
df = pd.read_csv('imdb.zip')

In [3]:
# Preview the first rows to check the columns (review text and sentiment label).
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
import tensorflow as tf

In [6]:
# Tokenizer configured with a 2000-word limit; '<unk>' is the placeholder token
# used for out-of-vocabulary words.
tk = tf.keras.preprocessing.text.Tokenizer(
    num_words=2000,
    oov_token='<unk>',
)

In [7]:
# Build the tokenizer's vocabulary from every review in the dataset.
tk.fit_on_texts(df['review'])

In [9]:
# Look up the integer id the tokenizer assigned to the word 'good'.
tk.word_index['good']

31

In [10]:
# Reverse lookup: index_word maps an integer id back to its word.
tk.index_word[31]

'good'

In [34]:
import joblib
# Persist the fitted tokenizer so later sections can reload it instead of refitting.
joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

### 전처리

In [12]:
import pandas as pd
# Reload the review dataset so the preprocessing section can run on its own.
df = pd.read_csv('imdb.zip')

In [35]:
import joblib
# Restore the fitted tokenizer from tokenizer.pkl.
# joblib.load unpickles the file, so only load files produced by this notebook.
tk = joblib.load('tokenizer.pkl')

In [36]:
# Sanity-check the reloaded DataFrame by previewing its first rows.
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [37]:
# Convert each review into a list of integer token ids.
seqs = tk.texts_to_sequences(df['review'])

In [38]:
# Inspect the token ids produced for the first review.
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [39]:
# Spot-check one id from the first sequence by mapping it back to its word.
tk.index_word[407]

'moving'

In [40]:
# Keep the first encoded review at hand for experimenting.
# NOTE(review): the window-building loop that follows reuses the name `seq`
# as its loop variable, so this value is overwritten there.
seq = seqs[0]

In [41]:
# Build (context, target) training pairs for the language model: every run of
# CONTEXT_SIZE consecutive token ids is paired with the token id that follows it.
# Reviews with CONTEXT_SIZE tokens or fewer yield an empty range and add nothing.
CONTEXT_SIZE = 4  # number of preceding tokens used to predict the next one
data = []
for seq in seqs:
    for start in range(len(seq) - CONTEXT_SIZE):
        stop = start + CONTEXT_SIZE
        data.append((seq[start:stop], seq[stop]))

In [42]:
# Inspect the first (context, target) pair before shuffling.
data[0]

([4, 27, 27, 27], 287)

In [43]:
import random

In [44]:
# Shuffle the training pairs in place with a dedicated, seeded generator so the
# resulting order (and the dataset saved from it) is the same on every run and
# the global random state is left untouched.
random.Random(42).shuffle(data)

In [45]:
# Inspect the first pair again to confirm the order changed after shuffling.
data[0]

([728, 3, 1, 1093], 66)

In [46]:
import numpy as np

In [47]:
# Split the (context, target) pairs into two parallel arrays:
# xs holds the context windows, ys holds the token id that follows each window.
contexts = [pair[0] for pair in data]
targets = [pair[1] for pair in data]
xs = np.array(contexts)
ys = np.array(targets)

In [48]:
# Save the input/target arrays together so the training section can reload them.
joblib.dump((xs, ys), 'lm-data.pkl')

['lm-data.pkl']

### 학습

In [49]:
import joblib
# Restore the fitted tokenizer and the (inputs, targets) arrays written by the
# preprocessing section, so training can start from the saved files.
tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('lm-data.pkl')

In [50]:
import tensorflow as tf

In [52]:
NUM_WORD = tk.num_words + 1    
# Word ids in tk.index_word start at 1 rather than 0: id 0 is kept free so it
# can be used as the fill value when sequences are padded to a common length.

In [55]:
# Inspect one training input: a context window of four token ids.
xs[0]

array([ 728,    3,    1, 1093])

In [61]:
# Next-word model: embed the context tokens, average the embeddings into a single
# vector, pass it through a small hidden layer, then emit one raw score (logit)
# per vocabulary id. The embedding layer is kept in its own variable (emb1) so
# its weights can be read back after training.
emb1 = tf.keras.layers.Embedding(input_dim=NUM_WORD, output_dim=8)
lm = tf.keras.Sequential()
lm.add(emb1)
lm.add(tf.keras.layers.GlobalAveragePooling1D())
lm.add(tf.keras.layers.Dense(8, activation='relu'))
lm.add(tf.keras.layers.Dense(NUM_WORD))

In [62]:
# Print the layer-by-layer architecture and parameter counts.
lm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 2001)              18009     
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [63]:
# The output layer has no softmax activation, so the loss has to be told that it
# receives raw logits (from_logits=True).
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
lm.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

In [64]:
# Train for 10 epochs on the (context window, next token) pairs.
lm.fit(xs, ys, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2629ff25b50>

In [65]:
# Write the trained model to disk under 'lm.krs'.
lm.save('lm.krs')

INFO:tensorflow:Assets written to: lm.krs\assets


In [67]:
# Pull the learned embedding matrix out of the Embedding layer as a NumPy array.
e = emb1.embeddings.numpy()

In [68]:
# Expect one row per vocabulary id (NUM_WORD) and one column per embedding dimension (8).
e.shape

(2001, 8)

In [69]:
import numpy as np

In [70]:
# Alternative route to the same matrix: the first array in the layer's weight list.
w = emb1.get_weights()[0]

In [71]:
# Confirm that both ways of reading the embedding weights return identical values.
np.array_equal(e, w)

True

In [72]:
# Save the learned embedding matrix to word_emb.npz under the key 'emb'.
# The array `e` itself is passed so the file holds the trained weights.
np.savez('word_emb.npz', emb=e)