## 데이터 준비

In [1]:
import pandas as pd
df = pd.read_csv('imdb.zip')

In [2]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
import joblib
tk = joblib.load('tokenizer.pkl')

In [4]:
from sklearn.model_selection import train_test_split
review_train, review_test, y_train, y_test = train_test_split(df['review'],\
                            df['sentiment'], test_size=0.2, random_state=42)

In [5]:
# 토큰화한다
seqs = tk.texts_to_sequences(review_train)

In [6]:
review_train.iloc[0]

'It is an insane game.'

In [7]:
seqs[0]

[9, 6, 33, 1258, 214]

In [8]:
seqs[:3]

[[9, 6, 33, 1258, 214],
 [178, 5, 28, 35, 23, 168, 713, 591, 3, 713, 1, 10, 1, 280],
 [206, 336, 4]]

## 순방향 순환신경망

In [9]:
import tensorflow as tf

In [10]:
# maxlen : None(기본값), 10 : 10으로 문장의 길이를 잘라서 패딩처리
# padding : pre(기본값), post(뒤쪽에 0 패딩처리), pre 권장
# truncating : pre(기본값), maxlen 값이 설정이 되야지 동작,성능 테스트 후 적당한걸 선택 
pads = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=None,\
                            padding='pre', truncating='pre')

In [11]:
len(pads), pads.shape

(800, (800, 73))

In [12]:
NUM_WORDS= tk.num_words + 1

## 순방향 순환신경망

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True),
    tf.keras.layers.LSTM(8),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 lstm (LSTM)                 (None, 8)                 544       
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 16,561
Trainable params: 16,561
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e67e339300>

## 역방향 순환신경망

### pre padding 적용

In [17]:
# 패딩을 post로 처리하는 것이 좋음, 순방향일 경우는 pre로 처리하는 것을 권장
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True),
    tf.keras.layers.LSTM(8, go_backwards=True),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [18]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e67f5a2d40>

### post padding 적용

In [20]:
pads = tf.keras.preprocessing.sequence.pad_sequences(seqs, padding='post')

In [21]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True),
    tf.keras.layers.LSTM(8, go_backwards=True),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [22]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e60f09ead0>

## 양방향 순환신경망

In [24]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [25]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 8)           16008     
                                                                 
 bidirectional (Bidirectiona  (None, 16)               1088      
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 17,113
Trainable params: 17,113
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e61649e4d0>