# IMDb dataset for sentiment classification

- 영화 평가 사이트 IMDb의 리뷰를 바탕으로 만들어짐 (http://www.imdb.com/interfaces/)
- sentiment classification을 위한 대표적인 벤치마크 데이터
- training data와 test data 각각 25,000개의 리뷰가 담겨 있음

![](https://iksinc.files.wordpress.com/2015/09/image.png)

## Load modules

In [1]:
import numpy as np
import tensorflow as tf

tf.random.set_seed(0)

## Load data

In [2]:
from tensorflow.keras.datasets import imdb

maxlen = 200
num_words = 8000

(X_trn, y_trn), (X_tst, y_tst) = imdb.load_data(
    path='imdb.pkl',
    num_words=num_words,
    skip_top=0,
    maxlen=maxlen,
    seed=0, #113
    start_char=1,
    oov_char=2,
    index_from=3)

print("Training Set:   {} samples".format(len(X_trn)))
print("Test Set:       {} samples".format(len(X_tst)))
print()
print("An example of sequence and its label:")
print('- sequence:', X_trn[0])
print('- label:', y_trn[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Training Set:   25000 samples
Test Set:       3913 samples

An example of sequence and its label:
- sequence: [1, 14, 9, 31, 7, 148, 102, 198, 269, 8, 30, 4378, 5, 3094, 5, 305, 630, 56, 2, 32, 120, 410, 260, 110, 12, 33, 6, 2, 22, 1413, 13, 16, 3704, 34, 4, 185, 1170, 2, 825, 355, 901, 56, 190, 120, 32, 1054, 56, 179, 685, 10, 10, 45, 254, 8, 6167, 6, 283, 65, 237, 225, 24, 76, 15, 70, 30, 224, 44, 4, 114, 21, 13, 258, 14, 4229, 3650, 5, 5028, 2279, 45, 465, 5, 220, 2950, 3370, 6, 5503, 948, 3174, 7, 4, 4039, 19, 2, 228, 5, 2, 491, 1969, 12, 43, 152, 157, 49, 139, 121, 38, 954, 15, 305, 7, 2, 4299, 61, 311, 16, 2, 2, 5, 2660, 523, 10, 10, 4, 65, 47, 35, 221, 863, 21, 14, 43, 2, 2, 83, 6, 465, 4309, 7941]
- label: 0


#### [참고] word indexing

In [3]:
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+3) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value:key for key,value in word_to_id.items()}

print(' '.join(id_to_word[i] for i in X_trn[0]))
print(y_trn[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
<START> this is one of those movies that's trying to be moody and tense and instead ends up <UNK> all over itself having seen it at a <UNK> film festival i was intrigued by the young college <UNK> gone wrong write up however over all ended up quite disappointed br br it's hard to critique a true story since there's not much that can be done about the plot but i found this disjointed melodramatic and wholly depressing it's dark and almost sinister painting a darn creepy flash of the seventies with <UNK> music and <UNK> close ups it just doesn't work some scenes where so cheesy that instead of <UNK> awe my audience was <UNK> <UNK> and rolling eyes br br the story has an interesting premise but this just <UNK> <UNK> into a dark miserable spiral
0


# Preprocess data

training set에서 보지 못한 단어를 test set으로부터 삭제

In [4]:
def get_vocabulary_size(X):
    """
    input (X): [num_document, document_size (variable_length)]
    output   : vocabulary size
    """
    return max([max(doc) for doc in X]) + 1  # plus the 0th word

def fit_in_vocabulary(X, voc_size):
    """
    convert the index of OOV (out-of-vocabulary) word to "2"
    input (X): [num_document, document_size (variable_length)]
    output   : [num_document, document_size (variable_length)]
    """
    for i, doc in enumerate(X):
        X[i] = [w if w < voc_size else 2 for w in doc]
    return X

vocabulary_size = get_vocabulary_size(X_trn)
X_tst = fit_in_vocabulary(X_tst, vocabulary_size)
print('vocabulary_size', vocabulary_size)

vocabulary_size 8000


zero-padding: 모든 sequence의 길이를 동일하게 만들어줌

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

print('Pad sequences (samples x time)')
X_trn = pad_sequences(X_trn, maxlen=maxlen, padding='post')
X_tst = pad_sequences(X_tst, maxlen=maxlen, padding='post')
print('x_train shape:', X_trn.shape)
print('x_test shape:', X_tst.shape)

Pad sequences (samples x time)
x_train shape: (25000, 200)
x_test shape: (3913, 200)


batch dataset 생성

In [8]:
batch_size = 256
train_dataset = (tf.data.Dataset.from_tensor_slices((X_trn, y_trn))
                 .shuffle(len(X_trn)).batch(batch_size))
test_dataset = (tf.data.Dataset.from_tensor_slices((X_tst, y_tst))
                .batch(batch_size))

# Build graph

![](https://image.slidesharecdn.com/l07nnrnngrulstm-151108140716-lva1-app6892/95/recurrent-neural-networks-lstm-and-gru-14-638.jpg?cb=1446992496)

We need many-to-one structure!

![](http://deeplearning.net/tutorial/_images/lstm.png)

In [9]:
from tensorflow.keras import Model, layers

# Network Parameters
num_input = num_words # number of sequences.
timesteps = maxlen    # timesteps.
embedding_dim = 16 # embedding dimention of one-hot vectors
num_units = 32 # number of neurons for the LSTM layer.

In [10]:
# Create LSTM Model.
class LSTM(Model):
    # Set layers.
    def __init__(self):
        super(LSTM, self).__init__()
        self.embedding_layer = layers.Embedding(input_dim=vocabulary_size, 
                                                output_dim=embedding_dim,
                                                mask_zero=True)
        self.lstm_layer = layers.LSTM(units=num_units, 
                                      return_sequences=True,
                                      zero_output_for_mask=True)
        self.out = layers.Dense(1, activation='sigmoid')

    # Set forward pass.
    def call(self, x, is_training=False):
        # Embedding layer
        # input=[batch_size, maxlen, vocabulary_size]
        # output=[batch_size, maxlen, embedding_dim]
        x_embedded = self.embedding_layer(x)
        
        # mask for zero-padded parts
        mask = self.embedding_layer.compute_mask(x)
        
        # LSTM layer.
        # input=[batch_size, maxlen, embedding_dim]
        # output=[batch_size, maxlen, num_units]
        rnn_outputs = self.lstm_layer(x_embedded, mask=mask)
        
        # average outputs over time axis
        # input=[batch_size, maxlen, num_units]
        # output=[batch_size, num_units]
        casted_mask = tf.cast(mask, tf.float32)
        masked_outputs = tf.multiply(rnn_outputs, 
                                     tf.expand_dims(casted_mask, 2))
        lengths = tf.reduce_sum(casted_mask, axis=1)
        average = tf.divide(tf.reduce_sum(masked_outputs, axis=1), 
                            tf.expand_dims(lengths, 1))
        
        # Output layer
        output = self.out(average)
        
        return output

## Train and Test

In [11]:
batch_size = 256
epochs = 10

model = LSTM()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(train_dataset, epochs=epochs,
                    validation_data=test_dataset)

test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5010825004428625
Test Accuracy: 0.8466649651527405


In [12]:
# Create LSTM Model.
class BidirectionalLSTM(LSTM):
    # Set layers.
    def __init__(self):
        super(BidirectionalLSTM, self).__init__()
        self.lstm_layer = layers.Bidirectional(self.lstm_layer, merge_mode='sum')

In [13]:
batch_size = 256
epochs = 10

model = BidirectionalLSTM()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(train_dataset, epochs=epochs,
                    validation_data=test_dataset)

test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.40853938832879066
Test Accuracy: 0.8832098245620728


## Check the sentiment of your review!

In [14]:
def get_sentiment_score(review_text):
    review_seq = review_text.split(' ')
    data = [[word_to_id[w] for w in review_seq]]
    data = fit_in_vocabulary(data, vocabulary_size)

    padded_data = pad_sequences(data, maxlen=maxlen, padding='post')
    y = model.predict(padded_data)
    return y.flatten()

review_text = 'totally predictable but good for weekend'
# review_text = 'how lovely the actress'
# review_text = 'good soundtrack'
# review_text = 'good ost'
# review_text = 'horrible'

y = get_sentiment_score(review_text)
print(review_text)
print('==> sentment score', y.round(2), '--',
      'good' if y > 0.5 else 'bad', 'movie!')

totally predictable but good for weekend
==> sentment score [0.23] -- bad movie!
