In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the TensorFlow build, the interpreter version, and the version of
# every major library so the environment of this run is visible in the output.
print(tf.__version__)
print(sys.version_info)
print(
    *(f"{lib.__name__} {lib.__version__}" for lib in (mpl, np, pd, sklearn, tf, keras)),
    sep="\n",
)

2.0.0
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 1.0.0
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


## 处理数据

In [None]:
imdb = keras.datasets.imdb
vocab_size, index_from = 10000, 3

# Load the data set.
# num_words sets the vocabulary size: the first vocab_size words are kept and
# everything beyond that is treated as a special token.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size,
    index_from=index_from,
)


In [None]:
# First encoded review together with its label.
print(train_data[0], train_labels[0])
# Shapes of the review container and the label container.
print(*(arr.shape for arr in (train_data, train_labels)))
# Lengths of the first two reviews: the data is variable-length.
print(*map(len, train_data[:2]))

In [None]:
# Load the vocabulary (word -> index mapping) that ships with the data set.
word_index = imdb.get_word_index()
print(len(word_index))
print(type(word_index))
# Show only a small sample: printing the whole dictionary floods the cell
# output and bloats the saved notebook.
print(list(word_index.items())[:10])

In [None]:
# Shift every vocabulary index by index_from so the mapping lines up with the
# ids produced by load_data(..., index_from=index_from).
# The mapping is rebuilt from imdb.get_word_index() instead of from word_index
# itself, so re-running this cell does not shift the indices a second time.
word_index = {word: idx + index_from for word, idx in imdb.get_word_index().items()}

In [None]:
# The lowest indices are reserved for special tokens.
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
# Bracketed like the other special tokens (previously the bare key 'END').
word_index['<END>'] = 3

# Inverse mapping (index -> word), used to turn encoded reviews back into text.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text_ids):
    """Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids that
    are not present there are rendered as ``'<UNK>'``.
    """
    words = (reverse_word_index.get(token_id, '<UNK>') for token_id in text_ids)
    return ' '.join(words)

# Decode the first training review as a sanity check.
decode_review(train_data[0])

### 数据补全和截断

In [None]:
max_length = 500

# Bring every review in both splits to exactly max_length ids with the same
# settings:
#   value   - the id used for filling (the '<PAD>' token)
#   padding - 'post' puts the padding after the sentence, 'pre' before it
#   maxlen  - target length of every sequence
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        sequences,
        value=word_index['<PAD>'],
        padding='post',
        maxlen=max_length,
    )
    for sequences in (train_data, test_data)
)
print(train_data[0])

In [None]:
embedding_dim = 16
batch_size = 128
epochs = 5

model = keras.models.Sequential()

# Embedding layer:
#   1. defines a [vocab_size, embedding_dim] matrix holding one vector per word;
#   2. embeds every word of every sentence, giving a
#      [batch_size, max_length, embedding_dim] tensor.
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# Alternative kept for reference - a single unidirectional RNN layer:
#   keras.layers.LSTM(units=64, return_sequences=False)
# return_sequences: False on the last recurrent layer, True otherwise.

# Stacked (multi-layer) bidirectional RNN.
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=64, return_sequences=True)))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=64, return_sequences=False)))

# Classification head: one hidden layer and a sigmoid output unit.
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# validation_split: fraction of the training data that is used as the
# validation set.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot one training metric next to its validation counterpart.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        label: metric name to plot; ``'val_' + label`` is plotted with it.
        epochs: upper bound of the x axis.
        min_value: lower bound of the y axis.
        max_value: upper bound of the y axis.
    """
    curve_names = (label, 'val_' + label)
    curves = {name: history.history[name] for name in curve_names}
    pd.DataFrame(curves).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()

# Draw the accuracy curves first, then the loss curves, on the same axis range.
for metric in ('accuracy', 'loss'):
    plot_learning_curves(history, metric, epochs, 0, 1.5)

In [None]:
# Evaluate the trained model on the padded test split; the compiled loss and
# the 'accuracy' metric are displayed as the cell result.
model.evaluate(
    x=test_data,
    y=test_labels,
    batch_size=batch_size,
)