In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
from keras.datasets import mnist

# Load MNIST as (images, labels) pairs for the training and the test split.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

Using TensorFlow backend.


In [3]:
# Inspect the shape of the raw training images before any preprocessing.
train_images.shape

(60000, 28, 28)

In [4]:
# Preprocess: flatten every 28x28 image into a 784-element vector (the layout
# the dense network expects) and rescale the pixel values to [0, 1].
# The sample axis is inferred with -1 rather than hard-coded as 60000 / 10000,
# so the cell also works when only a subset of the data is loaded.
# NOTE: not idempotent -- re-running this cell divides by 255 a second time.
train_images = train_images.reshape((-1, 28 * 28))
train_images = train_images.astype("float32") / 255

test_images = test_images.reshape((-1, 28 * 28))
test_images = test_images.astype("float32") / 255

In [5]:
# Network architecture: a 512-unit ReLU hidden layer followed by a 10-way
# softmax output, declared in one expression by passing the layer list to
# Sequential.
from keras import models
from keras import layers

network = models.Sequential([
    layers.Dense(512, activation="relu", input_shape=(28 * 28,)),
    layers.Dense(10, activation="softmax"),
])

In [6]:
# Compile: choose the optimizer, the loss to minimise, and the metric that is
# reported alongside the loss.
network.compile(optimizer="rmsprop",
                loss="categorical_crossentropy",
                metrics=["accuracy"])

In [7]:
# Prepare the labels: one-hot encode the integer class ids so they line up
# with the 10-way softmax output and the categorical_crossentropy loss.
from keras.utils import to_categorical

# Pin the width to the 10 digit classes instead of letting to_categorical
# infer it from the largest label present; otherwise a split that happens to
# lack the highest digit would yield a matrix narrower than the output layer.
# NOTE: the label names are rebound in place, so run this cell only once per
# fresh load of the labels.
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)

In [8]:
# Fit the network on the training set: 5 passes over the data in
# mini-batches of 128 samples.
network.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1750b14e9e8>

> loss: 网络在训练集上的损失  
acc: 网络在训练数据集上的accuracy

In [9]:
# Evaluate on the test set. The result is unpacked into the loss and the
# accuracy metric requested at compile time; only the accuracy is printed.
test_loss, test_acc = network.evaluate(test_images, test_labels)
print("Accuracy on test images: {:.4f}".format(test_acc))

Accuracy on test images: 0.9803


> 测试集accuracy为98.03%，比训练集accuracy（98.89%）低不少，这就是**过拟合**（指模型在新数据上的性能往往比在训练集上要差）。

## 3.1 电影评论分类：二分类问题

+ IMDB数据集

50000条严重两极化的评论，一半用于训练，一半用于测试，分别都包含一半正面评论，一半负面评论。

数据已经过预处理：评论（单词序列）已经被转化为整数序列，其中每个整数代表字典中的某个单词。

In [25]:
from keras.datasets import imdb

# num_words=10000 keeps only the 10,000 most frequent words of the training
# data; rarer words are discarded.
# NOTE: this rebinds train_labels / test_labels, replacing the MNIST labels
# defined earlier in the notebook.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000)

> `train_data`, `test_data`这两个变量是评论组成的列表，每条评论又是单词索引组成的列表（表示一系列单词）。  
`train_labels`, `test_labels`都是0和1组成的列表，0代表负面，1代表正面。

In [11]:
 train_data.shape  # one entry per training review

(25000,)

In [21]:
# Peek at the first three reviews; each one is a list of integer word indices.
train_data[:3]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [18]:
# Largest word index across all training reviews. It stays below 10000
# because the vocabulary was capped at the 10,000 most frequent words.
max(map(max, train_data))

9999

In [35]:
# Show the training labels (0 = negative review, 1 = positive review).
train_labels

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

+ 准备数据

**将列表转换为张量**。方法有二：
1. 填充列表，使其具有相同的长度，再将列表转换成形状为(samples, word_indices)的整数张量，然后网络第一层可以使用能处理这种整数张量的层（即`Embedding`层）；
2. 对列表进行one-hot编码，将其转换为0和1组成的向量。然后网络第一层可以使用`Dense`层。

In [30]:
def vectorize_sequences(sequences, dimensions=10000, dtype=float):
    """Multi-hot encode integer sequences into a 2-D binary matrix.

    Args:
        sequences: Sized iterable of integer index sequences (for example
            lists of word indices).
        dimensions: Number of columns of the result, i.e. the vocabulary
            size. Every index must be a valid column index.
        dtype: NumPy dtype of the result. The default ``float`` (float64)
            matches what the function has always returned; pass
            ``"float32"`` to halve the memory of large matrices.

    Returns:
        Array of shape ``(len(sequences), dimensions)`` in which entry
        ``[i, j]`` is 1 when index ``j`` occurs in ``sequences[i]`` and 0
        otherwise. Repetitions and ordering of the indices are not recorded.

    Raises:
        IndexError: If a sequence holds an index outside the column range.
    """
    # Start from an all-zero matrix with one row per sequence.
    results = np.zeros((len(sequences), dimensions), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # Fancy indexing switches on every column listed in `sequence` at once.
        results[i, sequence] = 1
    return results

In [31]:
# Multi-hot encode the training and the test reviews with the same helper.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

In [32]:
# One row per review, one column per vocabulary index.
x_train.shape

(25000, 10000)

In [33]:
# First encoded review: 1.0 at every word index it contains, 0.0 elsewhere.
x_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [34]:
# Vectorize the labels: copy the integer 0/1 labels into float32 arrays.
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')

In [36]:
# Confirm the labels are now a float32 array.
y_train

array([1., 0., 0., ..., 0., 1., 0.], dtype=float32)

+ 构建网络

In [39]:
# Define the model: two 16-unit ReLU hidden layers over the 10000-dimensional
# multi-hot input, then a single sigmoid output unit. The layers are handed
# to Sequential as one list.
from keras import models
from keras import layers

model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000, )),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

+ 编译模型

In [40]:
# Compile the model using string shortcuts for the optimizer, the loss and
# the metric.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

可以自定义这三个参数：

In [41]:
# Configure the optimizer by passing an optimizer instance instead of its
# string name, which makes its arguments (here the learning rate) explicit.
from keras import optimizers

model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [42]:
# Supply the loss and the metric as function objects instead of string names;
# this is the hook for plugging in custom losses or metrics.
from keras import losses
from keras import metrics

model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

+ 验证

从原始训练集中留出10000个样本作为验证集：

In [43]:
# Hold out the first 10000 training samples for validation; the remaining
# samples form the partial training set the model is fitted on.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

In [45]:
# Compile once more with the string shortcuts before training.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# Train for 20 epochs in mini-batches of 512 samples.
# validation_data makes fit() also monitor the loss and the accuracy on the
# held-out validation set; the object returned by fit() is kept in `history`.
history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
                    validation_data=(x_val, y_val))

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
