https://mp.weixin.qq.com/s?__biz=MzIwODI2NDkxNQ==&mid=2247489340&idx=3&sn=197a57a12954ad4284ab083488342397&chksm=97049c80a0731596d7579466c6844931cd41b8bcb101c7ffb076abfe32c767cc506d6975c943&scene=126&sessionid=1593141601&key=f9077bed3f45e74f988ce8319873edc5b241c70876b516087171441cd121edc0f830c3d617bca92a8674bd3164edf3c117d5e0090900dac7877e8ef6292633619a760a35bfa19296d43ae0bcbf81d337&ascene=1&uin=MjA1MjAyODkxNg%3D%3D&devicetype=Windows+10+x64&version=6209051a&lang=zh_CN&exportkey=AUH%2B%2Fy8DNYIDM%2F0pFJuceXI%3D&pass_ticket=nkufbuZ2D6vSURECYdsr2t3GiiqSA3OjPYMwQpdvEoQBiuvtCxGyPYv9wNqyLZP7

#### 数据下载

In [1]:
import tensorflow as tf

# 下载IMDB数据
vocab_size = 10000 # 保留词的个数
imdb = tf.keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)
print("train len:", len(train_data))    # [25000]
print("test len:", len(test_data))    # [25000]

train len: 25000
test len: 25000


#### 重构词的索引

In [2]:
# 一个将单词映射到整数索引的词典
word_index = imdb.get_word_index()
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

#### 简单预处理

In [3]:
# 统一文本序列长度
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data,value=word_index["<PAD>"],padding="post",truncating="post",maxlen=256)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data,value=word_index["<PAD>"],padding="post",truncating="post",maxlen=256)

#### 模型构建

In [4]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [5]:
class MyModel(tf.keras.models.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, 16)
        self.g_avg_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.d1 = tf.keras.layers.Dense(16, activation="relu")
        self.d2 = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs, training=None, mask=None):
        # inputs: [batch_size, seq_len]
        x = self.embedding(inputs)    # [batch_size, seq_len, 16]
        x = self.g_avg_pool(x)    # [batch_size, 16]
        x = self.d1(x)    # [batch_size, 16]
        x = self.d2(x)    # [batch_size, 1]]
        return x

#### 模型训练与评估

In [6]:
# 配置模型训练参数
model.compile(optimizer=tf.keras.optimizers.Adam(),loss=tf.keras.losses.BinaryCrossentropy(),metrics=[tf.keras.metrics.BinaryAccuracy()])
# 训练模型
history = model.fit(train_data,train_labels,epochs=40,batch_size=512)
# 评估测试集
model.evaluate(test_data,test_labels,verbose=2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
782/782 - 0s - loss: 0.4249 - binary_accuracy: 0.8554


[0.4248790740966797, 0.855400025844574]

#### 模型的保存与加载

In [7]:
# 保存权重
model.save_weights("../dataset/my_checkpoint")
# 加载权重
new_model = MyModel()
# 预测之前需要先编译
new_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
new_model.load_weights("../dataset/my_checkpoint")
# 评估测试集
new_model.evaluate(test_data,test_labels,verbose=2)

782/782 - 1s - loss: 0.6931 - accuracy: 0.5066


[0.6931140422821045, 0.506600022315979]

#### hdf5方式

In [8]:
# 只能用户functional model or a sequential model，目前不能用于subclass model
# 保存模型
model.save("../dataset/my_model.h5")
# 加载模型
# 重新创建完全相同的模型，包括其权重和优化程序
new_model = tf.keras.models.load_model("../dataset/my_model.h5")
# 显示网格结构
new_model.summary()
# 评估测试集
new_model.evaluate(test_data,test_labels,verbose=2)

W0715 15:54:47.591032 31260 hdf5_format.py:201] Error in loading the saved optimizer state. As a result, your model is starting with a freshly initialized optimizer.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________
782/782 - 0s - loss: 0.4249 - binary_accuracy: 0.8554


[0.4248790740966797, 0.855400025844574]

#### saved_model

In [None]:
# 保存模型
tf.saved_model.save(model,'../dataset/test_model')
# 加载模型
new_model = tf.saved_model.load('../dataset/test_model')
# 预测结果
result = new_model(test_data)