In [1]:
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
#set_session(tf.Session(config=config))
session = tf.Session(config=config)

In [2]:
from keras.datasets import imdb
from keras import models
from keras import layers
from keras import callbacks
from keras import regularizers
from keras import backend as K
from keras.wrappers import scikit_learn

from collections import Counter

from sklearn.model_selection import GridSearchCV

import numpy as np

Using TensorFlow backend.


# Classifiying movie reviews: IMDB, a binary classification example

+ 这个例子整理自[《Deep Learning with Python》](https://book.douban.com/subject/27038207/) 3.4节

## 0.探索数据

In [3]:
# 导入数据

# 只选择出现频率最多的前 10000 个单词
max_features = 10000
(x_train, y_train),(x_test, y_test) = imdb.load_data(num_words=max_features)
print('Training data shape:{}, training labels shape:{}'.format(x_train.shape, y_train.shape))
print('Test data shaep:{}, test labels shape:{}'.format(x_test.shape, y_test.shape))

Training data shape:(25000,), training labels shape:(25000,)
Test data shaep:(25000,), test labels shape:(25000,)


In [4]:
# 显示部分数据
#print('No.\t\tLength\t\tContent(the first 10 words)\t\t\t\tTarget')
print('{:<6}{:<10}{:<60}{:<10}'.format('No.', 'Length', 'Content(first 10 words)', 'Targets'))
for i, (x,y) in enumerate(zip(x_train[:100], y_train[:100])):
    target = 'Positive'if y==1 else 'Negative'
    print('{:<6}{:<10}{:<60}{:<10}'.format(i, len(x), str(x[:10]), target))

No.   Length    Content(first 10 words)                                     Targets   
0     195       [1, 103, 319, 14, 22, 13, 8033, 8, 61, 719]                 Negative  
1     211       [1, 2894, 2, 2, 9, 6, 307, 21, 5129, 22]                    Positive  
2     104       [1, 14, 9, 6, 55, 163, 20, 13, 28, 57]                      Positive  
3     120       [1, 4, 1132, 5, 2914, 26, 574, 11, 4, 644]                  Positive  
4     330       [1, 447, 4, 204, 65, 69, 55, 312, 1398, 18]                 Negative  
5     319       [1, 14, 20, 739, 8, 28, 77, 35, 23, 4]                      Negative  
6     253       [1, 146, 35, 2, 2, 5, 2, 5, 16, 1346]                       Negative  
7     80        [1, 737, 20, 261, 13, 104, 12, 69, 4, 986]                  Positive  
8     820       [1, 1065, 3184, 523, 2, 31, 7, 4, 91, 1149]                 Negative  
9     98        [1, 4, 20, 165, 47, 6, 1018, 52, 65, 21]                    Positive  
10    112       [1, 13, 66, 40, 14, 22, 54,

In [6]:
# 统计正负数据的个数、比例

# 训练数据
train_labels_count = Counter(y_train)
test_labels_count = Counter(y_test)
print('Training labels\npositive:{}, negative:{}'.format(train_labels_count[1], train_labels_count[0]))
print('Test labels\npositive:{}, negative:{}'.format(test_labels_count[1], test_labels_count[0]))

Training labels
positive:12500, negative:12500
Test labels
positive:12500, negative:12500


In [20]:
# 将数字转换为单词
word_index = imdb.get_word_index()
num_to_word = { value:key for (key, value) in word_index.items()}
print('Max word index:', max([max(sequence) for sequence in x_train]))
print('Min word index:', min([min(sequence) for sequence in x_train]))

def decode_review(num_to_word, review):
    # i - 3是因为0, 1, 2代表着'padding','start', 'unknow'， 因此单词的下标真正是从3开始的
    decoded = ' '.join( [num_to_word.get(i-3, '?') for i in review])
    return decoded

print(decode_review(num_to_word, x_train[0]))


Max word index: 9999
Min word index: 1
? i rated this movie as awful 1 after watching the trailer i thought this movie could be pretty cool guaranteed to offend everyone the trailer said well it did offend me because this movie really sucks it is hardly a comedy as i laughed about two seconds during the entire movie and what's with all the gays in this movie i'm not gay and i don't have a problem with those who are but what's the point of adding so many gay scenes in a so called comedy movie when these scenes are absolutely not funny i guess the director is a gay man in denial or something like that br br so my advice to you is if you want to waste good money go rent a good comedy you've already seen a million times you'll be better off than watching this mother of all lousy ? it really is total crap


In [29]:
# 打印出使用频率最高的一些单词
for i in range(1, 20):
    print('No.%d \t\t %s'%(i, num_to_word[i]))

No.1 		 the
No.2 		 and
No.3 		 a
No.4 		 of
No.5 		 to
No.6 		 is
No.7 		 br
No.8 		 in
No.9 		 it
No.10 		 i
No.11 		 this
No.12 		 that
No.13 		 was
No.14 		 as
No.15 		 for
No.16 		 with
No.17 		 movie
No.18 		 but
No.19 		 film


### 0.结论
+ 训练数据有25000条，训练数据的长度不是固定。测试数据25000条，测试数据长度不固定。因此需要将每条数据进行固定长度，太长的数据进行截断，太短的数据进行填充
+ 单词转为数字才能作为网络的输入，数字越小说明单词出现频率越高，但是一些高频率单词，例如the, and , a之类的并没有提供有效的信息，因此可以考虑将这部分单词过滤，数字的范围 [1, max_feature]。 
+ 样本的正负比例是1:1，无论是训练样本还是测试样本，都是12500条正样本， 12500条负样本
+ 样本的排列已经是随机状态。

## 1. 定义问题

+ 输入数据：$x\in R^n$, $n$表示长度，$x_i$是一个数字，表示一个单词，不同的$x$，n的值不同
+ 输出：$y\in\{0, 1\}$, 0表示负面评价，1表示正面评价
+ 问题归类：属于二元分类问题

## 2. 衡量指标

+ 这是一个正负比例平衡的问题，因此选择Accuracy作为衡量模型指标

## 3. 验证策略

+ 数据够多，选择hold-out验证策略

## 4. 准备数据

+ 过滤出现频率过高的单词，过滤前10%
+ 固定样本长度，过长截断，过短填充
+ 将数据向量化，有两种策略（两种都试试吧）：
    1. 将$x$转换为一个$X\in R^{M}$，M是单词最多个数，$X_i\in\{0, 1\}$，1表示数字为$i$的单词出现在$x$中
    2. 加入EMbedding层

In [3]:
# 单词个数
max_features = 10000
skips = 50

# 导入数据，设置过滤个数，单词总数
(x_train, y_train),(x_test, y_test) = imdb.load_data(num_words=max_features, skip_top=skips)

In [4]:
# 向量化（策略1）
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
        
    return results

x_train = vectorize_sequences(x_train, max_features)
x_test = vectorize_sequences(x_test, max_features)
print('x_train shape:{}, x_test shaep:{}'.format(x_train.shape, x_test.shape) )

x_train shape:(25000, 10000), x_test shaep:(25000, 10000)


In [5]:
# 目标向量化
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

## 5. 简单模型

+ 该问题的base line 是0.5。建立一个简单的模型，准确率高于0.5.

In [7]:
def build_base_model():
    model = models.Sequential()
    
    model.add(layers.Dense(16, activation='relu', input_shape=(max_features, )))
    model.add(layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

# 建立模型
base_model = build_base_model()
# 回调函数
callback_list = [callbacks.EarlyStopping(monitor='val_loss', patience=8),
                callbacks.ModelCheckpoint('best_base_model.h5', save_best_only=True),
                callbacks.TensorBoard('./logs', histogram_freq=1)]
base_model.fit(x_train, y_train, epochs=20, batch_size=32, callbacks=callback_list, validation_split=0.3)

Train on 17500 samples, validate on 7500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x7f5cad8a9ef0>

### 5.1 结论

+ 在验证集上准确率最高83%左右，但是一开始就过拟合了。

## 6. 全面升级：开发一个过拟合的模型

+ 基本模型已经过拟合，这一步省略

## 7. 调整参数

+ 尝试Dropout、Batch Normaliztion等
+ 添加L1/L2正则化
+ 尝试不同的网络结构，添加层或者删除层
+ 尝试不同的超参数，例如神经元的个数，batch_size等等

In [18]:
def create_model(dropout=0.2, L=0.001):
    model = models.Sequential()
    
    model.add(layers.Dense(16,
                           activation='relu', kernel_initializer='he_normal', 
                           input_shape=(max_features, )))
    model.add(layers.Dropout(dropout))
    
    model.add(layers.BatchNormalization()) 
    model.add(layers.Dense(16,activation='relu', kernel_initializer='he_normal'))
    model.add(layers.Dropout(dropout))
    
    model.add(layers.BatchNormalization()) 
    #model.add(layers.Dense(16, activation='relu', kernel_initializer='he_normal'))
    #model.add(layers.Dropout(dropout))
    
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

In [19]:
K.clear_session()
dropout = 0.5
L = 0.001
model = create_model(dropout, L)

# 回调函数
callback_list = [callbacks.EarlyStopping(monitor='val_loss', patience=7),
                callbacks.ModelCheckpoint('best_model.h5', save_best_only=True),
                callbacks.TensorBoard('./logs', histogram_freq=1)]

model.fit(x_train, y_train, epochs=50, batch_size=64, callbacks=callback_list, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50


<keras.callbacks.History at 0x7fa095863c18>

In [None]:
# 调整参数后，利用所有数据进行训练
model = create_model(dropout, L)
# 回调函数
model.fit(x_train, y_train, epochs=3, batch_size=64)
model.evaluate(x_test, y_test, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [20]:
# 导入最优模型
model = models.load_model('best_model.h5')
model.evaluate(x_test, y_test, batch_size=64)



[0.29331848189353943, 0.87672000013351437]