In [114]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [115]:
MAX_NUM_WORDS   = 10000  #15000
EMBEDDING_DIM   = 300    
MAX_SEQ_LENGTH  = 256    #500
USE_GLOVE       = True
FILTER_SIZES    = [3,4,5]
FEATURE_MAPS    = [200,200,200]
DROPOUT_RATE    = 0.4
HIDDEN_UNITS    = 200
NB_CLASSES      = 2

# LEARNING
BATCH_SIZE      = 100
NB_EPOCHS       = 10
RUNS            = 5
VAL_SIZE        = 0.2

In [116]:
imdb = keras.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=MAX_NUM_WORDS)

In [117]:
print('Text informations:')
print("Training entries: {}, Testting entries: {}".format(len(train_data), len(test_data)))

result1 = [len(x) for x in train_data]
print('\nTrain:max length: %i / min length: %i / mean length: %i / limit length: %i' % 
      (np.max(result1),np.min(result1),np.mean(result1),MAX_SEQ_LENGTH))

result2 = [len(x) for x in test_data]
print('\nTrain:max length: %i / min length: %i / mean length: %i / limit length: %i' % 
      (np.max(result2),np.min(result2),np.mean(result2),MAX_SEQ_LENGTH))

Text informations:
Training entries: 25000, Testting entries: 25000

Train:max length: 2494 / min length: 11 / mean length: 238 / limit length: 256

Train:max length: 2315 / min length: 7 / mean length: 230 / limit length: 256


## test:数据切分(样本均衡)

In [128]:
from sklearn.model_selection import StratifiedKFold
X=np.array([
    [1,2,3,4],
    [11,12,13,14],
    [21,22,23,24],
    [31,32,33,34],
    [41,42,43,44],
    [51,52,53,54],
    [61,62,63,64],
    [71,72,73,74]
])

y=np.array([1,1,0,0,1,1,0,0])
sfolder = StratifiedKFold(n_splits=4,random_state=0,shuffle=True)

for train, test in sfolder.split(X,y):
    print('Train: %s | test: %s' % (train, test))
    print(" ")


Train: [0 1 2 3 5 7] | test: [4 6]
 
Train: [0 1 2 3 4 6] | test: [5 7]
 
Train: [0 2 4 5 6 7] | test: [1 3]
 
Train: [1 3 4 5 6 7] | test: [0 2]
 


## 将整数转换回字词

In [129]:
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()  # {word:index}

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [None]:
print(work_index[])

In [119]:
print(decode_review(train_data[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for wha

## 准备数据

In [120]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=MAX_SEQ_LENGTH)

test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=MAX_SEQ_LENGTH)

In [121]:
print(decode_review(train_data[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for wha

In [107]:
len(train_data[0]), len(train_data[1])

(256, 256)

## 构建模型

In [24]:
# input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = MAX_NUM_WORDS

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

## 创建验证集

In [26]:
x_val = train_data[:10000]
partial_x_train = train_data[10000:]

y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

## 训练模型

In [111]:
from datetime import datetime
import math
start_time = datetime.now()

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=10,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

end_time = datetime.now()
time_diff = (end_time - start_time).total_seconds()
minutes = math.floor(time_diff/60)
seconds = round(time_diff%60,2)
print('Total trainning time: '+ str(minutes) + 'mins ' + str(seconds) +'s (i.e. ' + str(time_diff) +'s)')


Train on 15000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Total trainning time: 0mins 5.28s (i.e. 5.28306s)


In [112]:
results = model.evaluate(test_data, test_labels)

print(results)

InvalidArgumentError: indices[29,53] = 13653 is not in [0, 10000)
	 [[{{node embedding/embedding_lookup}} = ResourceGather[Tindices=DT_INT32, dtype=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding/embeddings, embedding/Cast)]]