In [117]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Bidirectional, TimeDistributed
from keras.layers.recurrent import SimpleRNN
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.callbacks import EarlyStopping
import os
import tarfile
import numpy as np
import re

In [53]:
def rm_tags(text):
    """Return `text` with every `<...>` markup tag removed.

    A tag is a '<', one or more characters that are not '>', and a closing '>'.
    """
    return re.sub(r'<[^>]+>', '', text)

In [51]:
def read_files(filetype):
    """
    Load the raw review texts and sentiment labels for one dataset split.

    filetype: 'train' or 'test' (sub-directory of ./aclImdb/)
    return:
    all_texts: list of review strings with markup tags removed
    all_labels: list of ints aligned with all_texts (1 = positive, 0 = negative)
    """
    path = r'./aclImdb/'
    all_texts = []
    all_labels = []
    # Positive reviews come first (label 1), then negative reviews (label 0).
    for sub_dir, label in (('/pos/', 1), ('/neg/', 0)):
        dir_path = path + filetype + sub_dir
        # os.listdir order is arbitrary; sort so the sample order is reproducible.
        for file in sorted(os.listdir(dir_path)):
            with open(dir_path + file, encoding='utf-8') as f:
                all_texts.append(rm_tags(" ".join(f.readlines())))
            # One label per file actually read, so texts and labels always
            # have the same length even if the directory size changes.
            all_labels.append(label)
    return all_texts, all_labels

In [52]:
def preprocessing(train_texts, train_labels, test_texts, test_labels,
                  num_words=3800, maxlen=380):
    """
    Turn raw review texts into fixed-length integer sequences.

    train_texts / test_texts: lists of review strings
    train_labels / test_labels: lists of 0/1 labels
    num_words: vocabulary size passed to the Tokenizer; should match the
               `max_features` of the model's Embedding layer
    maxlen: length every sequence is padded/truncated to; should match the
            `maxlen` of the model's Embedding layer
    return: x_train, y_train, x_test, y_test
    """
    # The vocabulary is fitted on the training texts only, so the test set
    # does not influence the word index.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_texts)
    # Replace every review by the list of its word indices.
    x_train_seq = tokenizer.texts_to_sequences(train_texts)
    x_test_seq = tokenizer.texts_to_sequences(test_texts)
    # Bring every sequence to exactly `maxlen` entries.
    x_train = sequence.pad_sequences(x_train_seq, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=maxlen)
    y_train = np.array(train_labels)
    y_test = np.array(test_labels)
    return x_train, y_train, x_test, y_test

### RNN模型

In [210]:
def RNN(maxlen = 380, max_features = 3800, embed_size = 32):
    """Build the (uncompiled) single-direction SimpleRNN sentiment classifier.

    maxlen: length of each input sequence
    max_features: vocabulary size of the Embedding layer
    embed_size: dimension of each word embedding
    return: a Sequential model ending in one sigmoid unit
    """
    return Sequential([
        # Word indices -> dense vectors.
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        # Only the final hidden state of the recurrent layer is passed on.
        SimpleRNN(16),
        Dense(256, activation='relu'),
        Dropout(0.5),
        # Single sigmoid output for the binary label.
        Dense(1, activation='sigmoid'),
    ])

In [211]:
# Instantiate the plain RNN with its default hyper-parameters (used here to inspect its summary).
model = RNN()

In [212]:
# Print the layer-by-layer output shapes and parameter counts of the current `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 380, 32)           121600    
_________________________________________________________________
dropout_63 (Dropout)         (None, 380, 32)           0         
_________________________________________________________________
simple_rnn_35 (SimpleRNN)    (None, 16)                784       
_________________________________________________________________
dense_36 (Dense)             (None, 256)               4352      
_________________________________________________________________
dropout_64 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 1)                 257       
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
_________________________________________________________________


### BRNN模型

In [213]:
def BRNN(maxlen = 380, max_features = 3800, embed_size = 32):
    """Build the (uncompiled) bidirectional SimpleRNN sentiment classifier.

    maxlen: length of each input sequence
    max_features: vocabulary size of the Embedding layer
    embed_size: dimension of each word embedding
    return: a Sequential model ending in one sigmoid unit
    """
    # The recurrent layer keeps its output at every time step; forward and
    # backward outputs are merged with merge_mode='concat'.
    recurrent = Bidirectional(SimpleRNN(16, return_sequences=True), merge_mode='concat')
    return Sequential([
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        recurrent,
        Dropout(0.5),
        # Collapse the (time, features) axes into one vector for the classifier.
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])

In [214]:
# Instantiate the bidirectional RNN with its default hyper-parameters (used here to inspect its summary).
model = BRNN()

In [215]:
# Print the layer-by-layer output shapes and parameter counts of the current `model`.
# NOTE(review): the recorded output shows the bidirectional layer as (None, 380, 16);
# with merge_mode='concat' and 16 units per direction one would expect a width of 32.
# The saved output may come from an earlier version of BRNN - re-run to confirm.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_33 (Embedding)     (None, 380, 32)           121600    
_________________________________________________________________
dropout_65 (Dropout)         (None, 380, 32)           0         
_________________________________________________________________
bidirectional_29 (Bidirectio (None, 380, 16)           1568      
_________________________________________________________________
dropout_66 (Dropout)         (None, 380, 16)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 6080)              0         
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 6081      
Total params: 129,249
Trainable params: 129,249
Non-trainable params: 0
_________________________________________________________________


### DBRNN

In [216]:
def DBRNN(maxlen = 380, max_features = 3800, embed_size = 32):
    """Build the (uncompiled) deep bidirectional RNN sentiment classifier.

    A bidirectional SimpleRNN that emits every time step feeds a second,
    single-direction SimpleRNN.

    maxlen: length of each input sequence
    max_features: vocabulary size of the Embedding layer
    embed_size: dimension of each word embedding
    return: a Sequential model ending in one sigmoid unit
    """
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        # return_sequences=True so the next recurrent layer receives a full sequence.
        Bidirectional(SimpleRNN(16, return_sequences=True), merge_mode='concat'),
        # Second recurrent layer; only its final hidden state is passed on.
        SimpleRNN(8),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(stack)

In [217]:
# Instantiate the deep bidirectional RNN with its default hyper-parameters (used here to inspect its summary).
model = DBRNN()

In [218]:
# Print the layer-by-layer output shapes and parameter counts of the current `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 380, 32)           121600    
_________________________________________________________________
dropout_67 (Dropout)         (None, 380, 32)           0         
_________________________________________________________________
bidirectional_30 (Bidirectio (None, 380, 32)           1568      
_________________________________________________________________
simple_rnn_38 (SimpleRNN)    (None, 8)                 328       
_________________________________________________________________
dropout_68 (Dropout)         (None, 8)                 0         
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 9         
Total params: 123,505
Trainable params: 123,505
Non-trainable params: 0
_________________________________________________________________


In [221]:
# Choose the architecture that the cells below compile, train and evaluate.
# Presumably this line was swapped between RNN(), BRNN() and DBRNN() to produce
# the three result printouts at the end of the notebook - TODO confirm.
model = BRNN()

In [222]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as an extra metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

### 处理数据

In [57]:
# Unpack the dataset archive only when the extracted folder is not there yet.
if not os.path.exists('./aclImdb'):
    # 'r:gz' opens the archive for reading with gzip decompression.
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only run this on an archive obtained from a trusted source.
    with tarfile.open(r'./aclImdb_v1.tar.gz', 'r:gz') as tfile:
        tfile.extractall('./')  # extract into the current directory

In [58]:
# Read both splits from disk and convert them to padded integer sequences.
train_texts, train_labels = read_files('train')
test_texts, test_labels = read_files('test')
x_train, y_train, x_test, y_test = preprocessing(train_texts, train_labels, test_texts, test_labels)
# read_files returns every positive review followed by every negative one, and
# Keras takes `validation_split` from the END of the arrays before shuffling.
# Without this shuffle the validation slice would contain negative reviews only.
# A fixed seed keeps the split identical across re-runs.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_train, y_train = x_train[shuffle_idx], y_train[shuffle_idx]

### 引入EarlyStopping，当验证集损失（val_loss）连续5个epoch不再改善时停止训练

In [219]:
# Early-stopping callback: watches the validation loss ('val_loss') with a patience of 5 epochs.
es = EarlyStopping(monitor='val_loss', patience=5)

### 训练模型

In [223]:
# Training hyper-parameters.
batch_size = 64
epochs = 20  # upper bound; the early-stopping callback may end training sooner
# Train the compiled model, holding out 10% of the training data for validation.
# NOTE(review): Keras takes the validation slice from the end of x_train/y_train
# before shuffling - make sure those arrays are not ordered by class.
model.fit(x_train, y_train,
          validation_split=0.1,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[es],
          shuffle=True)

Train on 22500 samples, validate on 2500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<keras.callbacks.History at 0x1db4291eda0>

### 在测试集上评估模型

In [224]:
# Evaluate the trained model on the test set; the cells below read scores[0] as the loss
# and scores[1] as the accuracy.
scores = model.evaluate(x_test, y_test)



In [64]:
# NOTE(review): `scores` holds the result of the most recent model.evaluate call, so the
# 'RNN' label is only correct if the model trained above was built with RNN().
print('RNN:test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

RNN:test_loss: 0.594139, accuracy: 0.853720


In [133]:
# NOTE(review): `scores` holds the result of the most recent model.evaluate call, so the
# 'BRNN' label is only correct if the model trained above was built with BRNN().
print('BRNN:test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

BRNN:test_loss: 0.371344, accuracy: 0.867480


In [122]:
# NOTE(review): `scores` holds the result of the most recent model.evaluate call, so the
# 'DBRNN' label is only correct if the model trained above was built with DBRNN().
print('DBRNN:test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

DBRNN:test_loss: 0.392413, accuracy: 0.851440
