说明：这里使用IMDB电影评论数据集，该数据集包含50000条训练和测试数据，共3个字段：id、sentiment、review。sentiment字段是label标签，review是文本评价内容；每部电影的评论最多不超过30条。sentiment的标注标准是：影片评分低于5分的评论记为0，评分不低于7分的评论记为1。这里的关键是找到一种合适的文本内容表示方法。常用的有词袋模型，但情感分析依赖前后文的语义联系，单纯使用词袋模型是不够的，故而采用词向量表示法，同时结合双向RNN来实现（注意：下文代码目前使用的是两层堆叠的单向LSTM）。

* id - Unique ID of each review
* sentiment - Sentiment of the review; 1 for positive reviews and 0 for negative reviews
* review - Text of the review

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def loadata(file, train=True):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        file: Path (or any other source ``pd.read_csv`` accepts) of the TSV file.
        train: Marks the call site as loading the training or the test split.
            Both values are parsed in exactly the same way.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with a tab delimiter.
    """
    frame = pd.read_csv(file, delimiter='\t')
    return frame

In [3]:
# Load the labelled training file and the test file (both tab-separated).
# The paths are relative to the directory the notebook is run from.
data_train = loadata('./数据集/labeledTrainData.tsv')
data_test = loadata('./数据集/testData.tsv', train=False)

对比词向量的表示，这边打算尝试两种方法：

1. 从头开始训练，更加贴近任务本身；
2. 使用预训练好的word2vec词向量，并在本任务上微调；

两者做一个比较！

In [4]:
# Review text is the model input; the sentiment column (1 = positive, 0 = negative) is the label.
x_train = data_train['review']
y_train = data_train['sentiment']

# Only the review text is taken from the test frame here.
x_test = data_test['review']

In [5]:
from collections import Counter
import re

In [6]:
def build_vocab(x_train, x_test, max_features):
    """Build word <-> index lookup tables from the train and test reviews.

    Each review is normalised before counting: ``<br /><br />`` becomes a
    space, the ``¨`` character is dropped, text is lower-cased, every run of
    digits is collapsed to ``9`` and every printable ASCII character that is
    not a letter or digit is removed. The result is split on whitespace, so
    no empty-string token is produced where punctuation removal leaves
    consecutive spaces.

    Args:
        x_train: pandas Series of training review strings.
        x_test: pandas Series of test review strings.
        max_features: Number of most frequent tokens to keep.

    Returns:
        A ``(word2idx, idx2word)`` tuple. ``word2idx`` maps ``'PAD'`` to 0,
        ``'UNK'`` to 1 and the kept tokens to 2, 3, ... in descending order
        of frequency; ``idx2word`` is the inverse mapping.
    """
    # Raw strings avoid the invalid "\d" escape; compiled once, outside the loop.
    digit_run = re.compile(r"\d+")
    ascii_punct = re.compile(r"((?=[\x21-\x7e]+)[^A-Za-z0-9])")

    counter = Counter()
    for review in pd.concat([x_train, x_test], ignore_index=True):
        text = review.replace('<br /><br />', ' ').replace("¨", "").lower()
        text = digit_run.sub('9', text)
        text = ascii_punct.sub('', text)
        # split() without an argument skips the empty strings that
        # split(' ') would emit for consecutive spaces.
        counter.update(text.split())

    # Indices 0 and 1 are reserved for padding and out-of-vocabulary tokens.
    word2idx = {
        word: rank + 2
        for rank, (word, _) in enumerate(counter.most_common(max_features))
    }
    word2idx['PAD'] = 0
    word2idx['UNK'] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [7]:
def get_maxlen(x_train, x_test):
    """Return the token count of the longest review in train and test combined.

    Reviews are normalised before counting: ``<br /><br />`` becomes a space,
    the ``¨`` character is dropped, text is lower-cased, digit runs collapse
    to ``9`` and printable ASCII characters other than letters and digits are
    removed. Tokens are separated by whitespace; consecutive spaces left by
    punctuation removal do not count as extra (empty) tokens.

    Args:
        x_train: pandas Series of training review strings.
        x_test: pandas Series of test review strings.

    Returns:
        The largest number of tokens found in any single review, or 0 when
        both inputs are empty.
    """
    # Raw strings avoid the invalid "\d" escape; compiled once, outside the loop.
    digit_run = re.compile(r"\d+")
    ascii_punct = re.compile(r"((?=[\x21-\x7e]+)[^A-Za-z0-9])")

    def token_count(review):
        text = review.replace('<br /><br />', ' ').replace("¨", "").lower()
        text = digit_run.sub('9', text)
        text = ascii_punct.sub('', text)
        return len(text.split())

    reviews = pd.concat([x_train, x_test], ignore_index=True)
    return max((token_count(review) for review in reviews), default=0)

In [8]:
def vectorize(x_train, x_test, word2idx):
    """Convert every train and test review into a list of vocabulary indices.

    Reviews are normalised first: ``<br /><br />`` becomes a space, the ``¨``
    character is dropped, text is lower-cased, digit runs collapse to ``9``
    and printable ASCII characters other than letters and digits are removed.
    Tokens are separated by whitespace, so consecutive spaces left by
    punctuation removal do not produce empty-string tokens.

    Args:
        x_train: pandas Series of training review strings.
        x_test: pandas Series of test review strings.
        word2idx: Mapping from token to integer index. Tokens missing from it
            are encoded as ``word2idx['UNK']``.

    Returns:
        A list with one index list per review, training reviews first and
        test reviews after them.
    """
    # Raw strings avoid the invalid "\d" escape; compiled once, outside the loop.
    digit_run = re.compile(r"\d+")
    ascii_punct = re.compile(r"((?=[\x21-\x7e]+)[^A-Za-z0-9])")

    input_seqs = []
    for review in pd.concat([x_train, x_test], ignore_index=True):
        text = review.replace('<br /><br />', ' ').replace("¨", "").lower()
        text = digit_run.sub('9', text)
        text = ascii_punct.sub('', text)
        input_seqs.append([
            word2idx[word] if word in word2idx else word2idx['UNK']
            for word in text.split()
        ])
    return input_seqs

In [9]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras import layers, models, callbacks
import keras
import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Exhaustive grid search over vocabulary size, sequence length and model
# hyper-parameters. For every configuration the best validation accuracy is
# appended to `max_val_acc` and the configuration itself to `params`, in the
# order [max_features, maxlen, embeding_size, hidden_size1, hidden_size2,
# dropout1, dropout2, l1, l2, batch_size, epochs].
#
# NOTE(review): the full grid is 5*4*2*4*4*4*4*6*7*4*5 = 8,601,600 trainings.
# Shrink the value lists or sample from them before running this cell.
from itertools import product

max_val_acc = []
params = []

# With patience=50 this can only trigger in the 100- and 200-epoch runs.
early_stopping = callbacks.EarlyStopping(patience=50)

# The longest review length does not depend on any hyper-parameter: compute it once.
corpus_maxlen = get_maxlen(x_train, x_test)

# Value lists for the model/optimisation hyper-parameters, in recording order.
model_grid = [
    [128, 300],                                   # embeding_size
    [64, 128, 256, 512],                          # hidden_size1
    [64, 128, 256, 512],                          # hidden_size2
    [0.2, 0.3, 0.5, 0.8],                         # dropout1 (input dropout)
    [0.2, 0.3, 0.5, 0.8],                         # dropout2 (recurrent dropout)
    [0.001, 0.005, 0.01, 0.02, 0.05, 0.1],        # l1
    [0.001, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2],    # l2
    [100, 300, 500, 1000],                        # batch_size
    [10, 30, 50, 100, 200],                       # epochs
]

for max_features in [10000, 20000, 30000, 40000, 50000]:
    word2idx, idx2word = build_vocab(x_train, x_test, max_features)
    # word2idx already contains PAD and UNK, so its length is exactly the
    # number of distinct indices the embedding layer has to cover.
    vocab_size = len(word2idx)
    input_seqs = vectorize(x_train, x_test, word2idx)
    for maxlen in [100, 200, 500, 1000]:
        seqlen = min(corpus_maxlen, maxlen)
        X = pad_sequences(input_seqs, maxlen=seqlen)
        X_train, X_test = X[:len(x_train)], X[len(x_train):]
        # Fixed seed: every configuration is scored on the same validation rows.
        Xtrain, Xval, ytrain, yval = train_test_split(X_train, y_train, random_state=42)
        for (embeding_size, hidden_size1, hidden_size2, dropout1, dropout2,
             l1, l2, batch_size, epochs) in product(*model_grid):
            # Release the previous model's graph so memory does not grow
            # with every configuration.
            keras.backend.clear_session()

            model = models.Sequential()
            model.add(layers.Embedding(vocab_size, embeding_size, input_length=seqlen))
            model.add(layers.LSTM(hidden_size1, dropout=dropout1, recurrent_dropout=dropout2, return_sequences=True, kernel_regularizer=keras.regularizers.l1_l2(l1=l1, l2=l2)))
            model.add(layers.LSTM(hidden_size2, dropout=dropout1, recurrent_dropout=dropout2, kernel_regularizer=keras.regularizers.l1_l2(l1=l1, l2=l2)))
            model.add(layers.Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

            history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval), batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[early_stopping])
            # Older Keras reports 'val_acc'; newer releases report 'val_accuracy'.
            hist = history.history
            val_acc = hist['val_acc'] if 'val_acc' in hist else hist['val_accuracy']
            max_val_acc.append(max(val_acc))
            params.append([max_features, maxlen, embeding_size, hidden_size1, hidden_size2, dropout1, dropout2, l1, l2, batch_size, epochs])


In [None]:
# Rebuild the data pipeline and the model with the configuration that reached
# the highest validation accuracy during the grid search, then train it.
best_params = params[np.argmax(max_val_acc)]

# Entries are stored as [max_features, maxlen, embeding_size, hidden_size1,
# hidden_size2, dropout1, dropout2, l1, l2, batch_size, epochs].
(best_max_features, best_maxlen, best_embeding_size, best_hidden_size1,
 best_hidden_size2, best_dropout1, best_dropout2, best_l1, best_l2,
 best_batch_size, best_epochs) = best_params

word2idx, idx2word = build_vocab(x_train, x_test, best_max_features)
# word2idx already contains PAD and UNK, so its length is exactly the number
# of distinct indices the embedding layer has to cover.
vocab_size = len(word2idx)
input_seqs = vectorize(x_train, x_test, word2idx)

seqlen = min(get_maxlen(x_train, x_test), best_maxlen)
X = pad_sequences(input_seqs, maxlen=seqlen)
# X_test is prepared here, but no prediction is made in this cell.
X_train, X_test = X[:len(x_train)], X[len(x_train):]
# Fixed seed so that the train/validation split is reproducible.
Xtrain, Xval, ytrain, yval = train_test_split(X_train, y_train, random_state=42)

model = models.Sequential()
model.add(layers.Embedding(vocab_size, best_embeding_size, input_length=seqlen))
model.add(layers.LSTM(best_hidden_size1, dropout=best_dropout1, recurrent_dropout=best_dropout2, return_sequences=True, kernel_regularizer=keras.regularizers.l1_l2(l1=best_l1, l2=best_l2)))
model.add(layers.LSTM(best_hidden_size2, dropout=best_dropout1, recurrent_dropout=best_dropout2, kernel_regularizer=keras.regularizers.l1_l2(l1=best_l1, l2=best_l2)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(Xtrain, ytrain, validation_data=(Xval, yval), batch_size=best_batch_size, epochs=best_epochs, verbose=0, callbacks=[early_stopping])