In [1]:
# Import the libraries
from keras.layers.core import Activation, Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linghuiwu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0

train = pd.read_excel("LSTM/train.xlsx")
train.head()
for index, line in train.iterrows():
    label = line["label"]
    sentence = line["sentence"]
    tokens = nltk.word_tokenize(sentence.lower())
    words = [token for token in tokens if token not in stopwords.words("english")]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        word_freqs[word] += 1
    num_recs += 1

print('max_len ', maxlen)
print('nb_words ', len(word_freqs))

Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x1a3fd0d200>
Traceback (most recent call last):
  File "/Users/linghuiwu/opt/anaconda3/lib/python3.7/site-packages/nltk/data.py", line 1281, in __del__
    self.close()
  File "/Users/linghuiwu/opt/anaconda3/lib/python3.7/site-packages/nltk/data.py", line 1310, in close
    self.stream.close()
KeyboardInterrupt


In [15]:
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
with open('LSTM/uci.txt', 'r+') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        tokens = nltk.word_tokenize(sentence.lower())
        words = [token for token in tokens if token not in stopwords.words('english')]
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            word_freqs[word] += 1
        num_recs += 1
print('max_len ', maxlen)
print('nb_words ', len(word_freqs))

# Prepare the data
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
word2index = {x[0]: i + 2 for i,
              x in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v: k for k, v in word2index.items()}
X = np.empty(num_recs, dtype=list)
y = np.zeros(num_recs)
i = 0
with open('LSTM/uci.txt', 'r+') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        seqs = []
        for word in words:
            if word in word2index:
                seqs.append(word2index[word])
            else:
                seqs.append(word2index["UNK"])
        X[i] = seqs
        y[i] = int(label)
        i += 1

X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Construct neural network
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE,
                    input_length=MAX_SENTENCE_LENGTH))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(Xtest, ytest))

# Prediction
score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("\nTest score: %.3f, accuracy: %.3f" % (score, acc))
print('{}   {}      {}'.format('预测', '真实', '句子'))
for i in range(5):
    idx = np.random.randint(len(Xtest))
    xtest = Xtest[idx].reshape(1, 40)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    sent = " ".join([index2word[x] for x in xtest[0] if x != 0])
    print(' {}      {}     {}'.format(int(round(ypred)), int(ylabel), sent))

# Application
INPUT_SENTENCES = ['I love reading.', 'You are so boring.']
XX = np.empty(len(INPUT_SENTENCES), dtype=list)
i = 0
for sentence in INPUT_SENTENCES:
    tokens = nltk.word_tokenize(sentence.lower())
    words = [token for token in tokens if token not in stopwords.words('english')]
    seq = []
    for word in words:
        if word in word2index:
            seq.append(word2index[word])
        else:
            seq.append(word2index['UNK'])
    XX[i] = seq
    i += 1

XX = sequence.pad_sequences(XX, maxlen=MAX_SENTENCE_LENGTH)
labels = [int(round(x[0])) for x in model.predict(XX)]
label2word = {1: 'Postive', 0: 'Negative'}
for i in range(len(INPUT_SENTENCES)):
    print('{}   {}'.format(label2word[labels[i]], INPUT_SENTENCES[i]))

max_len  80
nb_words  6561
Train on 3528 samples, validate on 883 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test score: 0.704, accuracy: 0.832
预测   真实      句子
 0      0     UNK that came with the salad was stale .
 1      1     one of the best mexican movies ever ! , and one of the less UNK , even by mexican themselves , no matter how UNK the should have felt with it .
 0      0     that 's a huge design flaw ( unless i 'm not using it UNK , which i do n't think is the case ) .
 1      1     mission impossible 3 was awesome .
 0      1     anne UNK was utterly convincing .
积极   I love reading.
消极   You are so boring.
