In [1]:
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import collections

import nltk
import numpy as np
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import codecs

# Seed NumPy's global random number generator so NumPy-driven randomness
# is repeatable from one run of the notebook to the next.
np.random.seed(42)

Using TensorFlow backend.


In [0]:
# Fetch the labelled sentiment training file into ./data.
# -p keeps mkdir from failing when the directory already exists, and -nc stops
# wget from saving a second copy (*.1) when this cell is run again.
!mkdir -p data
!wget -nc https://raw.githubusercontent.com/chen0040/keras-sentiment-analysis-web-api/master/demo/data/umich-sentiment-train.txt -P ./data

mkdir: cannot create directory ‘data’: File exists
--2018-11-05 05:31:38--  https://raw.githubusercontent.com/chen0040/keras-sentiment-analysis-web-api/master/demo/data/umich-sentiment-train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 447539 (437K) [text/plain]
Saving to: ‘./data/umich-sentiment-train.txt’


2018-11-05 05:31:38 (5.14 MB/s) - ‘./data/umich-sentiment-train.txt’ saved [447539/447539]



In [0]:
# Download and unpack the pre-trained GloVe 6B vectors into ./data.
# The archive is about 822 MB, so -nc skips the download when the zip is
# already present, and `unzip -n` keeps already-extracted files instead of
# stopping to ask whether to overwrite them (which would stall "Run All").
# The https URL is used directly; the http one only redirects to it.
!wget -nc https://nlp.stanford.edu/data/glove.6B.zip -P ./data
!unzip -n ./data/glove.6B.zip -d ./data

--2018-11-05 05:53:04--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-11-05 05:53:04--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘./data/glove.6B.zip’


2018-11-05 05:54:19 (11.1 MB/s) - ‘./data/glove.6B.zip’ saved [862182613/862182613]

Archive:  ./data/glove.6B.zip
  inflating: ./data/glove.6B.50d.txt  
  inflating: ./data/glove.6B.100d.txt  
  inflating: ./data/glove.6B.200d.txt  
  inflating: ./data/glove.6B.300d.txt  


In [0]:
# Tab-separated training file: one "<label>\t<sentence>" record per line.
INPUT_FILE = "./data/umich-sentiment-train.txt"
# Pre-trained GloVe vectors; the dimensionality of this file must equal EMBED_SIZE.
GLOVE_MODEL = "./data/glove.6B.100d.txt"
# Directory handed to the TensorBoard callback during training.
LOG_DIR = "./logs"
# Keep at most this many of the most frequent tokens in the vocabulary.
VOCAB_SIZE = 5000
# Length of each word vector, and therefore of each sentence feature vector.
EMBED_SIZE = 100
# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Number of training epochs passed to model.fit.
NUM_EPOCHS = 10

In [3]:
def _tokenize(sentence):
    """Return the lower-cased NLTK word tokens of ``sentence``.

    If the NLTK tokenizer data is missing (``LookupError``), it is downloaded
    and the tokenization is retried once.
    """
    try:
        tokens = nltk.word_tokenize(sentence)
    except LookupError:
        print("English tokenizer data is not downloaded yet; downloading it now.")
        nltk.download("punkt")
        tokens = nltk.word_tokenize(sentence)
    return [token.lower() for token in tokens]


# Read and tokenize the corpus in ONE pass. The token lists and labels are
# kept in memory so the file does not have to be re-read and re-tokenized
# when the word-id sequences are built further down.
print("reading data...")
counter = collections.Counter()   # token -> corpus frequency
tokenized, ys = [], []            # per-sentence token lists / integer labels
maxlen = 0                        # longest sentence, in tokens
with codecs.open(INPUT_FILE, "r", encoding="utf-8") as fin:
    for lineno, line in enumerate(fin, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        # Split on the FIRST tab only, so a tab inside the sentence is kept.
        label, sep, sent = line.partition("\t")
        if not sep:
            raise ValueError(
                "{}:{}: expected '<label><TAB><sentence>'".format(INPUT_FILE, lineno))
        words = _tokenize(sent)
        ys.append(int(label))
        tokenized.append(words)
        counter.update(words)
        maxlen = max(maxlen, len(words))

if not tokenized:
    raise ValueError("no labelled sentences found in {}".format(INPUT_FILE))


print("creating vocabulary...")
# Ids start at 1 in order of decreasing frequency; id 0 is shared by padding
# and out-of-vocabulary tokens. word2index stays a defaultdict(int) so that
# indexing it with an unseen word still yields 0.
word2index = collections.defaultdict(int)
for wid, (word, _count) in enumerate(counter.most_common(VOCAB_SIZE), start=1):
    word2index[word] = wid
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}
index2word[0] = "_UNK_"

print("creating word sequences...")
# .get() maps unseen words to 0 without inserting them into word2index.
ws = [[word2index.get(word, 0) for word in words] for words in tokenized]

# W: (num_sentences, maxlen) integer ids, padded with 0; Y: one-hot labels.
W = pad_sequences(ws, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

# Parse the GloVe text file: every record is a token followed by its
# whitespace-separated vector components.
print("loading GloVe vectors...")
word2emb = collections.defaultdict(int)
with codecs.open(GLOVE_MODEL, "r", encoding="utf-8") as glove_file:
    for record in glove_file:
        fields = record.strip().split()
        token, components = fields[0], fields[1:]
        # Store the vector as a float32 array keyed by its token.
        word2emb[token] = np.array(components, dtype="float32")


# Turn every padded id sequence into one fixed-size feature vector: the sum
# of the GloVe vectors of its tokens.
print("transferring embeddings...")
X = np.zeros((W.shape[0], EMBED_SIZE))
for i, wids in enumerate(W.tolist()):
    # One column per token position; columns stay all-zero for tokens that
    # have no GloVe vector.
    E = np.zeros((EMBED_SIZE, maxlen))
    for j in range(maxlen):
        # .get() instead of indexing: indexing the defaultdict would insert an
        # int 0 entry into word2emb for every token without a vector.
        vector = word2emb.get(index2word[wids[j]])
        if vector is not None:
            E[:, j] = vector
    X[i, :] = np.sum(E, axis=1)

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# partition the same on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.3, random_state=42)
# Report the shape of each of the four arrays on one line.
print(*(part.shape for part in (Xtrain, Xtest, Ytrain, Ytest)))

reading data...
creating vocabulary...
creating word sequences...
loading GloVe vectors...
transferring embeddings...
(4960, 100) (2126, 100) (4960, 2) (2126, 2)


In [4]:
# Feed-forward classifier over the sentence vectors: one 32-unit ReLU hidden
# layer, 20% dropout, and a 2-way softmax output.
model = Sequential([
    Dense(32, input_dim=EMBED_SIZE, activation="relu"),
    Dropout(0.2),
    Dense(2, activation="softmax"),
])
model.summary()

# Cross-entropy over the one-hot labels, optimised with Adam; accuracy is
# tracked alongside the loss.
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                3232      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 3,298
Trainable params: 3,298
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Train the model, logging to LOG_DIR through the TensorBoard callback.
# The held-out split serves as validation data during training and is the
# same split that is evaluated afterwards.
tensorboard = TensorBoard(LOG_DIR)
history = model.fit(
    Xtrain, Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(Xtest, Ytest),
    callbacks=[tensorboard],
)

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
# score[0] is the loss, score[1] the accuracy metric requested at compile time.
print("Test score: {:.3f}, accuracy: {:.3f}".format(*score[:2]))

Train on 4960 samples, validate on 2126 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.099, accuracy: 0.968
