In [1]:
'''Trains a LSTM on the Twitter sentiment classification task.
Notes:
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from __future__ import print_function
import csv
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.utils import to_categorical
import operator
import re

Using TensorFlow backend.


In [2]:
max_vocabulary = 5000 # 20000 would be better.
maxlen = 20  # Original 140. cut texts after this number of words (among top max_vocabulary most common words)
batch_size = 256

In [3]:
def preprocess_words(words):
    # Remove consecutive period symbols
    words = re.sub(r"[\. ][\. ]+", " . ", words)
    # Replace word+comma with word [space] comma
    words = re.sub(r",", " , ", words)
    # Replace word+parenthesis with word [space] parenthesis.
    # E.g., this(is) -> this ( is )
    words = re.sub(r"[\(\)]", " \1 ", words)
    return words.split()

def load_twitter(filename):
    labels = []
    tweets = []
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Column names = ItemID,Sentiment,SentimentSource,SentimentText
            labels.append(int(row['Sentiment']))
            words = row['SentimentText'].strip().lower()
            tweets.append(preprocess_words(words))
    return np.array(tweets), np.array(labels)

print('Loading data...')
tweets, labels = load_twitter('Sentiment_Analysis_Dataset.csv')
print('Loaded', len(labels), 'tweets')
print("First tweet: {}".format(' '.join(tweets[0])))
print("First tweet label: {}".format(labels[0]))

Loading data...
Loaded 1578614 tweets
First tweet: is so sad for my apl friend .
First tweet label: 0


In [4]:
# randomize tweets and create training/test sets
np.random.seed(1337)
rand_idx = np.random.permutation(len(labels))
rand_idx[:-10000]

array([1452430,  436092,  950568, ...,  144292, 1033477, 1548193])

In [5]:
# select last 10000 tweets as test set
tweets_training = tweets[rand_idx[:-10000]]
labels_training = labels[rand_idx[:-10000]]
tweets_test = tweets[rand_idx[-10000:]]
labels_test = labels[rand_idx[-10000:]]

In [6]:
def build_vocabulary(tweets):
    print('### Example tweets:')
    print(' '.join(tweets[0]))
    print(' '.join(tweets[1]))
    vocab = dict()
    for t in tweets:
        for word in t:
            if word.startswith('@'): # ignore twitter username
                continue
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1
    # sort vocabulary by count
    vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
    # keep only top max_vocabulary ones
    vocab = vocab[:max_vocabulary]
    vocab.append(('<unk>', 0))
    print('### Top 5 vocabs after sorting:')
    print(vocab[:5])
    return vocab

vocabulary = build_vocabulary(tweets_training)

### Example tweets:
i want food , more more food !
@sims2 i didn't . i'll download it tonight !
### Top 5 vocabs after sorting:
[('.', 2517683), ('!', 906643), ('i', 757581), ('to', 557162), ('the', 515735)]


In [7]:
def save_vocab(vocabulary):
    with open('vocab.txt', 'w') as vf:
        for v in vocabulary:
            vf.write(v[0])
            vf.write('\t')
            vf.write(str(v[1]))
            vf.write('\n')

save_vocab(vocabulary)

In [8]:
def create_vocab_index(vocab):
    vocab_idx = dict()
    v_id = 0
    for v in vocab:
        vocab_idx[v[0]] = v_id
        v_id += 1
    return vocab_idx

vocab_word_to_id = create_vocab_index(vocabulary)
vocab_id_to_word = [(idx,word) for (word,idx) in vocab_word_to_id.items()]

In [9]:
def transcode_words(sents, vocab_index):
    coded_words = [[vocab_index[w] if w in vocab_index else vocab_index['<unk>'] for w in words ] for words in sents]
    return coded_words

tweets_training_to_id = transcode_words(tweets_training, vocab_word_to_id)
tweets_test_to_id = transcode_words(tweets_test, vocab_word_to_id)

In [10]:
# must transcode from word to word_id first!!
print('Pad sequences (samples x time)')
tweets_training_to_id_padded = sequence.pad_sequences(tweets_training_to_id, maxlen=maxlen)
tweets_test_to_id_padded = sequence.pad_sequences(tweets_test_to_id, maxlen=maxlen)
print('features shape:', tweets_training_to_id_padded.shape)
# turn label to one-hot
labels_training_onehot = to_categorical(labels_training, num_classes=2)
labels_test_onehot = to_categorical(labels_test, num_classes=2)

Pad sequences (samples x time)
features shape: (1568614, 20)


In [11]:
def build_model():
    print('Building model...')
    model = Sequential()
    model.add(Embedding(max_vocabulary, 300))
    model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='sigmoid'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def train(model):
    print('Train...')
    model.fit(tweets_training_to_id_padded, labels_training_onehot,
              batch_size=batch_size,
              epochs=10,
              validation_data=(tweets_test_to_id_padded, labels_test_onehot))
    score, acc = model.evaluate(tweets_test_to_id_padded, labels_test_onehot, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

tweet_classify_model = build_model()
train(tweet_classify_model)
tweet_classify_model.save('tweet_model.pkl')

Building model...
Train...
Train on 1568614 samples, validate on 10000 samples
Epoch 1/10


InvalidArgumentError: indices[0,8] = 5000 is not in [0, 5000)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_1/embeddings/read, _recv_embedding_1_input_0)]]

Caused by op 'embedding_1/Gather', defined at:
  File "/home/ubuntu/miniconda3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/miniconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-3f04d28f9403>", line 24, in <module>
    tweet_classify_model = build_model()
  File "<ipython-input-11-3f04d28f9403>", line 4, in build_model
    model.add(Embedding(max_vocabulary, 300))
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/keras/models.py", line 422, in add
    layer(x)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/keras/engine/topology.py", line 554, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/keras/layers/embeddings.py", line 119, in call
    out = K.gather(self.embeddings, inputs)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 966, in gather
    return tf.gather(reference, indices)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1207, in gather
    validate_indices=validate_indices, name=name)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/ubuntu/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): indices[0,8] = 5000 is not in [0, 5000)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_1/embeddings/read, _recv_embedding_1_input_0)]]


In [None]:
import tensorflow as tf
print(tf.__version__)

from keras.models import load_model
pre_trained_model = load_model('tweet_model.pkl')
pre_trained_model.summary()