In [4]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
# import contractions
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

  'Matplotlib is building the font cache using fc-list. '


In [3]:
tf.__version__

'1.8.0'

# Prepare Input Data

In [None]:
reviews_labeled = pd.read_feather('../data/reviews_labeled.feather')

In [19]:
reviews_labeled.shape

(136002, 2)

Note that we have an equal number of positive and negative labeled reviews:

In [37]:
reviews_labeled.is_positive.value_counts()

1    68001
0    68001
Name: is_positive, dtype: int64

In [20]:
reviews_labeled.iloc[0]        # sample record

text           This is nothing like Chipotle, the food taste ...
is_positive                                                    1
Name: 0, dtype: object

In [21]:
reviews_labeled.text.iloc[0]   # sample review text

"This is nothing like Chipotle, the food taste way better, the quality of the food is great. This is the perfect example of eating in a moms pops restaurant. The atmosphere is awesome, the service is great. If you are looking for good Mexican food this is the place to go, you will not be disappointed. I would go out of my way to eat here that's for sure. I ordered the veggie bowl, since I am vegetarian, best bowl ever. It last me two days."

We need to break down the text of each review from its present form, a single string, into an ordered list of words.  These words are the 'features' of each review, and must later be converted into a numerical representation that the neural network can work with.

## Replace Contractions

This can be considered to be an optional step.  This replaces common word contractions - e.g. `doesn't` is replaced by `does not`.  We use a module called `contractions` (easily installed via `pip install contractions`) that maintains a list of common English contractions and their corresponding expanded versions.

Here's a sample usage of the `contractions` module:


In [24]:
# `contractions` is a third-party package that is only needed for this optional
# step, so it is imported here at first use (the top-level import is commented
# out).  Without this import the cell fails with a NameError on a fresh kernel.
import contractions

# Short demonstration of what `contractions.fix` does to a sentence.
sentence = "He doesn't know how they've done the job.  I won't allow it."
print('Original sentence: ', sentence)
print('Fixed sentence:    ', contractions.fix(sentence))

Original sentence:  He doesn't know how they've done the job.  I won't allow it.
Fixed sentence:     He does not know how they have done the job.  I will not allow it.


In [22]:
# Expand contractions review by review and store the result next to the
# original text in a new `text_fixed` column.
reviews_labeled['text_fixed'] = reviews_labeled['text'].map(contractions.fix)

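Here's a sample comparison between an original review and the version with no contractions.  Words like `don't`, `couldn't`, `I've` have been expanded to their full forms.  Note that the expansion is not always correct: in the fixed version of this review, `burgers were huge` has become `burgers we are huge`.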

In [38]:
reviews_labeled.text.iloc[3]

"I don't normally give five stars unless everything was PERFECT, but I truly couldn't find a single thing to complain about! Service was great, burgers were huge and one of the best I've ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [39]:
reviews_labeled.text_fixed.iloc[3]

"I do not normally give five stars unless everything was PERFECT, but I truly could not find a single thing to complain about! Service was great, burgers we are huge and one of the best I have ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [46]:
# Keep only the contraction-free text and expose it under the original
# column name `text`.
reviews_labeled = (
    reviews_labeled
    .drop(columns=['text'])
    .rename(columns={'text_fixed': 'text'})
)

In [4]:
# reviews_labeled.to_feather('../data/reviews_labeled_no_contractions.feather')
reviews_labeled = pd.read_feather('../data/reviews_labeled_no_contractions.feather')

In [6]:
reviews_labeled = pd.read_pickle('../data/reviews_labeled_no_contractions.pkl')

In [8]:
reviews_labeled.tail()

Unnamed: 0,is_positive,text
135997,0,This used to be one of the better places for a...
135998,0,Ridiculous place. I am shocked this chain stay...
135999,0,Do not waste your time nor your money here.. T...
136000,0,Hard to be the king of burgers if you do not h...
136001,0,Terrible and Slow Service. Declining Food Qual...


In [16]:
# Randomly shuffle the dataframe.  The fixed seed makes the row order, and
# therefore everything derived from it further down, reproducible across
# kernel restarts.
reviews_labeled = reviews_labeled.sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
reviews_labeled.tail()

Unnamed: 0,is_positive,text
135997,1,This place is amazing!! From the atmosphere to...
135998,0,I am shocked this place is averaging as high a...
135999,0,Was looking for a breakfast place so we decide...
136000,1,"As Boulder City residents, we visit Coffee Cup..."
136001,1,"The wait is worth it! Friendly service,,, tons..."


In [19]:
pd.to_pickle(reviews_labeled, '../data/reviews_labeled_no_contractions_shuffled.pkl')

# Split Data into Training and Test Sets

At this point, we can perform an `80-20` split of the labeled dataset into training and test sets.  

In [20]:
# 80-20 split.  `random_state` makes the split reproducible; `stratify` keeps the
# positive/negative balance of the full dataset in both the training and test sets.
train, test = train_test_split(
    reviews_labeled,
    test_size=0.2,
    random_state=42,
    stratify=reviews_labeled.is_positive,
)

In [21]:
train.shape

(108801, 2)

In [22]:
test.shape

(27201, 2)

In [23]:
# Raw review strings of the training set, as a plain Python list.
text_train = train['text'].values.tolist()

# One-hot encode the target labels.  The encoder is fitted here, on the
# training labels, and reused as-is for the test labels.
onehot_encoder = OneHotEncoder(sparse=False)
sentiment_train = onehot_encoder.fit_transform(train['is_positive'].values.reshape(-1, 1))

# The dataframe is no longer needed once text and labels are extracted.
del train

In [24]:
# Raw review strings of the test set, as a plain Python list.
text_test = test['text'].values.tolist()

# One-hot encode the target labels with the encoder fitted on the training labels.
sentiment_test = onehot_encoder.transform(test['is_positive'].values.reshape(-1, 1))

# The dataframe is no longer needed once text and labels are extracted.
del test

# Tokenization

We use the built-in tokenizer from Keras to convert each review currently represented as a single text string, into a list of word tokens.

This blog provides a good explanation of the process:

http://www.developintelligence.com/blog/2017/06/practical-neural-networks-keras-classifying-yelp-reviews/

The crux is that Keras' tokenizer performs a 3-step process:

**Step 1**: Split text strings (reviews) into their constituent words. We specify the character to be used for splitting sentences; in this case, it will be the space character.

**Step 2**: Take all words split out from the sentences and rank them in the decreasing order of their counts.  So, the most common word will be ranked 1.

**Step 3**: Represent each word by its rank found in Step 2.  Here, we move from a string representation of each word to an integer representation.

Note that we also specify the maximum number of words we want to include in our vocabulary.  If we ask for our vocabulary size to be `n` words, then only the `n - 1` most common words (ranks `1` to `n - 1`) are kept when text is converted into integer sequences; the rest are removed from each review.


## Fitting and Transforming using the Tokenizer
The Keras tokenizer API performs two common preprocessing steps - lowercasing all words and removing punctuation - when it is fitted onto the training set.  Using the fitted tokenizer, we can convert any new text string into an equivalent list of tokens using the `texts_to_sequences` method.  Such a transformation of new text into tokens only uses the vocabulary known to the tokenizer.  Out-of-vocabulary words are ignored during the transformation of text to tokens.

**Important note:**

The `Tokenizer` starts assigning ids/indices to each word starting from `1`.  There is no word with id `0`.  This must be kept in mind when using the integer id for any word to index into a tensor/array of embeddings.

In [25]:
vocab_size = 10000    # the maximum size of our vocabulary

# Characters removed from the text before it is split into words.
punctuation_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

# Built-in Keras tokenizer: lowercases the text, strips the characters above
# and splits on the space character.
Tokenizer = tf.keras.preprocessing.text.Tokenizer
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters=punctuation_filter,
    lower=True,
    split=' ',
)

In [26]:
# Fit the tokenizer on reviews from the training set only, so that the
# vocabulary (word -> integer rank) is built without looking at the test reviews.
tokenizer.fit_on_texts(text_train)
# Convert each training review into a list of integer tokens, one list per
# review, in the same order as `text_train`.
text_train_intseq = tokenizer.texts_to_sequences(text_train)

In [28]:
# A sample showing the transformation of text_train into text_train_intseq
print('Text review: \n', text_train[0])
print('\nInteger sequence representation: \n', text_train_intseq[0])

Text review: 
 Cute and sad

The first time I went there the latte was really bitter, burnt tasting, but it is such an adorable place.  
I went back last week as I was close by and it is so cute and cool ( I love the building and the quirkiness).

Well, I am sad to say, this time the latte was bitter and had a burnt taste.

I am so sorry to say cannot go back. Quite a shame because I know they can make it work,  before they lose  too many customers. 
Btw: my friend had iced tea and said it was fine.

Integer sequence representation: 
 [835, 2, 740, 1, 111, 44, 3, 94, 36, 1, 1434, 6, 63, 2308, 809, 726, 21, 9, 7, 341, 68, 2875, 25, 3, 94, 41, 168, 391, 38, 3, 6, 381, 86, 2, 9, 7, 27, 835, 2, 456, 3, 82, 1, 1365, 2, 1, 93, 3, 60, 740, 5, 147, 15, 44, 1, 1434, 6, 2308, 2, 24, 4, 809, 170, 3, 60, 27, 567, 5, 147, 128, 48, 41, 383, 4, 1128, 79, 3, 148, 19, 80, 122, 9, 298, 146, 19, 2771, 81, 182, 374, 2009, 17, 309, 24, 1158, 400, 2, 166, 9, 6, 558]


In [29]:
word2index = tokenizer.word_index

`word2index` is a dict mapping each word (string) to its unique index (rank) assigned by the tokenizer.  For example, the id/index associated with the word `apple` can be found by:

In [30]:
word2index['apple']   # index assigned to the word 'apple'

1341

Next, we transform the reviews in the test data using the tokenizer fitted on the training reviews:

In [31]:
text_test_intseq = tokenizer.texts_to_sequences(text_test)

## Pre-trained Word Embeddings

We will use **GloVe** embeddings to represent words in the text.  Using pre-trained embeddings to represent words in a neural network involves the following steps:

1. Load the GloVe embeddings file. The file has a word on each line, followed by its embedding vector representation on the same line.
2. Only read lines corresponding to words in our pre-selected vocabulary.  In this case, we decided to restrict our vocabulary to the `10,000` most common words; hence, we will only read in word embeddings for words in `word2index` with index <= `10,000`.
3. The word embeddings we read in step 2 will be added to an embedding array in the row corresponding to that word's index number found from `word2index`.  (Minor detail: we'll subtract `1` from the index to get the correct row number in the embedding array because integer index assignment in the tokenizer starts from `1`, not `0`).

The GloVe embeddings can be downloaded from here:
https://nlp.stanford.edu/projects/glove/

In [32]:
emb_dim = 50    # we use 50-dimensional GloVe embeddings
# One row per vocabulary slot; rows for words that have no GloVe vector stay all-zero.
emb = np.zeros((vocab_size, emb_dim))

# The GloVe file is UTF-8 text and contains non-ASCII tokens, so the encoding is
# given explicitly instead of relying on the platform default.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # each line is: <word> <v1> <v2> ... <v50>
        content = line.strip().split(' ')
        word = content[0]                      # string representation
        index = word2index.get(word)           # tokenizer index, or None if the word is unknown
        if index is None or index > vocab_size:
            continue                           # not part of our restricted vocabulary
        # subtract 1 from index because tokenizer indexing started from 1
        emb[index - 1, :] = np.asarray(content[1:], dtype='float32')

In [33]:
emb.shape   # (number of words in vocabulary) x (embedding size)

(10000, 50)

#  Sequence Length

We need to calculate the length of each review, i.e. the number of tokens in each review.  This step is required for dealing with variable length sequences (since our reviews do not have the same length).

In [34]:
# Number of tokens in each training review, in the same order as `text_train_intseq`.
review_len_train = np.array([len(review) for review in text_train_intseq])

In [35]:
max(review_len_train)

118

In [36]:
min(review_len_train)    # we have some empty reviews

0

In [37]:
# Boolean mask that is True for every training review with at least one token.
to_keep_ind = review_len_train > 0

# Apply the mask to all relevant arrays so they stay aligned with each other.
# The token sequences have different lengths, so they are filtered as a plain
# Python list: building a NumPy array from ragged lists is deprecated and
# raises an error in recent NumPy versions.
text_train_intseq = [seq for seq, keep in zip(text_train_intseq, to_keep_ind) if keep]
sentiment_train = sentiment_train[to_keep_ind, :]
review_len_train = review_len_train[to_keep_ind]

In [38]:
min(review_len_train)    # sanity check: no empty reviews remain after filtering

1

In [39]:
# Number of tokens in each test review, in the same order as `text_test_intseq`.
review_len_test = np.array([len(review) for review in text_test_intseq])

In [40]:
max(review_len_test)

111

In [41]:
min(review_len_test)

0

In [42]:
# Boolean mask that is True for every test review with at least one token.
to_keep_ind = review_len_test > 0

# Apply the mask to all relevant arrays so they stay aligned with each other.
# The token sequences have different lengths, so they are filtered as a plain
# Python list: building a NumPy array from ragged lists is deprecated and
# raises an error in recent NumPy versions.
text_test_intseq = [seq for seq, keep in zip(text_test_intseq, to_keep_ind) if keep]
sentiment_test = sentiment_test[to_keep_ind, :]
review_len_test = review_len_test[to_keep_ind]

In [43]:
min(review_len_test)

1

# RNN Model

## Set up Constants

In [44]:
# number of passes of `n_batches` training steps made by the training loop
NUM_EPOCHS = 3
# number of reviews per padded batch
BATCH_SIZE = 128
# learning rate passed to the Adam optimizer
LEARNING_RATE = 0.001
NUM_OUTPUTS = 2      # size of the output layer: one unit per label (positive / negative)
NUM_NEURONS = 64      # number of units in each LSTM layer
NUM_LAYERS = 1       # number of stacked layers of recurrent units
DROPOUT_PROB = 0.3   # probability of dropout

# number of training reviews left after empty reviews were filtered out
input_len = len(text_train_intseq)

# TRIAL

In [45]:
tf.reset_default_graph()

# ---------------------------------------------------------------------------
# Set up data batching
# ---------------------------------------------------------------------------
def generator():
    """Yield one (word ids, one-hot label, review length) triple per training review.

    A fresh ``zip`` is created on every call.  A module-level ``zip`` object can
    only be consumed once; wrapping it in ``while True`` makes the generator
    spin forever without yielding after the first pass, which stalls training.
    Repetition across epochs is handled by ``dataset.repeat()`` below, which
    calls this function again for every new pass.
    """
    for word_ids, label, length in zip(text_train_intseq, sentiment_train, review_len_train):
        yield np.array(word_ids), label, length

# `from_generator` is a static method; no Dataset instance is needed.
dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.int32, tf.int32, tf.int32))
# word ids: variable length, labels: one-hot vector, sequence length: scalar
shapes = (tf.TensorShape([None]), tf.TensorShape([NUM_OUTPUTS]), tf.TensorShape(()))
dataset = dataset.padded_batch(BATCH_SIZE, padded_shapes=shapes)  # batch-level padding (pads with 0)
dataset = dataset.repeat()
# NOTE: `iter` shadows the Python builtin; the name is kept because the
# training cell refers to `iter.initializer`.
iter = dataset.make_initializable_iterator()
# Get word_ids (word_ids_batch), labels (y_batch), and sequence length for each batch
word_ids_batch, y_batch, seq_len_batch = iter.get_next()

batch_size = tf.shape(word_ids_batch)[0]
word_ids_batch = tf.reshape(word_ids_batch, [batch_size, -1])
y_batch = tf.reshape(y_batch, [batch_size, NUM_OUTPUTS])
seq_len_batch = tf.reshape(seq_len_batch, [-1])   # need a flattened list

print('word_ids_batch: ', word_ids_batch.get_shape())
print('y_batch: ', y_batch.get_shape())
print('seq_len_batch: ', seq_len_batch.get_shape())

# ---------------------------------------------------------------------------
# Embedding lookup
# ---------------------------------------------------------------------------
# In each batch, take word_ids and look up the corresponding word vectors.
# The tokenizer numbers words from 1, and `emb` stores the vector of the word
# with id `i` in row `i - 1`, so the ids are shifted by one before the lookup.
# Padding positions (id 0) are clamped to row 0; they lie beyond
# `seq_len_batch` and are therefore ignored by `dynamic_rnn`.
word_embeddings = tf.constant(emb, dtype=tf.float32)
embedding_rows = tf.maximum(word_ids_batch - 1, 0)
X_emb_batch = tf.nn.embedding_lookup(word_embeddings, embedding_rows)

# ---------------------------------------------------------------------------
# BUILD MODEL
# ---------------------------------------------------------------------------
# Defaults to 0.0 (no dropout); feed DROPOUT_PROB only while training.
dropout = tf.placeholder_with_default(0.0, shape=())

cells = []   # Create a stacked/multi-layered network
for _ in range(NUM_LAYERS):
    cell = tf.contrib.rnn.BasicLSTMCell(num_units=NUM_NEURONS)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=(1.0 - dropout))
    cells.append(cell)

cell = tf.contrib.rnn.MultiRNNCell(cells)

# Perform dynamic unrolling of the network.  The batched data flows thru this unrolled network.
print('X_emb_batch: ', X_emb_batch.get_shape())
outputs, _ = tf.nn.dynamic_rnn(cell, X_emb_batch, sequence_length=seq_len_batch, dtype=tf.float32)

# We're only interested in the output at the last *real* token of each review.
# `dynamic_rnn` emits zeros for time steps past `sequence_length`, so slicing
# `outputs[:, -1, :]` would return an all-zero vector for every review shorter
# than the longest one in the batch.  Instead, gather row (i, seq_len[i] - 1)
# for each review i.  The row count is derived from `seq_len_batch` so that the
# gather stays consistent with whatever batch flows through the graph.
last_step_ind = tf.stack([tf.range(tf.shape(seq_len_batch)[0]), seq_len_batch - 1], axis=1)
outputs_last_step = tf.gather_nd(outputs, last_step_ind)
logits = tf.contrib.layers.fully_connected(outputs_last_step, NUM_OUTPUTS, activation_fn=None)
predictions = tf.nn.softmax(logits)

# Define loss and optimizer
loss = tf.losses.softmax_cross_entropy(y_batch, logits)
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
train_op = optimizer.minimize(loss)

word_ids_batch:  (?, ?)
y_batch:  (?, 2)
seq_len_batch:  (?,)
X_emb_batch:  (?, ?, 50)


In [None]:
# Training the network
n_batches = input_len // BATCH_SIZE   # training steps per epoch


def _padded_test_batches(batch_size):
    """Yield (word ids, labels, lengths) NumPy batches covering the whole test set.

    Each batch is zero-padded to the length of its longest review, mirroring
    the batch-level padding applied to the training data.
    """
    for start in range(0, len(text_test_intseq), batch_size):
        stop = start + batch_size
        seqs = text_test_intseq[start:stop]
        lengths = review_len_test[start:stop].astype(np.int32)
        padded = np.zeros((len(seqs), lengths.max()), dtype=np.int32)
        for row, seq in enumerate(seqs):
            padded[row, :len(seq)] = seq
        yield padded, sentiment_test[start:stop].astype(np.int32), lengths


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialize iterator with train data
    sess.run(iter.initializer)
    print('Training...')
    for i in range(NUM_EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            # Dropout is only active when its placeholder is fed; without this
            # feed the default of 0.0 is used and DROPOUT_PROB has no effect.
            _, loss_value = sess.run([train_op, loss], feed_dict={dropout: DROPOUT_PROB})
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))

    # Evaluate on the held-out test set.  The iterator only ever serves training
    # data, so the test batches are fed directly into the batch tensors (which
    # bypasses the iterator).  Dropout stays at its default of 0.0 here.
    test_loss_sum = 0.0
    n_test_reviews = 0
    for ids, labels, lengths in _padded_test_batches(BATCH_SIZE):
        batch_loss = sess.run(loss, feed_dict={word_ids_batch: ids,
                                               y_batch: labels,
                                               seq_len_batch: lengths})
        # `loss` is a per-batch mean; weight it by batch size so the smaller
        # final batch does not skew the overall average.
        test_loss_sum += batch_loss * len(lengths)
        n_test_reviews += len(lengths)
    print('Test Loss: {:.4f}'.format(test_loss_sum / n_test_reviews))

Training...





# Other things to try
1. Effect of punctuation, particularly `!` and `?`.  Right now, we've ignored all punctuation.

In [None]:
# need to confirm seq_len and that we're getting the last relevant output for loss calc (see Danijar's blog)
# look at review len and get rid of very small reviews

# what can impact slow training/low core usage?
# -- while True?  Try removing it and check