In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import contractions
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [3]:
tf.__version__

'1.8.0'

# Prepare Input Data

In [None]:
reviews_labeled = pd.read_feather('../data/reviews_labeled.feather')

In [19]:
reviews_labeled.shape

(136002, 2)

Note that we have an equal number of positive and negative labeled reviews:

In [37]:
reviews_labeled.is_positive.value_counts()

1    68001
0    68001
Name: is_positive, dtype: int64

In [20]:
reviews_labeled.iloc[0]        # sample record

text           This is nothing like Chipotle, the food taste ...
is_positive                                                    1
Name: 0, dtype: object

In [21]:
reviews_labeled.text.iloc[0]   # sample review text

"This is nothing like Chipotle, the food taste way better, the quality of the food is great. This is the perfect example of eating in a moms pops restaurant. The atmosphere is awesome, the service is great. If you are looking for good Mexican food this is the place to go, you will not be disappointed. I would go out of my way to eat here that's for sure. I ordered the veggie bowl, since I am vegetarian, best bowl ever. It last me two days."

We need to break each review down from its present form, a single string, into an ordered list of words.  These words are the 'features' of each review, and must later be converted into a numerical representation that the neural network can work with.

## Replace Contractions

This can be considered to be an optional step.  This replaces common word contractions - e.g. `doesn't` is replaced by `does not`.  We use a module called `contractions` (easily installed via `pip install contractions`) that maintains a list of common English contractions and their corresponding expanded versions.

Here's a sample usage of the `contractions` module:


In [24]:
sentence = "He doesn't know how they've done the job.  I won't allow it."
print('Original sentence: ', sentence)
print('Fixed sentence:    ', contractions.fix(sentence))

Original sentence:  He doesn't know how they've done the job.  I won't allow it.
Fixed sentence:     He does not know how they have done the job.  I will not allow it.


In [22]:
# create a new column with text that has contractions expanded
reviews_labeled['text_fixed'] = reviews_labeled.text.apply(contractions.fix)

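Here's a sample comparison between an original review and the version with no contractions.  Words like `don't`, `couldn't`, `I've` have been expanded to their full forms.  Note that the expansion is not perfect: in the output below, the ordinary word `were` in "burgers were huge" has been rewritten as `we are`.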

In [38]:
reviews_labeled.text.iloc[3]

"I don't normally give five stars unless everything was PERFECT, but I truly couldn't find a single thing to complain about! Service was great, burgers were huge and one of the best I've ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [39]:
reviews_labeled.text_fixed.iloc[3]

"I do not normally give five stars unless everything was PERFECT, but I truly could not find a single thing to complain about! Service was great, burgers we are huge and one of the best I have ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [46]:
# Replace the original `text` column with its contraction-expanded version.
# Guarded so that re-running the cell is a no-op: once `text_fixed` has been
# renamed to `text`, an unguarded drop would delete the expanded text itself.
if 'text_fixed' in reviews_labeled.columns:
    reviews_labeled = (
        reviews_labeled
        .drop(columns=['text'])
        .rename(columns={'text_fixed': 'text'})
    )

In [4]:
# reviews_labeled.to_feather('../data/reviews_labeled_no_contractions.feather')
reviews_labeled = pd.read_feather('../data/reviews_labeled_no_contractions.feather')

# Split Data into Training and Test Sets

At this point, we can perform an `80-20` split of the labeled dataset into training and test sets.  

In [5]:
# Fixed seed so the split (and every number reported further down) is reproducible;
# stratifying on the label keeps the 50/50 class balance in both partitions.
train, test = train_test_split(reviews_labeled, test_size=0.2, random_state=42,
                               stratify=reviews_labeled.is_positive)

In [6]:
train.shape

(108801, 2)

In [7]:
test.shape

(27201, 2)

In [8]:
# One-hot encode the training labels; the encoder is fitted here and reused for the test set
onehot_encoder = OneHotEncoder(sparse=False)
sentiment_train = onehot_encoder.fit_transform(train['is_positive'].values.reshape(-1, 1))
# Raw review strings, in the same row order as the labels
text_train = train['text'].values.tolist()
# The DataFrame is no longer needed once text and labels have been extracted
del train

In [None]:
# One-hot encode the test labels with the encoder fitted on the training labels
sentiment_test = onehot_encoder.transform(test['is_positive'].values.reshape(-1, 1))
# Raw review strings, in the same row order as the labels
text_test = test['text'].values.tolist()
# The DataFrame is no longer needed once text and labels have been extracted
del test

# Tokenization

We use the built-in tokenizer from Keras to convert each review, currently represented as a single text string, into a list of word tokens.

This blog provides a good explanation of the process:

http://www.developintelligence.com/blog/2017/06/practical-neural-networks-keras-classifying-yelp-reviews/

The crux is that Keras' tokenizer performs a 3-step process:

**Step 1**: Split text strings (reviews) into their constituent words. We specify the character to be used for splitting sentences; in this case, it will be the space character.

**Step 2**: Take all words split out from the sentences and rank them in the decreasing order of their counts.  So, the most common word will be ranked 1.

**Step 3**: Represent each word by its rank found in Step 2.  Here, we move from a string representation of each word to an integer representation.

Note that we also specify the maximum number of words we want to include in our vocabulary.  If we ask for our vocabulary size to be `n` words, then only the `n - 1` most common words (ranks `1` to `n - 1`) are kept; the rest are removed from each review.


## Fitting and Transforming using the Tokenizer
The Keras tokenizer API performs two common preprocessing steps - lowercasing all words and removing punctuation - when it is fitted on the training set.  Using the fitted tokenizer, we can convert any new text string into an equivalent list of tokens using the `texts_to_sequences` method.  Such a transformation of new text into tokens only uses the vocabulary known to the tokenizer.  Out-of-vocabulary words are ignored during the transformation of text to tokens.

**Important note:**

The `Tokenizer` starts assigning ids/indices to each word starting from `1`.  There is no word with id `0`.  This must be kept in mind when using the integer id for any word to index into a tensor/array of embeddings.

In [87]:
vocab_size = 10000    # the maximum size of our vocabulary
# import the built-in tokenizer from Keras
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, 
                                                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                                                  lower=True, split=' ')

In [88]:
# fit tokenizer on reviews from the training set
tokenizer.fit_on_texts(text_train)
# convert text into sequences of integer tokens
text_train_intseq = tokenizer.texts_to_sequences(text_train)

In [89]:
# A sample showing the transformation of text_train into text_train_intseq
print('Text review: \n', text_train[0])
print('\nInteger sequence representation: \n', text_train_intseq[0])

Text review: 
 They have a good kids menu but my nanny went here today for lunch and found hair in both her meal and in my daughter's food. That is enough for me to decide not to go there. They refunded her money but it grosses me out.  I feel very disappointed as this place is cool and right by the splash pad which makes it very convenient. Hair in one plate is accidental but hair in two plates is just unsanitary. Hair me once shame on you, hair me twice shame on me!

Integer sequence representation: 
 [19, 18, 4, 28, 431, 96, 21, 17, 95, 32, 316, 13, 123, 2, 289, 953, 14, 187, 196, 136, 2, 14, 17, 3751, 16, 23, 7, 259, 13, 52, 5, 1433, 10, 5, 48, 36, 19, 5333, 196, 282, 21, 9, 52, 43, 3, 279, 35, 184, 37, 15, 25, 7, 453, 2, 162, 88, 1, 6102, 813, 76, 415, 9, 35, 1387, 953, 14, 50, 373, 7, 21, 953, 14, 140, 682, 7, 40, 4729, 953, 52, 288, 1164, 26, 20, 953, 52, 437, 1164, 26, 52]


In [97]:
word2index = tokenizer.word_index

`word2index` is a dict mapping each word (string) to its unique index (rank) assigned by the tokenizer.  For example, the id/index associated with the word `apple` can be found by:

In [98]:
word2index['apple']   # index assigned to the word 'apple'

1316

Next, we transform the reviews in the test data using the tokenizer fitted on the training reviews:

In [None]:
text_test_intseq = tokenizer.texts_to_sequences(text_test)

## Pre-trained Word Embeddings

We will use **GloVe** embeddings to represent words in the text.  Using pre-trained embeddings to represent words in a neural network involves the following steps:

1. Load the GloVe embeddings file. The file has a word on each line, followed by its embedding vector representation on the same line.
2. Only read lines corresponding to words in our pre-selected vocabulary.  In this case, we decided to restrict our vocabulary to the `10,000` most common words; hence, we will only read in word embeddings for words in `word2index` with index <= `10,000`.
3. The word embeddings we read in step 2 will be added to an embedding array in the row corresponding to that word's index number found from `word2index`.  (Minor detail: we'll subtract `1` from the index to get the correct row number in the embedding array because integer index assignment in the tokenizer starts from `1`, not `0`).

The GloVe embeddings can be downloaded from here:
https://nlp.stanford.edu/projects/glove/

In [100]:
emb_dim = 50    # we use 50-dimensional GloVe embeddings
# Row (k - 1) holds the vector of the word with tokenizer index k.  Words that do not
# occur in the GloVe file keep an all-zero row.  float32 matches the dtype the RNN is
# built with (np.zeros would otherwise default to float64).
emb = np.zeros((vocab_size, emb_dim), dtype=np.float32)

# Read the file as UTF-8 explicitly instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        content = line.strip().split(' ')
        word = content[0]                      # string representation
        index = word2index.get(word)           # tokenizer index, or None if word is unknown
        if index is None or index > vocab_size:
            continue                           # not part of our restricted vocabulary
        # subtract 1 from index because tokenizer indexing started from 1
        emb[index - 1, :] = np.asarray(content[1:], dtype='float32')

In [101]:
emb.shape   # (number of words in vocabulary) x (embedding size)

(10000, 50)

#  Sequence Length

We need to calculate the length of each review, i.e. the number of tokens in each review.  This step is required for dealing with variable length sequences (since our reviews do not have the same length).

In [None]:
# Token count of every training review, in the same order as text_train_intseq
review_len = np.array([len(tokens) for tokens in text_train_intseq])

# RNN Model

## Set up Constants

In [126]:
len(text_train_intseq)

108801

In [127]:
text_train_intseq[0]

[19,
 18,
 4,
 28,
 431,
 96,
 21,
 17,
 95,
 32,
 316,
 13,
 123,
 2,
 289,
 953,
 14,
 187,
 196,
 136,
 2,
 14,
 17,
 3751,
 16,
 23,
 7,
 259,
 13,
 52,
 5,
 1433,
 10,
 5,
 48,
 36,
 19,
 5333,
 196,
 282,
 21,
 9,
 52,
 43,
 3,
 279,
 35,
 184,
 37,
 15,
 25,
 7,
 453,
 2,
 162,
 88,
 1,
 6102,
 813,
 76,
 415,
 9,
 35,
 1387,
 953,
 14,
 50,
 373,
 7,
 21,
 953,
 14,
 140,
 682,
 7,
 40,
 4729,
 953,
 52,
 288,
 1164,
 26,
 20,
 953,
 52,
 437,
 1164,
 26,
 52]

In [None]:
# TO DO
# 1. padding!!!

In [115]:
tf.reset_default_graph()

In [26]:
NUM_EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 0.001
NUM_OUTPUTS = 2      # output, positive or negative label
NUM_NEURONS = 64      # number of neurons in each layer
NUM_LAYERS = 1       # number of stacked layers of recurrent units
DROPOUT_PROB = 0.3   # probability of dropout

In [27]:
# Placeholders.  These carry the *whole* data set into tf.data (they are sliced into
# individual reviews by Dataset.from_tensor_slices), so the leading dimension must be
# left open rather than pinned to BATCH_SIZE.
X = tf.placeholder(tf.int32, [None, None])            # num of reviews x num of tokens per review (variable)
y = tf.placeholder(tf.float32, [None, NUM_OUTPUTS])   # num of reviews x one-hot label

In [None]:
# Batch size used by the input pipeline.  It may be fed when the iterator is initialised
# (e.g. to evaluate with a different batch size); it falls back to BATCH_SIZE otherwise.
batch_size = tf.placeholder_with_default(tf.constant(BATCH_SIZE, dtype=tf.int64), shape=())

# Set up data batching.
# X is expected to be a dense integer matrix in which short reviews are padded with 0
# *after* their last token; id 0 is free because the tokenizer numbers words from 1.
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.shuffle(buffer_size=len(text_train_intseq))     # dataset small enough to hold in memory
# After slicing, each element is one review (a vector of ids) and one one-hot label.
dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [NUM_OUTPUTS]))
dataset = dataset.repeat()
# NOTE(review): `iter` shadows the Python builtin; the name is kept because the
# training cell refers to `iter.initializer`.
iter = dataset.make_initializable_iterator()
# Get word_ids and target labels corresponding to X and y provided to dataset for each batch
word_ids, targets = iter.get_next()

# Length of each review = number of non-padding (non-zero) ids in its row.
seq_len = tf.reduce_sum(tf.cast(tf.not_equal(word_ids, 0), tf.int32), axis=1)

# In each batch, take word_ids and look up corresponding word vector X_emb.
# `emb` stores the vector of word id k in row k - 1, so an all-zero row is prepended:
# row 0 then serves the padding id and row k serves word id k.
padding_row = np.zeros((1, emb.shape[1]), dtype=np.float32)
word_embeddings = tf.constant(np.vstack([padding_row, emb]).astype(np.float32))
X_emb = tf.nn.embedding_lookup(word_embeddings, word_ids)

# BUILD MODEL
dropout = tf.placeholder_with_default(0.0, shape=())  # allow applying dropout only while training

cells = []   # Create a stacked/multi-layered network
for _ in range(NUM_LAYERS):
    cell = tf.contrib.rnn.BasicLSTMCell(num_units=NUM_NEURONS)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=(1.0 - dropout))
    cells.append(cell)
# Stack every layer; passing only `cell` would silently use the last layer alone.
stacked_cell = tf.contrib.rnn.MultiRNNCell(cells)

# Perform dynamic unrolling of the network.  The batched data flows thru this unrolled network.
outputs, _ = tf.nn.dynamic_rnn(stacked_cell, X_emb, sequence_length=seq_len, dtype=tf.float32)
# We're only interested in the last *real* time step of each review.  Outputs past
# sequence_length are zeros, so outputs[:, -1, :] would be all-zero for every review
# shorter than the longest one in the batch.
last_step = tf.maximum(seq_len - 1, 0)
row_ids = tf.range(tf.shape(outputs)[0])
outputs_last_step = tf.gather_nd(outputs, tf.stack([row_ids, last_step], axis=1))
logits = tf.contrib.layers.fully_connected(outputs_last_step, NUM_OUTPUTS, activation_fn=None)
predictions = tf.nn.softmax(logits)

# Define loss and optimizer
loss = tf.losses.softmax_cross_entropy(targets, logits)
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
train_op = optimizer.minimize(loss)

In [None]:
# Training the network
n_batches = len(text_train_intseq) // BATCH_SIZE
n_test_batches = max(len(text_test_intseq) // BATCH_SIZE, 1)

# Reviews differ in length, but a placeholder needs a rectangular array: pad every
# review with trailing zeros up to the longest one (id 0 is never produced by the
# tokenizer).  Pass `maxlen=` here if the padded matrix becomes too large for memory.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialize iterator with train data
    sess.run(iter.initializer, feed_dict={X: pad_sequences(text_train_intseq, padding='post'),
                                          y: sentiment_train})
    print('Training...')
    for i in range(NUM_EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            # dropout defaults to 0.0 in the graph, so it has to be switched on explicitly
            _, loss_value = sess.run([train_op, loss], feed_dict={dropout: DROPOUT_PROB})
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))
    # initialise iterator with test data
    sess.run(iter.initializer, feed_dict={X: pad_sequences(text_test_intseq, padding='post'),
                                          y: sentiment_test})
    # Average over the test batches (dropout stays at its default of 0.0) rather than
    # reporting the loss of a single batch.
    test_loss = sum(sess.run(loss) for _ in range(n_test_batches)) / n_test_batches
    print('Test Loss: {:.4f}'.format(test_loss))

In [23]:
10//3

3

In [None]:
tf.train.batch()

In [None]:
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 100000



NUM_INPUTS = 1                 # number of channels/features
                
num_time_steps = len(X_train)  # number of steps to be unrolled at a time during truncated BPTT

learning_rate = 0.001 
num_epochs = 5000
num_layers = 2
dropout_prob = 0.3             # probability of dropout

In [119]:
# dataset = tf.data.Dataset.range(100)
# dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
dataset = dataset.padded_batch(4, padded_shapes=[None])

In [None]:
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

In [122]:
dataset

<PaddedBatchDataset shapes: (?, ?), types: tf.int64>

In [72]:
# Toy experiment: a tiny vocabulary to see which words the tokenizer keeps.
# NOTE(review): this rebinds `vocab_size` and `tokenizer`, the same names used by the
# main pipeline earlier in the notebook (vocab_size = 10000 and the tokenizer fitted on
# text_train).  Re-running earlier cells after this one will pick up these toy values.
vocab_size = 4    # how big we want our vocabulary to be
# import the built-in tokenizer from Keras
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, 
                                                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                                                  lower=True, split=' ')

In [73]:
toytexts = ["Is is a common word", "So is the", "the is common", "discombobulation is not common", 
            "? ? ? ? this ^%&% ^%&% ^%&% ^%&% ^%&% make sense 898 1 1 1 1 1 1"]
tokenizer.fit_on_texts(toytexts)

In [74]:
from itertools import chain

In [75]:
len(list(chain.from_iterable([sent.split(' ') for sent in toytexts])))

34

In [76]:
sequences = tokenizer.texts_to_sequences(toytexts)

In [77]:
sequences

[[2, 2, 3], [2], [2, 3], [2, 3], [1, 1, 1, 1, 1, 1]]

In [78]:
print(tokenizer.word_index)

{'1': 1, 'is': 2, 'common': 3, 'the': 4, 'a': 5, 'word': 6, 'so': 7, 'discombobulation': 8, 'not': 9, 'this': 10, 'make': 11, 'sense': 12, '898': 13, 'UNK': 14}


In [79]:
len(tokenizer.word_index)

14

In [84]:
res = tokenizer.texts_to_sequences(["this is a new sentence!", "Is it though ?", "COMMon!!! *&%* WORD", 'blah', '898'])

In [85]:
len(res)

5

In [86]:
res

[[2, 14, 14], [2, 14, 14], [3], [14], []]

In [83]:
tokenizer.word_counts

OrderedDict([('is', 5),
             ('a', 1),
             ('common', 3),
             ('word', 1),
             ('so', 1),
             ('the', 2),
             ('discombobulation', 1),
             ('not', 1),
             ('this', 1),
             ('make', 1),
             ('sense', 1),
             ('898', 1),
             ('1', 6)])

In [114]:
# Scratch: map every word in the GloVe file to its vector components (kept as strings).
temp = {}
# Read as UTF-8 explicitly rather than relying on the platform's default encoding.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        content = line.strip().split(' ')
        word = content[0]
        vec = content[1:]
        temp[word] = vec

In [9]:
len(word2id)

400000

In [10]:
len(word2emb)

400000

In [131]:
word2id["!"]

805

In [49]:
word2id["wont"]

58544

In [51]:
word2id["Word"]

KeyError: 'Word'

In [18]:
word2id['189087867']

KeyError: '189087867'

In [17]:
# Scratch: build word -> line number and word -> embedding (list of floats) lookups
# from a small sample file laid out as "<word> <v1> <v2> ..." on each line.
word2id = {}
word2emb = {}

# Read as UTF-8 explicitly rather than relying on the platform's default encoding.
with open('../data/sample.txt', 'r', encoding='utf-8') as f:
    for ind, line in enumerate(f):
        content = line.strip().split(' ')
        word = content[0]
        # Named `vec`, not `emb`: `emb` is the GloVe embedding matrix built earlier in
        # the notebook and consumed by the model, and must not be overwritten here.
        vec = content[1:]
        word2id[word] = ind
        word2emb[word] = [float(val) for val in vec]

In [18]:
word2id

{'"': 8,
 "'s": 9,
 ',': 1,
 '.': 2,
 'a': 7,
 'and': 5,
 'in': 6,
 'of': 3,
 'the': 0,
 'to': 4}

In [22]:
len(word2emb['"'])

50

# Other things to try
1. Effect of punctuation, particularly `!` and `?`.  Right now, we've ignored all punctuation.

In [None]:

# CONVERT OUTPUT TO list of binary tuples [0,1], [1,0], ...

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Toy labels as a column vector (one row per sample), which is the 2-D input the encoder expects
integer_encoded = np.array([0, 0, 0, 1, 1, 0, 1, 0, 1, 1]).reshape(-1, 1)
# Dense (non-sparse) one-hot output: one column per distinct label value
onehot_encoder = OneHotEncoder(sparse=False)
y_temp = onehot_encoder.fit_transform(integer_encoded)

In [17]:
y_temp

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [18]:
integer_encoded

array([[0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]])

In [20]:
new_array = np.array([1,0,0,0,1]).reshape(5, 1)
onehot_encoder.transform(new_array)

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]])