In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Show the installed TensorFlow version so the environment is on record.
tf.__version__

'1.8.0'

# Prepare Input Data

In [5]:
# Load the labeled reviews from a Feather file (path is relative to the
# notebook's working directory).
reviews_labeled = pd.read_feather('../data/reviews_labeled.feather')

In [6]:
# (number of rows, number of columns) of the loaded frame.
reviews_labeled.shape

(183253, 2)

In [7]:
# Peek at the first row to see the two columns: `text` and `is_positive`.
reviews_labeled.iloc[0]        # sample record

text           Schwartz's is a Montreal classic. If you menti...
is_positive                                                    1
Name: 0, dtype: object

In [8]:
# Display the full, untruncated text of the first review (the row preview
# above elides it).
reviews_labeled.text.iloc[0]   # sample review text

"Schwartz's is a Montreal classic. If you mention Montreal as a food city, Schwartz's should always pop up. \n\nEvery time I go to Schwartz's I get a medium/fatty smoked meat sandwich with a black cherry cola. Eating here is an experience of its own when the place is busy, expect to sit with people you don't know but just know that we're all enjoying the same great smoked meat that Schwartz's serves up!"

We need to convert the text of each review from its present form, a single string, into an ordered list of words.  These words are the 'features' of each review, and must later be converted into a numerical representation that the neural network can work with.

In [None]:
# TODO: use spaCy to clean up the reviews (remove punctuation, newlines, etc.) and produce a tokenized version of each one.


## Pre-trained Word Embeddings

We will use GloVe embeddings to represent words in the text.  Using pre-trained embeddings to represent words in a neural network involves 4 steps:

1. Load the GloVe embeddings file. The file has a word on each line, followed by its embedding vector representation on the same line.
2. Map each word in the GloVe file to its id, which is simply the zero-based line number of that word (the first word in the file gets id 0).  Save this mapping in `word2id`.
3. Map each word in the GloVe file to its embedding vector.  Save this mapping in `word2emb`.
4. For each review, we have the ordered list of words present in the review.  Use `word2id` to convert the list of words for each review into a list of integer word ids.  Since these ids correspond to the line number in the GloVe embeddings file, we can use them to perform a lookup of each word's embedding vector using TensorFlow's `embedding_lookup` function.

In [9]:
# Scratch check: a list of numeric strings stays a string array rather than
# being converted to numbers.
# NOTE(review): presumably this is why the embedding values are passed through
# float() when the GloVe file is parsed — confirm, or delete this cell.
np.asarray(['1', '2'])

array(['1', '2'],
      dtype='<U1')

In [23]:
# Build two lookups from the GloVe file, both keyed by word:
#   word2id  -> zero-based line number of the word in the file
#   word2emb -> the word's embedding vector as a list of floats
word2id = {}
word2emb = {}

# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default locale encoding.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for ind, line in enumerate(f):
        # Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
        content = line.strip().split(' ')
        word = content[0]
        if not word:
            # Blank line: skip it rather than registering an empty-string
            # token with an empty embedding. `ind` still tracks the physical
            # line number, so ids of the remaining words are unaffected.
            continue
        emb = content[1:]
        word2id[word] = ind
        # Values are read as text, so convert each component to a float.
        word2emb[word] = [float(val) for val in emb]

In [24]:
# Number of distinct words that received an id.
len(word2id)

400000

In [25]:
# Number of distinct words that received an embedding vector; should match
# len(word2id) since both dicts are filled in the same loop.
len(word2emb)

400000

In [27]:
# Spot-check: id (zero-based line number) assigned to the token 'word'.
word2id['word']

1388

In [17]:
# Same parsing as for the full GloVe file, but run against a small sample
# file so the resulting mappings are easy to inspect by eye.
# NOTE(review): this rebinds `word2id` / `word2emb`, so when the notebook is
# run top to bottom the mappings built from the full GloVe file are replaced
# by the sample ones. Confirm which of the two downstream cells should see.
word2id = {}
word2emb = {}

# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default locale encoding.
with open('../data/sample.txt', 'r', encoding='utf-8') as f:
    for ind, line in enumerate(f):
        # Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
        content = line.strip().split(' ')
        word = content[0]
        if not word:
            # Blank line: skip it rather than registering an empty-string
            # token with an empty embedding. `ind` still tracks the physical
            # line number, so ids of the remaining words are unaffected.
            continue
        emb = content[1:]
        word2id[word] = ind
        # Values are read as text, so convert each component to a float.
        word2emb[word] = [float(val) for val in emb]

In [18]:
# Display the whole word -> id mapping; only reasonable here because it was
# just rebuilt from the small sample file.
word2id

{'"': 8,
 "'s": 9,
 ',': 1,
 '.': 2,
 'a': 7,
 'and': 5,
 'in': 6,
 'of': 3,
 'the': 0,
 'to': 4}

In [22]:
# Spot-check the embedding dimensionality using the '"' token's vector.
len(word2emb['"'])

50