In [1]:
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
import contractions

  from ._conv import register_converters as _register_converters


In [2]:
# TensorFlow version this notebook was executed with
tf.__version__

'1.8.0'

# Prepare Input Data

In [11]:
# load the labeled reviews (columns: text, is_positive) from a Feather file
reviews_labeled = pd.read_feather('../data/reviews_labeled.feather')

In [19]:
# (number of reviews, number of columns)
reviews_labeled.shape

(136002, 2)

In [20]:
reviews_labeled.iloc[0]        # sample record

text           This is nothing like Chipotle, the food taste ...
is_positive                                                    1
Name: 0, dtype: object

In [21]:
reviews_labeled.text.iloc[0]   # sample review text

"This is nothing like Chipotle, the food taste way better, the quality of the food is great. This is the perfect example of eating in a moms pops restaurant. The atmosphere is awesome, the service is great. If you are looking for good Mexican food this is the place to go, you will not be disappointed. I would go out of my way to eat here that's for sure. I ordered the veggie bowl, since I am vegetarian, best bowl ever. It last me two days."

We need to break each review's text down from its present form, a single string, into an ordered list of words.  These words are the 'features' of each review, and must later be converted into a numerical representation that the neural network can work with.

## Replace Contractions

This step is optional.  It replaces common word contractions - e.g. `doesn't` is replaced by `does not`.  We use a module called `contractions` (easily installed via `pip install contractions`) that maintains a list of common English contractions and their corresponding expanded versions.

Here's a sample usage of the `contractions` module:


In [24]:
# Demonstrate `contractions.fix` on one sentence: show it before and after.
sentence = "He doesn't know how they've done the job.  I won't allow it."
expanded_sentence = contractions.fix(sentence)
print('Original sentence: ', sentence)
print('Fixed sentence:    ', expanded_sentence)

Original sentence:  He doesn't know how they've done the job.  I won't allow it.
Fixed sentence:     He does not know how they have done the job.  I will not allow it.


In [22]:
# add a `text_fixed` column holding each review with its contractions expanded
reviews_labeled['text_fixed'] = reviews_labeled['text'].apply(contractions.fix)

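Here's a sample comparison between an original review and the version with no contractions.  Words like `don't`, `couldn't`, `I've` have been expanded to their full forms.  The expansion is not perfect, though: in this review the word `were` ("burgers were huge") was wrongly rewritten as `we are`.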

In [38]:
# original review text, contractions intact
reviews_labeled.text.iloc[3]

"I don't normally give five stars unless everything was PERFECT, but I truly couldn't find a single thing to complain about! Service was great, burgers were huge and one of the best I've ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [39]:
# the same review after contractions were expanded
reviews_labeled.text_fixed.iloc[3]

"I do not normally give five stars unless everything was PERFECT, but I truly could not find a single thing to complain about! Service was great, burgers we are huge and one of the best I have ever had! $14.95 for a gigantic burger and fries in LV is very affordable! To start, they brought out a big ol' biscuit with honey butter sauce on top that was incredible!! Not long after, our burgers came out. Water was always full, we sat down right away, and it was air conditioned!! Definitely recommend!!"

In [46]:
# Replace the original `text` column with the contraction-expanded version and
# save the result.
#
# The swap is guarded and builds a new frame instead of using inplace=True so
# that re-running this cell is harmless.  Without the guard, a second run would
# drop the already-renamed `text` column (the rename then matches nothing) and
# write a file that contains no review text at all.
if 'text_fixed' in reviews_labeled.columns:
    reviews_labeled = (
        reviews_labeled
        .drop(['text'], axis=1)
        .rename(columns={'text_fixed': 'text'})
    )
reviews_labeled.to_feather('../data/reviews_labeled_no_contractions.feather')

In [38]:
def expand_contractions(text):
    """Return `text` with its contractions expanded.

    Thin wrapper: passes `text` to `contractions.fix` and returns whatever
    that call produces.
    """
    expanded_text = contractions.fix(text)
    return expanded_text




In [None]:
from multiprocessing import Pool


def bigramModel(review):
    """Return `bigram_model[review]` for a single review.

    NOTE(review): `bigram_model` is not defined in any visible cell of this
    notebook; it must already exist in the kernel before this cell is run.
    """
    return bigram_model[review]


# Map `bigramModel` over every entry of `tokenized_text` using 24 worker
# processes.  The `with` block shuts the workers down once `map` has returned;
# previously the pool was never closed, leaving the processes alive.
# NOTE(review): `tokenized_text` is likewise not defined in any visible cell.
with Pool(24) as p:
    processed_bigram_reviews = p.map(bigramModel, tokenized_text)

In [7]:
# Load spaCy's English pipeline, which is used below to tokenize text.
# The intended clean-up of reviews (no punctuation, "\n", etc.) is not done
# by this cell; it only loads the pipeline.
nlp = spacy.load('en')

In [20]:
# confirm what kind of object was loaded
nlp

<spacy.lang.en.English at 0x1160bcbe0>

In [29]:
# Tokenize a sentence containing contractions to see how spaCy splits them:
# "doesn't" becomes 'does' + "n't", "they've" becomes 'they' + "'ve", and
# "won't" becomes 'wo' + "n't".
doc = nlp("he doesn't know how they've done it.  I won't eat there.")
[token.text for token in doc]

['he',
 'does',
 "n't",
 'know',
 'how',
 'they',
 "'ve",
 'done',
 'it',
 '.',
 ' ',
 'I',
 'wo',
 "n't",
 'eat',
 'there',
 '.']

In [37]:
import contractions

In [36]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/bc/5a/918ec4c572801d817fc59a8a4adc1b8c3942213364877e847ee0956ab554/contractions-0.0.13-py2.py3-none-any.whl
[31mkaggle-cli 0.12.13 has requirement lxml<4.1,>=4.0.0, but you'll have lxml 4.1.0 which is incompatible.[0m
Installing collected packages: contractions
Successfully installed contractions-0.0.13


In [39]:
# `sample` was not defined by any earlier cell, so this cell only worked with
# leftover kernel state.  Rebuild it here from the sentence that was tokenized
# with spaCy earlier; its contraction-expanded form is the output shown below.
sample = contractions.fix("he doesn't know how they've done it.  I won't eat there.")
sample

'he does not know how they have done it.  I will not eat there.'

## Pre-trained Word Embeddings

We will use GloVe embeddings to represent words in the text.  Using pre-trained embeddings to represent words in a neural network involves 4 steps:

1. Load the GloVe embeddings file. The file has a word on each line, followed by its embedding vector representation on the same line.
2. Map each word in the GloVe file to its id, which is simply the zero-based line number of that word (the first word in the file gets id 0).  Save this mapping in `word2id`.
3. Map each word in the GloVe file to its embedding vector.  Save this mapping in `word2emb`.
4. For each review, we have the ordered list of words present in the review.  Use `word2id` to convert the list of words for each review into a list of integer word ids.  Since these ids correspond to the line number in the GloVe embeddings file, we can use them to perform a lookup of each word's embedding vector using TensorFlow's `embedding_lookup` function.

In [8]:
# Build two lookup tables from the GloVe file, which holds one word per line
# followed by the components of its embedding vector:
#   word2id  - word -> zero-based line number of that word in the file
#   word2emb - word -> embedding vector as a list of floats
word2id = {}
word2emb = {}

# The encoding is passed explicitly so the read does not depend on the
# platform's default encoding (the pre-trained GloVe files are UTF-8 text).
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for ind, line in enumerate(f):
        content = line.strip().split(' ')
        word = content[0]   # first field is the word itself
        emb = content[1:]   # remaining fields are the vector components
        word2id[word] = ind
        word2emb[word] = [float(val) for val in emb]

In [9]:
# number of distinct words that received an id
len(word2id)

400000

In [10]:
# number of distinct words that received an embedding vector
len(word2emb)

400000

In [34]:
# check that "wont" (no apostrophe) has an entry in the id table
word2id["wont"]

58544

In [32]:
# check that "wo", the first piece spaCy produced when splitting "won't", has an entry too
word2id["wo"]

1369

In [18]:
# a token with no entry in the table: this lookup raises KeyError
word2id['189087867']

KeyError: '189087867'

In [17]:
# Repeat the GloVe load on a small sample file to inspect the resulting tables.
# NOTE: this re-binds `word2id` and `word2emb`, so any tables built earlier
# under the same names are replaced by the ones from the sample file.
word2id = {}
word2emb = {}

# The encoding is passed explicitly so the read does not depend on the
# platform's default encoding.
# NOTE(review): assumes sample.txt is UTF-8 like the GloVe files - confirm.
with open('../data/sample.txt', 'r', encoding='utf-8') as f:
    for ind, line in enumerate(f):
        content = line.strip().split(' ')
        word = content[0]   # first field is the word itself
        emb = content[1:]   # remaining fields are the vector components
        word2id[word] = ind
        word2emb[word] = [float(val) for val in emb]

In [18]:
# show the whole word -> id table built from the sample file
word2id

{'"': 8,
 "'s": 9,
 ',': 1,
 '.': 2,
 'a': 7,
 'and': 5,
 'in': 6,
 'of': 3,
 'the': 0,
 'to': 4}

In [22]:
# number of components in one embedding vector (here, the one for '"')
len(word2emb['"'])

50