# Deep Learning - Word2Vec

In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [3]:
# Load the cleaned review data. pd.read_csv already returns a fresh frame, so
# no empty DataFrame has to be created beforehand.
df = pd.read_csv('data_capstone_2/nlp_reviews_cleaned.csv', delimiter=',', encoding='utf-8')

# Binarise the rating: values <= 2 become 0, everything else becomes 1.
# Expressed as "not (rating <= 2)" so that a value failing the comparison
# (e.g. NaN) still maps to 1, exactly like the former element-wise lambda.
df['rating'] = (~(df['rating'] <= 2)).astype('int64')
print(df.rating.value_counts())

# Drop the columns that are not used by the rest of the notebook.
df = df.drop(columns=['Unnamed: 0', 'customer', 'product', 'time', 'pos_feedback',
                      'neg_feedback'])

df.head(3)

1    12080
0     1192
Name: rating, dtype: int64


Unnamed: 0,rating,review_text,clean_text
0,1,Great Hoses Good USA company that stands behin...,great hose good usa company stand behind produ...
1,1,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,gilmour ply flexogen hose inch foot green high...
2,1,Very satisfied! It's probably one of the best ...,satisfied probably one best hose ever pro good...


In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Turn every raw review into a list of lower-cased, purely alphabetic tokens
# with English stop words removed; review_lines gets one token list per review.

# Neither the punctuation table nor the stop-word set depends on the review
# being processed, so both are built once instead of on every iteration.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

review_lines = list()
lines = df['review_text'].values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    # NOTE: punctuation characters are deleted, not replaced by a space, so the
    # pieces on either side of a mark inside a token are fused together.
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    review_lines.append(words)

In [5]:
# Number of tokenised reviews (one token list was appended per review text).
len(review_lines)

13272

In [6]:
# Preview only the first few token lists; displaying the whole corpus floods
# the notebook output with every token of every review.
review_lines[:3]

[['great',
  'hoses',
  'good',
  'usa',
  'company',
  'stands',
  'behind',
  'products',
  'warranty',
  'two',
  'hoses',
  'send',
  'replacements',
  'right',
  'one',
  'burst',
  'awhile',
  'could',
  'see',
  'buldge',
  'weeks',
  'went',
  'suprises',
  'one',
  'winter',
  'related',
  'bad',
  'leave',
  'time',
  'highly',
  'reccomend',
  'note',
  'hundred',
  'footer',
  'heavy',
  'like',
  'wresting',
  'anaconda',
  'time',
  'put',
  'away',
  'far',
  'reach'],
 ['gilmour',
  'flexogen',
  'hose',
  'green',
  'high',
  'quality',
  'ply',
  'hose',
  'good',
  'luck',
  'gilmour',
  'hoses',
  'past',
  'good',
  'choice',
  'hoses'],
 ['satisfied',
  'probably',
  'one',
  'best',
  'hoses',
  'ever',
  'hadpro',
  'good',
  'enough',
  'front',
  'yards',
  'small',
  'back',
  'yards',
  'enough',
  'flow',
  'right',
  'head',
  'water',
  'plants',
  'away',
  'nozzle',
  'water',
  'garden',
  'light',
  'weight',
  'flexible',
  'pose',
  'much',
  'probl

In [7]:
import gensim 

EMBEDDING_DIM = 100

# Fit a Word2Vec model on the tokenised reviews. min_count=1 keeps every token
# in the vocabulary, including those that occur only once.
# NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x spellings --
# confirm the installed gensim version before upgrading.
model = gensim.models.Word2Vec(
    sentences=review_lines,
    size=EMBEDDING_DIM,
    window=5,
    workers=4,
    min_count=1,
)

# Report how many distinct tokens the trained model knows.
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 44864


In [8]:
# Save the trained word vectors in the plain-text (non-binary) word2vec format
# under a fixed file name in the current working directory.
filename = 'patio_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [9]:
# Try some of gensim's word2vec utility functions, starting with the words
# closest to 'garden' in the learned embedding space.

model.wv.most_similar('garden')

[('vegetable', 0.8768568634986877),
 ('bed', 0.8264282941818237),
 ('treasures', 0.8245724439620972),
 ('depotwell', 0.8244673013687134),
 ('gardens', 0.8243735432624817),
 ('beds', 0.8168197870254517),
 ('cultivate', 0.8070510625839233),
 ('potted', 0.8069136142730713),
 ('diedyou', 0.8049212694168091),
 ('flower', 0.7979527711868286)]

In [10]:
# Word-vector arithmetic in the spirit of "king - man + woman", applied to this
# corpus: garden + lawn - hose, ranked with the multiplicative (cosmul) score.
model.wv.most_similar_cosmul(positive=['garden', 'lawn'], negative=['hose'])

[('lawnthis', 1.3675940036773682),
 ('trees', 1.3167675733566284),
 ('wateringnow', 1.313320279121399),
 ('agave', 1.31088387966156),
 ('gardens', 1.2859127521514893),
 ('providedi', 1.2753938436508179),
 ('divots', 1.265055537223816),
 ('bushes', 1.2624788284301758),
 ('shrubs', 1.2610939741134644),
 ('weeds', 1.2534018754959106)]

In [11]:
# Same garden + lawn - hose query (the "king - man + woman" pattern), this time
# ranked with most_similar instead of most_similar_cosmul.
model.wv.most_similar(positive=['garden', 'lawn'], negative=['hose'])

[('gardens', 0.783897340297699),
 ('bushes', 0.7665226459503174),
 ('shrubs', 0.7641263008117676),
 ('trees', 0.763062596321106),
 ('lawnthis', 0.7563513517379761),
 ('weeds', 0.7559521198272705),
 ('agave', 0.7450824975967407),
 ('downed', 0.724432110786438),
 ('backyard', 0.7189232110977173),
 ('growing', 0.7179781198501587)]

In [12]:
# Odd word out: ask the model which of the four words fits the group least.
print(model.wv.doesnt_match("garden king lawn hose".split()))

hose


In [13]:
# Neighbours of 'garden' via similar_by_word; the recorded output matches the
# earlier most_similar('garden') call.
model.wv.similar_by_word("garden")

[('vegetable', 0.8768568634986877),
 ('bed', 0.8264282941818237),
 ('treasures', 0.8245724439620972),
 ('depotwell', 0.8244673013687134),
 ('gardens', 0.8243735432624817),
 ('beds', 0.8168197870254517),
 ('cultivate', 0.8070510625839233),
 ('potted', 0.8069136142730713),
 ('diedyou', 0.8049212694168091),
 ('flower', 0.7979527711868286)]

In [14]:
# Cosine similarity between two words. Queried through the KeyedVectors object
# (model.wv) like every other lookup in this notebook; the model-level
# similarity() shortcut is deprecated.
print(model.wv.similarity('garden', 'lawn'))

0.5844543


In [15]:
import os

# Read the text-format word2vec file back into a {word: vector} dictionary.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open('patio_embedding_word2vec.txt', encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # blank line: nothing to parse
            continue
        if line_number == 0 and len(values) == 2:
            # The word2vec text format starts with a "<vocab_size> <dim>"
            # header; it is not a word vector and must not be stored as one.
            continue
        word = values[0]
        # Parse the coefficients as numbers instead of keeping them as strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [16]:
# Split by row label. .loc slicing includes both end points, so the first part
# holds every label up to and including 10000 and the second starts at 10001.
# NOTE(review): y_train / y_test are reassigned further down by the shuffled
# split of review_pad, and the visible model cells only consume the X_*_pad
# arrays -- confirm whether this positional split is still needed.
X_train = df.loc[:10000, 'review_text'].values
y_train = df.loc[:10000, 'rating'].values
X_test = df.loc[10001:, 'review_text'].values
y_test = df.loc[10001:, 'rating'].values

In [17]:
# Length, in whitespace-separated words, of the longest raw review; used below
# as the padded sequence length and as the model's input length.
# NOTE(review): measured on the raw text, while the padded sequences are built
# from the filtered review_lines tokens -- confirm this is intended.
total_reviews = df.review_text.values
max_length = max(len(review.split()) for review in total_reviews)

In [18]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

VALIDATION_SPLIT = 0.2
# Fixed seed so the shuffled train/validation split is identical on every run.
SPLIT_SEED = 42

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# pad sequences to a common length of max_length
review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment =  df['rating'].values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

# split the data into a training set and a validation set
indices = np.arange(review_pad.shape[0])
# A local, seeded generator keeps the shuffle reproducible without touching
# NumPy's global random state.
np.random.RandomState(SPLIT_SEED).shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])
# Slice at an explicit boundary: with zero validation samples, [:-0] would
# yield an empty training set and [-0:] the whole data set.
split_at = review_pad.shape[0] - num_validation_samples

X_train_pad = review_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = review_pad[split_at:]
y_test = sentiment[split_at:]

Found 44864 unique tokens.
Shape of review tensor: (13272, 2263)
Shape of sentiment tensor: (13272,)


In [19]:
# Show the shapes of the four arrays produced by the train/validation split.
shape_report = [
    ('Shape of X_train_pad tensor:', X_train_pad.shape),
    ('Shape of y_train tensor:', y_train.shape),
    ('Shape of X_test_pad tensor:', X_test_pad.shape),
    ('Shape of y_test tensor:', y_test.shape),
]
for caption, shape in shape_report:
    print(caption, shape)

Shape of X_train_pad tensor: (10618, 2263)
Shape of y_train tensor: (10618,)
Shape of X_test_pad tensor: (2654, 2263)
Shape of y_test tensor: (2654,)


In [20]:
EMBEDDING_DIM =100
# One row per tokenizer index plus one extra row, so the largest index in
# word_index is still a valid row of the matrix.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range; the guard therefore has to be ">=", not ">".
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # words not found in the embedding index keep their all-zero row

In [21]:
# Number of rows in the embedding matrix (len(word_index) + 1).
print(num_words)

44865


In [22]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant

# define model
# NOTE: from here on `model` refers to the Keras network; it replaces the
# Word2Vec model that was bound to the same name earlier in the notebook.
model = Sequential()
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model
# NOTE: the held-out split (X_test_pad, y_test) is used as validation data.
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2263, 100)         4486500   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2259, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1129, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 144512)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 144513    
Total params: 4,695,141
Trainable params: 208,641
Non-trainable params: 4,486,500
_________________________________________________________________
None
Train on 10618 samples, validate on 2654 samples
Epoch 1/25
 - 241s - loss: 0.3013 - acc: 0.9058 - val_loss: 0.2555 - val

<keras.callbacks.History at 0x1a256d67f0>

In [23]:
# evaluate the model
# NOTE: X_test_pad / y_test were also passed as validation_data during fit, so
# this score is measured on the same data the training log reported on.
loss, accuracy = model.evaluate(X_test_pad, y_test, batch_size=128)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.826677


In [24]:
#Let us test some  samples
# Hand-written reviews used as a quick sanity check of the trained classifier.

test_sample_1 = "This garden is beautiful!"
test_sample_2 = "Good hose!"
test_sample_3 = "Maybe I like this lawnmover."
test_sample_4 = "Not to my taste, will skip and use another hose"
test_sample_5 = "if you like fruits, then this product might be good for you."
test_sample_6 = "Bad rubber!"
test_sample_7 = "Not a good vegetable!"
test_sample_8 = "This hose really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

# Encode with the tokenizer fitted on the review corpus and pad to max_length,
# the same value the model was built with (input_length=max_length), instead
# of repeating the literal sequence length.
test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.97867787],
       [0.99898213],
       [0.40239662],
       [0.9764854 ],
       [0.7732555 ],
       [0.79503137],
       [0.99898213],
       [0.15101512]], dtype=float32)

In [25]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# define model
# NOTE: this rebinds `model`; the network built in the previous model cell is
# no longer reachable under that name afterwards.
model = Sequential()
# Frozen embedding layer initialised from the pre-trained word2vec matrix.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2263, 100)         4486500   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 4,499,301
Trainable params: 12,801
Non-trainable params: 4,486,500
_________________________________________________________________
None


In [26]:
print('Train...')

# Fit the current `model` on the padded training sequences; the held-out split
# (X_test_pad, y_test) is passed as validation data.
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Train on 10618 samples, validate on 2654 samples
Epoch 1/25
 - 379s - loss: 0.3215 - acc: 0.9053 - val_loss: 0.2845 - val_acc: 0.9145
Epoch 2/25
 - 365s - loss: 0.2904 - acc: 0.9090 - val_loss: 0.2739 - val_acc: 0.9145
Epoch 3/25
 - 365s - loss: 0.2811 - acc: 0.9094 - val_loss: 0.2616 - val_acc: 0.9145
Epoch 4/25
 - 400s - loss: 0.2686 - acc: 0.9090 - val_loss: 0.2516 - val_acc: 0.9137
Epoch 5/25
 - 380s - loss: 0.2614 - acc: 0.9106 - val_loss: 0.2488 - val_acc: 0.9148
Epoch 6/25
 - 383s - loss: 0.2555 - acc: 0.9088 - val_loss: 0.2445 - val_acc: 0.9148
Epoch 7/25
 - 383s - loss: 0.2482 - acc: 0.9100 - val_loss: 0.2489 - val_acc: 0.9137
Epoch 8/25
 - 370s - loss: 0.2429 - acc: 0.9120 - val_loss: 0.2405 - val_acc: 0.9175
Epoch 9/25
 - 358s - loss: 0.2396 - acc: 0.9115 - val_loss: 0.2377 - val_acc: 0.9171
Epoch 10/25
 - 377s - loss: 0.2370 - acc: 0.9126 - val_loss: 0.2366 - val_acc: 0.9167
Epoch 11/25
 - 358s - loss: 0.2336 - acc: 0.9148 - val_loss: 0.2343 - val_acc: 0.9171
Epoch

<keras.callbacks.History at 0x1a3a0afb00>

In [27]:
print('Testing...')
# evaluate() is unpacked into the loss ("score") and the accuracy metric
# configured in compile().
# NOTE: X_test_pad / y_test were also used as validation_data during fit.
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

# Same accuracy value again, formatted as a percentage.
print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.22464125733134796
Test accuracy: 0.9167294653628046
Accuracy: 91.67%


In [28]:
y_pred_model = model.predict(X_test_pad)