# Deep Learning - Keras with Word2Vec

## Importing Necessary Libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned review data. (The former `df = pd.DataFrame()` placeholder was
# dead code: it was overwritten by read_csv on the very next line.)
df = pd.read_csv('data_capstone_2/nlp_reviews_cleaned.csv', encoding='utf-8')

# Binarise the rating: values <= 2 become 0, everything else becomes 1.
# np.where keeps the exact semantics of the original row-wise lambda
# (including for missing values, which fail the `<= 2` test and map to 1).
df['rating'] = np.where(df['rating'] <= 2, 0, 1)
print(df.rating.value_counts())

# Keep only the label and the text columns needed below.
df = df.drop(columns=['Unnamed: 0', 'customer', 'product', 'time', 'pos_feedback',
                      'neg_feedback'])

df.head(3)

1    12080
0     1192
Name: rating, dtype: int64


Unnamed: 0,rating,review_text,clean_text
0,1,Great Hoses Good USA company that stands behin...,great hose good usa company stand behind produ...
1,1,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,gilmour ply flexogen hose inch foot green high...
2,1,Very satisfied! It's probably one of the best ...,satisfied probably one best hose ever pro good...


In [3]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Turn every raw review into a list of lower-case, purely alphabetic,
# non-stop-word tokens. review_lines[i] corresponds to the i-th row of df.

# Neither the punctuation table nor the stop-word set depends on the review
# being processed, so build them once instead of once per review.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

review_lines = list()
lines = df['review_text'].values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    review_lines.append(words)

In [4]:
# Number of tokenised reviews; one entry was appended per value of df['review_text'].
len(review_lines)

13272

In [5]:
import gensim 

EMBEDDING_DIM = 100
# train word2vec model on the tokenised reviews; min_count=1 keeps every token,
# including one-off words.
# NOTE(review): `size=` and `model.wv.vocab` look like the gensim 3.x spellings;
# gensim 4 reportedly renamed them (`vector_size`, `wv.key_to_index`) -- confirm
# against the installed version before upgrading.
# NOTE(review): no seed is passed and workers=4, so the recorded similarity
# scores are presumably not exactly reproducible between runs -- TODO confirm.
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
# vocab size
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 44864


In [6]:
# save model in ASCII (word2vec) format
# binary=False requests the plain-text variant; the same file name is opened
# and parsed line by line further down to build `embeddings_index`.
filename = 'patio_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [7]:
# Try some of gensim's word2vec query helpers, starting with the nearest
# neighbours of a single word.

model.wv.most_similar('garden')  # a `topn` argument can be passed to limit the number of results

[('vegetable', 0.8768383264541626),
 ('beds', 0.8337284326553345),
 ('bed', 0.8252702951431274),
 ('gardens', 0.8229961395263672),
 ('gardenwell', 0.8200172185897827),
 ('tilled', 0.812398374080658),
 ('flower', 0.8111097812652588),
 ('convenientmost', 0.8109019994735718),
 ('potted', 0.8022102117538452),
 ('global', 0.8003907799720764)]

In [8]:
# Analogy-style query in the spirit of "king - man + woman": words that score
# highly for 'garden' + 'lawn' while being pushed away from 'hose', using the
# `most_similar_cosmul` scoring variant.
model.wv.most_similar_cosmul(positive=['garden', 'lawn'], negative=['hose'])

[('trees', 1.2877529859542847),
 ('asspectracide', 1.2751572132110596),
 ('neardesert', 1.2589832544326782),
 ('sharon', 1.2445369958877563),
 ('bushes', 1.2419459819793701),
 ('weeds', 1.2363262176513672),
 ('graveled', 1.2342287302017212),
 ('gardens', 1.2331007719039917),
 ('shrubs', 1.2270678281784058),
 ('hemlock', 1.222316861152649)]

In [9]:
# Same analogy-style query ('garden' + 'lawn' - 'hose', in the spirit of
# "king - man + woman"), this time scored with the regular `most_similar`.
model.wv.most_similar(positive=['garden', 'lawn'], negative=['hose'])

[('bushes', 0.7804819941520691),
 ('gardens', 0.7714396715164185),
 ('shrubs', 0.768234133720398),
 ('weeds', 0.7635079622268677),
 ('trees', 0.7633467316627502),
 ('weed', 0.7562059760093689),
 ('growing', 0.7357085943222046),
 ('asspectracide', 0.7348328828811646),
 ('lowhanging', 0.7326929569244385),
 ('greening', 0.7302519679069519)]

In [10]:
# Odd word out among the four candidates.
# NOTE(review): the recorded output is 'hose' rather than the out-of-domain
# 'king'; presumably 'king' is too rare in this corpus to have a meaningful
# vector -- worth confirming before relying on this helper.
print(model.wv.doesnt_match("garden king lawn hose".split()))

hose


In [11]:
# Nearest neighbours of 'garden' via `similar_by_word`; the recorded output is
# identical to the earlier `most_similar('garden')` result.
model.wv.similar_by_word("garden")

[('vegetable', 0.8768383264541626),
 ('beds', 0.8337284326553345),
 ('bed', 0.8252702951431274),
 ('gardens', 0.8229961395263672),
 ('gardenwell', 0.8200172185897827),
 ('tilled', 0.812398374080658),
 ('flower', 0.8111097812652588),
 ('convenientmost', 0.8109019994735718),
 ('potted', 0.8022102117538452),
 ('global', 0.8003907799720764)]

In [12]:
# Similarity score between two individual words. Query through `model.wv` like
# every other lookup in this notebook: the model-level `similarity` shortcut is
# deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.similarity('garden', 'lawn'))

0.5919059


In [13]:
import os

# Read the word2vec text file back into a {word: vector} lookup table.
embeddings_index = {}
# `with` guarantees the handle is closed even if a line fails to parse.
with open('patio_embedding_word2vec.txt', encoding="utf-8") as f:
    # The word2vec text format starts with a "<vocab_size> <vector_size>" header
    # line; skip it so it is not stored as if it were a word.
    next(f, None)
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        # Parse the coefficients as numbers instead of keeping them as strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [14]:
# Positional hold-out split on the raw review text and labels.
# `.loc` slicing is label-based and includes the end label, so `:10000` selects
# labels 0 through 10000 and `10001:` selects the remainder.
# NOTE(review): X_train / X_test are not referenced again in the visible cells,
# and y_train / y_test are reassigned by the shuffled split in the padding cell
# below -- this cell looks like a leftover; confirm before relying on it.
X_train = df.loc[:10000, 'review_text'].values
y_train = df.loc[:10000, 'rating'].values
X_test = df.loc[10001:, 'review_text'].values
y_test = df.loc[10001:, 'rating'].values

In [15]:
# Length (in whitespace-separated words) of the longest raw review; used later
# as the common padded sequence length.
total_reviews = df.review_text.values
max_length = max(len(review.split()) for review in total_reviews)

In [16]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

VALIDATION_SPLIT = 0.2
# Fixed seed so that the shuffle (and therefore the train/test split and every
# metric reported below) is the same on each Restart & Run All.
SPLIT_SEED = 42

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# pad sequences to a common length
review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment =  df['rating'].values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

# split the data into a training set and a validation set
# A local RandomState leaves NumPy's global random state untouched.
indices = np.random.RandomState(SPLIT_SEED).permutation(review_pad.shape[0])
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

# Slice at an explicit boundary: the previous `[:-n]` / `[-n:]` form yields an
# empty training set and a full test set when n == 0.
split_at = review_pad.shape[0] - num_validation_samples
X_train_pad = review_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = review_pad[split_at:]
y_test = sentiment[split_at:]

Found 44864 unique tokens.
Shape of review tensor: (13272, 2263)
Shape of sentiment tensor: (13272,)


In [17]:
# Report the shapes of the four arrays produced by the train/test split.
for caption, tensor in (
    ('Shape of X_train_pad tensor:', X_train_pad),
    ('Shape of y_train tensor:', y_train),
    ('Shape of X_test_pad tensor:', X_test_pad),
    ('Shape of y_test tensor:', y_test),
):
    print(caption, tensor.shape)

Shape of X_train_pad tensor: (10618, 2263)
Shape of y_train tensor: (10618,)
Shape of X_test_pad tensor: (2654, 2263)
Shape of y_test tensor: (2654,)


In [18]:
# Build the (num_words, EMBEDDING_DIM) weight matrix for the Embedding layer:
# row i holds the word2vec vector of the word whose tokenizer index is i.
# EMBEDDING_DIM must match the dimension the word2vec model was trained with.
EMBEDDING_DIM = 100
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words must be
    # skipped as well (the previous `>` comparison would have let it through).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # words not found in the embedding index keep their all-zeros row

In [19]:
# Row count of the embedding matrix, computed above as len(word_index) + 1.
print(num_words)

44865


In [20]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant

# define model: frozen word2vec embeddings -> Conv1D -> MaxPooling1D -> Flatten -> sigmoid
# NOTE: this rebinds `model`, which held the gensim Word2Vec model until now;
# the word2vec query cells above cannot be re-run after this cell.
model = Sequential()
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model
# The held-out arrays are passed as validation data here and are also the ones
# used by the evaluation cell that follows.
model.fit(X_train_pad, y_train, batch_size=128, epochs=10, validation_data=(X_test_pad, y_test), verbose=2)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2263, 100)         4486500   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2259, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1129, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 144512)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 144513    
Total params: 4,695,141
Trainable params: 208,641
Non-trainable params: 4,486,500
_________________________________________________________________
None
Train on 10618 samples, validate on 2654 samples
Epoch 1/10
 - 263s - loss: 0.3001 - acc: 0.9023 - val_loss: 0.2724 - val

<keras.callbacks.History at 0x117ddf358>

In [21]:
# evaluate the model
# evaluate() yields the loss followed by the 'accuracy' metric requested in compile().
# NOTE(review): (X_test_pad, y_test) was also passed as validation_data during
# fit, so this is not an independent held-out estimate.
loss, accuracy = model.evaluate(X_test_pad, y_test, batch_size=128)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.449887


In [22]:
# Sanity-check the trained classifier on a few hand-written reviews.
# NOTE(review): these raw strings go straight into the Keras tokenizer, whereas
# the training text was first preprocessed with NLTK (including stop-word
# removal). `Tokenizer()` was created without an oov_token, so words missing
# from its vocabulary are presumably dropped silently -- confirm.

test_sample_1 = "This garden is beautiful!"
test_sample_2 = "Good hose!"
test_sample_3 = "Maybe I like this lawnmover."
test_sample_4 = "Not to my taste, will skip and use another hose"
test_sample_5 = "if you like fruits, then this product might be good for you."
test_sample_6 = "Bad rubber!"
test_sample_7 = "Not a good vegetable!"
test_sample_8 = "This hose really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
# Pad to the same length the network was built with (`max_length`) instead of
# repeating its value as a magic number that goes stale when the data changes.
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.9321882 ],
       [0.9743523 ],
       [0.4674715 ],
       [0.9235527 ],
       [0.69168115],
       [0.62217313],
       [0.9743523 ],
       [0.82867193]], dtype=float32)

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# define model: frozen word2vec embeddings -> GRU(32) -> sigmoid
# NOTE: rebinds `model`, replacing whatever model the name referred to before.
model = Sequential()
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2263, 100)         4486500   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 4,499,301
Trainable params: 12,801
Non-trainable params: 4,486,500
_________________________________________________________________
None


In [24]:
print('Train...')

# Train the currently bound `model` for 10 epochs with per-epoch log lines
# (verbose=2); (X_test_pad, y_test) serves as the validation data.
# The recorded log shows roughly 270-310 s per epoch.
model.fit(X_train_pad, y_train, batch_size=128, epochs=10, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Train on 10618 samples, validate on 2654 samples
Epoch 1/10
 - 276s - loss: 0.3230 - acc: 0.9033 - val_loss: 0.2871 - val_acc: 0.9130
Epoch 2/10
 - 271s - loss: 0.2884 - acc: 0.9100 - val_loss: 0.2733 - val_acc: 0.9130
Epoch 3/10
 - 308s - loss: 0.2734 - acc: 0.9098 - val_loss: 0.2601 - val_acc: 0.9133
Epoch 4/10
 - 289s - loss: 0.2585 - acc: 0.9094 - val_loss: 0.2563 - val_acc: 0.9145
Epoch 5/10
 - 281s - loss: 0.2478 - acc: 0.9097 - val_loss: 0.2548 - val_acc: 0.9148
Epoch 6/10
 - 281s - loss: 0.2435 - acc: 0.9103 - val_loss: 0.2466 - val_acc: 0.9160
Epoch 7/10
 - 280s - loss: 0.2411 - acc: 0.9121 - val_loss: 0.2447 - val_acc: 0.9167
Epoch 8/10
 - 274s - loss: 0.2370 - acc: 0.9130 - val_loss: 0.2426 - val_acc: 0.9160
Epoch 9/10
 - 280s - loss: 0.2309 - acc: 0.9123 - val_loss: 0.2421 - val_acc: 0.9164
Epoch 10/10
 - 294s - loss: 0.2302 - acc: 0.9116 - val_loss: 0.2420 - val_acc: 0.9175


<keras.callbacks.History at 0x1a438786a0>

In [25]:
# Score the current model on the held-out arrays and report loss and accuracy,
# the latter also as a percentage.
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.24200316827909876
Test accuracy: 0.9174830450001927
Accuracy: 91.75%


In [33]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out arrays; the sigmoid outputs
# are rounded to hard 0/1 labels before being compared with y_test.
y_pred = model.predict(X_test_pad)
print(classification_report(y_true=y_test, y_pred=y_pred.round()))

              precision    recall  f1-score   support

           0       0.77      0.07      0.13       231
           1       0.92      1.00      0.96      2423

   micro avg       0.92      0.92      0.92      2654
   macro avg       0.85      0.54      0.55      2654
weighted avg       0.91      0.92      0.89      2654

