# Using Doc2Vec to classify movie reviews

## 1. Import packages

In [1]:
from tensorflow.contrib.tensorboard.plugins import projector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
import tensorflow as tf
import multiprocessing
import numpy as np
import os

## 2. Download the IMDB dataset

In [2]:
# Load the IMDB review dataset that ships with Keras. load_data() yields a
# (reviews, labels) pair for the training split and another for the test split.
imdb = tf.keras.datasets.imdb
(train_reviews, train_labels), (test_reviews, test_labels) = imdb.load_data()

In [3]:
# Peek at the first training review as loaded: a list of integer word ids.
train_reviews[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [4]:
# Label paired with the first training review.
train_labels[0]

1

## 3. Create a function to decode reviews

In [5]:
# Word -> id lookup. Ids 0-3 are claimed by the special tokens below, so every
# id coming from the Keras word index is shifted up by 3.
vocab = {word: idx + 3 for word, idx in imdb.get_word_index().items()}
vocab.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

In [6]:
# Sanity check: id assigned to "brilliant" after the +3 shift.
vocab["brilliant"]

530

In [7]:
# Inverse lookup (id -> word), used to turn encoded reviews back into text.
vocab_inv = {idx: word for word, idx in vocab.items()}

In [8]:
# Sanity check of the inverse mapping: which word has id 1048?
vocab_inv[1048]

'incredible'

In [9]:
def decode_review(review):
    """Convert an encoded review back into a list of word tokens.

    Args:
        review: Iterable of integer word ids.

    Returns:
        A list with one token per id, looked up in ``vocab_inv``; ids that are
        not present in ``vocab_inv`` are rendered as "?".
    """
    tokens = []
    for word_id in review:
        tokens.append(vocab_inv.get(word_id, "?"))
    return tokens

In [10]:
# Decode the first training review back into its word tokens.
decode_review(train_reviews[0])

['<START>',
 'this',
 'film',
 'was',
 'just',
 'brilliant',
 'casting',
 'location',
 'scenery',
 'story',
 'direction',
 "everyone's",
 'really',
 'suited',
 'the',
 'part',
 'they',
 'played',
 'and',
 'you',
 'could',
 'just',
 'imagine',
 'being',
 'there',
 'robert',
 "redford's",
 'is',
 'an',
 'amazing',
 'actor',
 'and',
 'now',
 'the',
 'same',
 'being',
 'director',
 "norman's",
 'father',
 'came',
 'from',
 'the',
 'same',
 'scottish',
 'island',
 'as',
 'myself',
 'so',
 'i',
 'loved',
 'the',
 'fact',
 'there',
 'was',
 'a',
 'real',
 'connection',
 'with',
 'this',
 'film',
 'the',
 'witty',
 'remarks',
 'throughout',
 'the',
 'film',
 'were',
 'great',
 'it',
 'was',
 'just',
 'brilliant',
 'so',
 'much',
 'that',
 'i',
 'bought',
 'the',
 'film',
 'as',
 'soon',
 'as',
 'it',
 'was',
 'released',
 'for',
 'retail',
 'and',
 'would',
 'recommend',
 'it',
 'to',
 'everyone',
 'to',
 'watch',
 'and',
 'the',
 'fly',
 'fishing',
 'was',
 'amazing',
 'really',
 'cried',
 'a

## 4. Learn embeddings for reviews

In [11]:
# Pool the train and test reviews so Doc2Vec learns a vector for each of them.
# Every review is decoded to words and tagged with its position in the pooled
# array (training reviews come first).
reviews = np.concatenate([train_reviews, test_reviews])
docs = [
    TaggedDocument(words=decode_review(token_ids), tags=[doc_idx])
    for doc_idx, token_ids in enumerate(reviews)
]

In [12]:
class Doc2VecCallback(CallbackAny2Vec):
    """Training callback that advances a Keras progress bar once per epoch."""

    def __init__(self, epochs):
        """Set up the progress bar.

        Args:
            epochs: Total number of training epochs; used as the bar's target.
        """
        self.epoch = 0
        self.prog_bar = tf.keras.utils.Progbar(target=epochs)

    def on_epoch_end(self, model):
        """Count one more finished epoch and redraw the bar (``model`` is unused)."""
        finished = self.epoch + 1
        self.epoch = finished
        self.prog_bar.update(finished)

In [13]:
# Learn one 100-dimensional vector per review. The corpus is handed straight to
# the constructor, and covers both train and test reviews (labels are not used).
d2v_model = Doc2Vec(
    docs,
    dm=0,  # per the gensim docs, any value other than 1 selects PV-DBOW
    vector_size=100,
    min_count=2,
    sample=0,
    hs=0,
    negative=5,
    epochs=100,
    workers=multiprocessing.cpu_count(),
    callbacks=[Doc2VecCallback(100)],  # progress-bar target; keep equal to `epochs`
)



In [14]:
# Matrix of learned document vectors; rows are indexed by the integer tags
# assigned when `docs` was built (training reviews first, then test reviews).
embdgs = d2v_model.docvecs.vectors_docs
# Split at the actual number of training reviews rather than a hard-coded
# 25000, so the train/test boundary stays correct if the dataset size changes.
train_embdgs, test_embdgs = np.split(embdgs, [len(train_reviews)])

In [15]:
# Inspect the learned embedding of the first training review.
train_embdgs[0]

array([ 0.08079641,  0.20569757,  0.4738193 ,  0.23749965,  0.06664906,
        1.2267363 , -0.70511824,  0.48151103, -0.55024695, -0.14436685,
       -0.23059061,  0.7129091 , -0.60188824,  0.5016063 ,  0.18376477,
       -0.5230938 ,  0.16004896, -0.18659687,  0.8274295 ,  0.04011085,
        0.03508369,  0.29871807,  0.12340536, -0.55743134,  0.06399595,
       -0.5479066 , -0.89346504, -0.615669  , -0.05332805,  0.28452045,
       -0.08361472, -0.82962734,  1.2487692 , -0.8348145 , -1.3827287 ,
       -0.32844827, -0.05866596, -0.20214   ,  0.8929514 , -0.50951415,
       -0.42142662,  0.2502974 , -0.5526857 , -0.01847663, -0.5334354 ,
       -0.44521442,  0.00903169,  0.09517114, -0.06399161,  0.21078157,
       -0.44145957,  0.79780304,  0.708781  ,  0.52510357,  0.6052623 ,
        0.14815222, -0.5089591 ,  0.20163493, -1.6821849 , -0.6525678 ,
       -0.20529775, -0.34921286, -0.91900027, -0.4330489 , -0.20630024,
        0.02228682, -1.0429921 ,  0.07120833,  0.13347925, -0.16

## 5. Visualize embeddings

In [16]:
# Output directory for the TensorBoard projector files.
embdgs_dir = "embdgs"
# makedirs(exist_ok=True) keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError once the directory exists from a previous run.
os.makedirs(embdgs_dir, exist_ok=True)
# Per-embedding labels and the checkpoint holding the embedding variable.
metadata_path = os.path.join(embdgs_dir, "metadata.tsv")
embdgs_path = os.path.join(embdgs_dir, "embdgs.ckpt")

In [17]:
# Write one label per embedding for the TensorBoard projector: the first 30
# words of each review, skipping the leading <START> token.
#
# No header row is written. Previously "review" was written without a trailing
# newline, which glued it onto the first excerpt ("reviewthis film ..."). A
# single-column projector metadata file is expected to have no header at all,
# so a separate header line would make the file one row longer than the
# embedding matrix.
with open(metadata_path, "w", encoding="utf-8") as f:
    for review in reviews:
        excerpt = " ".join(decode_review(review[1:31]))
        f.write(f"{excerpt}\n")

In [18]:
# Open an interactive TF session and register the document vectors as a graph
# variable named "embdgs".
sess = tf.InteractiveSession()
tf.get_variable("embdgs", initializer=embdgs)
# Event-file writer pointed at the output directory; it is handed to the
# projector configuration step later in the notebook.
writer = tf.summary.FileWriter(embdgs_dir)
# NOTE(review): created after the variable exists, presumably so the variable
# is included in the checkpoint - confirm against tf.train.Saver defaults.
saver = tf.train.Saver()

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
# Describe the embedding to the TensorBoard projector plugin.
config = projector.ProjectorConfig()
embdg_conf = config.embeddings.add()
# Same name that was passed to tf.get_variable when the variable was created.
embdg_conf.tensor_name = "embdgs"
# NOTE(review): bare file name, presumably resolved relative to the writer's
# log directory (embdgs_dir), where metadata.tsv was written - confirm.
embdg_conf.metadata_path = "metadata.tsv"
# Hand the configuration to the projector together with the summary writer.
projector.visualize_embeddings(writer, config)

In [20]:
# Initialize the graph variables, save a checkpoint into the same directory as
# the metadata file, then release the session.
sess.run(tf.global_variables_initializer())
saver.save(sess, embdgs_path)
sess.close()

## 6. Classify reviews

In [21]:
# Classifier on top of the document vectors: a single sigmoid unit trained
# with binary cross-entropy, i.e. logistic regression.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [22]:
# Fit the classifier on the training-review embeddings and their labels.
model.fit(train_embdgs, train_labels, batch_size=64, epochs=50, shuffle=True)

Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x212aa3d15c0>

In [23]:
# Score the classifier on the test-review embeddings; the displayed list is the
# loss followed by the compiled metric (accuracy).
model.evaluate(test_embdgs, test_labels)



[0.2794467719936371, 0.88748]