# Using Doc2Vec to classify movie reviews

## 1. First steps

### 1.1 Import packages

In [1]:
from tensorflow.contrib.tensorboard.plugins import projector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
import tensorflow as tf
import multiprocessing
import numpy as np
import os

### 1.2 Download data

In [2]:
# Short alias for the Keras IMDB dataset module; later cells call its
# load_data() and get_word_index() functions.
imdb = tf.keras.datasets.imdb

In [3]:
# Load the train and test splits with the loader's default arguments; each split
# unpacks into a (reviews, labels) pair.
# NOTE(review): reviews are presumably sequences of integer word ids - the
# decoding cells below treat them that way.
(train_reviews, train_labels), (test_reviews, test_labels) = imdb.load_data()

### 1.3 Build `vocab` and `vocab_rev`

In [4]:
# Fetch the dataset's word index, used as a word -> integer id mapping when the
# vocabulary is built.
vocab = imdb.get_word_index()

In [5]:
# Rebuild the vocabulary from the raw word index every time this cell runs.
# Deriving `vocab` from itself made the cell non-idempotent: a second run shifted
# every id (including the special tokens) by another 3 and broke decoding.
word_index = imdb.get_word_index()

# Shift every id up by 3 to free ids 0-3 for the special tokens below.
vocab = {word: idx + 3 for word, idx in word_index.items()}
vocab.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,
    "<UNUSED>": 3,
})

In [6]:
# Invert `vocab` so integer ids can be mapped back to their words.
vocab_rev = {idx: word for word, idx in vocab.items()}

### 1.4 Create `decode_review`

In [7]:
def decode_review(review):
    """Translate a sequence of integer ids into the matching list of words.

    Args:
        review: iterable of ids to look up in `vocab_rev`.

    Returns:
        A list with one word per id; ids missing from `vocab_rev` become "?".
    """
    words = []
    for token_id in review:
        words.append(vocab_rev.get(token_id, "?"))
    return words

## 2. Learn embeddings

### 2.1 Prepare docs

In [9]:
# Join the train and test reviews into one array (train first, then test) so
# every review is turned into a document for the embedding model.
reviews = np.concatenate([train_reviews, test_reviews], axis=0)

In [10]:
# One TaggedDocument per review: the decoded words, tagged with the review's
# position in `reviews`.
docs = [
    TaggedDocument(words=decode_review(token_ids), tags=[doc_id])
    for doc_id, token_ids in enumerate(reviews)
]

### 2.2 Train `Doc2Vec` model

In [12]:
class Doc2VecCallback(CallbackAny2Vec):
    """Training callback that advances a Keras progress bar once per epoch."""

    def __init__(self, epochs):
        """Create a progress bar whose target is `epochs` and zero the epoch counter."""
        self.epoch = 0
        self.prog_bar = tf.keras.utils.Progbar(target=epochs)

    def on_epoch_end(self, model):
        """Count the epoch that just finished and move the progress bar to it.

        `model` is accepted to match the callback signature but is not used.
        """
        self.epoch = self.epoch + 1
        self.prog_bar.update(current=self.epoch)

In [13]:
# CPU count as reported by multiprocessing; passed to Doc2Vec as `workers`.
cores = multiprocessing.cpu_count()

In [14]:
# Single source of truth for the epoch count, so the progress bar's target can
# never drift from the number of epochs Doc2Vec actually runs.
d2v_epochs = 100

# Passing `docs` to the constructor trains the model right away.
# Per the gensim documentation: dm=0 selects PV-DBOW, hs=0 with negative=5 uses
# negative sampling with 5 noise words, sample=0 disables down-sampling of
# frequent words, and min_count=2 drops words that occur only once.
d2v_model = Doc2Vec(docs, dm=0, min_count=2, vector_size=100, hs=0, negative=5,
                    epochs=d2v_epochs, callbacks=[Doc2VecCallback(d2v_epochs)],
                    sample=0, workers=cores)



In [16]:
# Matrix of learned document vectors.
# NOTE(review): rows are assumed to follow the integer tags assigned in `docs`,
# i.e. the order of `reviews` - confirm against the gensim version in use.
embdgs = d2v_model.docvecs.vectors_docs

# `reviews` was built as train followed by test, so splitting at the size of the
# training set recovers both groups without hard-coding the dataset size.
train_embdgs, test_embdgs = np.split(embdgs, [len(train_reviews)])

### 2.3 Visualize embeddings

In [17]:
# Directory that receives the metadata file, the checkpoint and the summary
# writer's output for the embedding visualization.
embdgs_dir = "embdgs"

In [18]:
# Create the output directory. exist_ok=True keeps this cell re-runnable:
# os.mkdir raises FileExistsError when the directory is already there.
os.makedirs(embdgs_dir, exist_ok=True)

In [19]:
# Per-review labels are written to meta.tsv; the embedding variable is saved to
# the embdgs.ckpt checkpoint. Both live inside `embdgs_dir`.
meta_path = os.path.join(embdgs_dir, "meta.tsv")
embdgs_path = os.path.join(embdgs_dir, "embdgs.ckpt")

In [20]:
# Write one label line per review, in the same order as `reviews`.
# A single-column projector metadata file must not contain a header row, so no
# "review" header is emitted. Writing it without a newline glued the word onto
# the first review's label; writing it with one would leave one more line than
# there are reviews.
with open(meta_path, "w", encoding="utf-8") as f:
    for review in reviews:
        # Skip the first token and keep the next 30 words as a short label.
        excerpt = " ".join(decode_review(review[1:31]))
        f.write(f"{excerpt}\n")

In [21]:
# Start from an empty default graph so re-running this cell does not collide
# with an "embdgs" variable left over from an earlier run.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Graph variable named "embdgs", initialized from the learned document vectors.
tf.get_variable("embdgs", initializer=embdgs)
# Summary writer pointed at the embeddings directory; handed to the projector below.
writer = tf.summary.FileWriter(embdgs_dir)
# Created after the variable so the saver has a variable to checkpoint.
saver = tf.train.Saver()

In [22]:
# Describe the embedding to the TensorBoard projector: which tensor to show and
# which metadata file labels its rows.
config = projector.ProjectorConfig()
embdg_conf = config.embeddings.add()
# Same name as the variable created with tf.get_variable("embdgs", ...).
embdg_conf.tensor_name = "embdgs"
# File name only (no directory part).
# NOTE(review): presumably resolved relative to the writer's log directory,
# where meta.tsv was written - confirm.
embdg_conf.metadata_path = "meta.tsv"
projector.visualize_embeddings(writer, config)

In [23]:
# Initialize the embedding variable and write it to the checkpoint.
# try/finally guarantees the session and the summary writer are released even if
# initialization or saving fails; the writer was previously never closed.
try:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, embdgs_path)
finally:
    sess.close()
    writer.close()

## 3. Classify reviews

### 3.1 Build model

In [36]:
# A single sigmoid output unit fed directly by the inputs.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### 3.2 Train model

In [37]:
# Fit the classifier on the training-set document vectors and their labels.
model.fit(
    x=train_embdgs,
    y=train_labels,
    batch_size=64,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1880097f7b8>

### 3.3 Evaluate model

In [35]:
# Score the classifier on the test-set document vectors; with the compiled
# metrics the result is displayed as [loss, accuracy].
model.evaluate(x=test_embdgs, y=test_labels)



[0.26972603229045866, 0.888]