In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Load the movie-review dataset; the preview below shows a `review` text column and a `sentiment` label column
df = pd.read_csv('../input/movie-data/movie_data.csv', encoding='utf-8')
# Preview the first three rows
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


We will use the Gensim implementation of Word2Vec. 

The first step is to prepare the text corpus for learning the embedding by creating word tokens, removing punctuation, removing stop words etc. The word2vec algorithm processes documents sentence by sentence.

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Turn every raw review into a list of clean, lower-cased, alphabetic, non-stop-word tokens.
review_line = []
lines = df.review.values.tolist()

# Loop-invariant helpers: build them once instead of once per review.
# Translation table that deletes every punctuation character from a token
table = str.maketrans('', '', string.punctuation)
# Set membership keeps the stop-word check O(1) per token
stop_words = set(stopwords.words('english'))

for line in lines:
    tokens = [w.lower() for w in word_tokenize(line)]

    # Remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]

    # Keep only purely alphabetic tokens that are not stop words
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    review_line.append(words)

In [4]:
# Number of tokenized reviews (one token list per review)
len(review_line)

50000

We have 50000 review lines in our text corpus. Gensim’s Word2Vec API requires some parameters for initialization.

In [5]:
import gensim
# Parameters passed to gensim's Word2Vec below.
# NOTE(review): `size=` and `model.wv.vocab` are the gensim 3.x spellings; gensim 4 renamed them
# (`vector_size`, `wv.key_to_index`) — confirm the installed version before upgrading.

# sentences – List of tokenized sentences; here we pass the list of tokenized reviews.

# size – The number of dimensions used to represent each word, i.e. the size of the word vector.

# min_count – Words whose total frequency is lower than min_count are ignored. With min_count=1 every token is kept, including one-off typos and rare names. Usually, the bigger and more extensive your text, the higher this number can be.

# window – Only terms that occur within a window-neighborhood of a term, in a sentence, are associated with it during training. The usual value is 4 or 5.

# workers – Number of threads used to parallelize training.

embedding_dim = 100

model = gensim.models.Word2Vec(sentences=review_line, size=embedding_dim, window=5, workers=4, min_count=1)

# Vocabulary size (note: this rebinds the name `words` to the list of vocabulary entries)
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 134156


## Test Word2Vec Model
After training the model on the IMDB dataset, the vocabulary contains 134,156 words.
Test the model with a few example words.

In [6]:
# Nearest neighbours of 'great' in the learned embedding space, with their similarity scores
model.wv.most_similar('great')

[('excellent', 0.7754549384117126),
 ('wonderful', 0.7711958885192871),
 ('fantastic', 0.7553576231002808),
 ('fine', 0.7369243502616882),
 ('good', 0.7358146905899048),
 ('terrific', 0.7053135633468628),
 ('amazing', 0.6946685910224915),
 ('brilliant', 0.6699872016906738),
 ('incredible', 0.6601674556732178),
 ('awesome', 0.6546913981437683)]

In [7]:
# Analogy test (king - man + woman): look for words similar to 'woman' and 'king' but dissimilar to 'man'.
# A well-trained embedding would rank 'queen' near the top.
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('romeo', 0.8941930532455444),
 ('princess', 0.865547239780426),
 ('samoylova', 0.8610376119613647),
 ('juliet', 0.8575840592384338),
 ('prince', 0.8470630049705505),
 ('ladislaw', 0.8437036871910095),
 ('bride', 0.8431840538978577),
 ('angstingwithaninferioritycomplex', 0.8370692133903503),
 ('tianxia', 0.8325138092041016),
 ('crimecop', 0.8268693685531616)]

In [8]:
# Odd word out: which of the given words fits least well with the others?
print(model.wv.doesnt_match("woman king queen movie".split()))

movie


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [9]:
# Similarity between two word vectors. Query through `model.wv` (the keyed vectors), as the
# other cells do: `Word2Vec.similarity` is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.similarity('boy', 'girl'))

0.8440559


  """Entry point for launching an IPython kernel.


### Save model for the later use
The next step is to use the word embeddings directly in the embedding layer of our sentiment classification model. We can save the model so that it can be reused later.

In [10]:
# Save the word vectors in the plain-text (binary=False) word2vec format so they can be re-read line by line
filename = 'imdb_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

### Use Pre-trained Embedding

In [11]:
import os

# Read the saved word2vec text file into a {word: float32 vector} lookup table.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse
with open(os.path.join('', 'imdb_embedding_word2vec.txt'), encoding="utf-8") as f:
    for line_no, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse
            continue
        if line_no == 0 and len(values) == 2:
            # word2vec text files start with a "<vocab_size> <vector_size>" header, not a word vector
            continue
        word = values[0]
        # Parse the coefficients as floats; without an explicit dtype the array would hold strings
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [12]:
# Label-based split: `.loc` slices include the end label, so `:24999` selects index labels
# up to and including 24999 and `25000:` selects the rest.
X_train, y_train = (df.loc[:24999, col].values for col in ('review', 'sentiment'))
X_test, y_test = (df.loc[25000:, col].values for col in ('review', 'sentiment'))

In [13]:
# X_train and X_test are NumPy arrays, so `+` would combine them element-wise (string-concatenating
# review i of one split with review i of the other) instead of joining the two splits end to end.
total_reviews = np.concatenate([X_train, X_test])
# Every encoded review is padded or truncated to this many tokens
max_length = 260

The next step is to convert the reviews into padded sequences of integer word indices.

Recall that the review documents are integer encoded prior to passing them to the Embedding layer. The integer maps to the index of a specific vector in the embedding layer. Therefore, it is important that we lay the vectors out in the Embedding layer such that the encoded words map to the correct vector.

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the cleaned token lists
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_line)
word_index = tokenizer.word_index

# Integer-encode every review, then pad/truncate each one to max_length entries
sequence = tokenizer.texts_to_sequences(review_line)
review_pad = pad_sequences(sequence, maxlen=max_length)

# Labels, in the same row order as the reviews
sentiment = df.sentiment.values

print('Found %s unique tokens.' % len(word_index))
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

Found 134156 unique tokens.
Shape of review tensor: (50000, 260)
Shape of sentiment tensor: (50000,)


In [15]:
# Split data into training and testing
# Use a fixed seed so the shuffle -- and therefore the split and every score reported
# below -- is reproducible under Restart & Run All.
rng = np.random.RandomState(42)
index = rng.permutation(review_pad.shape[0])
review_pad = review_pad[index]
sentiment = sentiment[index]

# Hold out the last 20% of the shuffled rows
vali_samples = int(0.2 * review_pad.shape[0])
# Explicit split point: a `[:-vali_samples]` slice would be empty if vali_samples were 0
split_at = review_pad.shape[0] - vali_samples

X_train_pad = review_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = review_pad[split_at:]
y_test = sentiment[split_at:]

print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_test_pad.shape)
print('Shape of y_test tensor:', y_test.shape)

Shape of X_train_pad tensor: (40000, 260)
Shape of y_train tensor: (40000,)
Shape of X_test_pad tensor: (10000, 260)
Shape of y_test tensor: (10000,)


Now map the embeddings loaded from the saved word2vec file to each word in the `tokenizer.word_index` vocabulary and create a matrix of word vectors.

In [16]:
# Build the (num_words x EMBEDDING_DIM) weight matrix whose row i holds the vector of the
# word that the tokenizer encodes as integer i.
EMBEDDING_DIM = 100
# One extra row because the matrix is indexed by the tokenizer's word index directly,
# so it must have a row for the largest index len(word_index).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so i == num_words must be skipped as well
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Otherwise the word is missing from the embedding index and its row stays all-zeros.

### Build Model
The trained embedding vectors are now ready to be used directly in the embedding layer.

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.initializers import Constant

# Embedding lookup initialised from the word2vec matrix.
# trainable=False keeps the pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,
)

# Embedding -> 1D convolution -> max pooling -> flatten -> single sigmoid output
model = Sequential([
    embedding_layer,
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.summary()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 260, 100)          13415700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 256, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 128, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16384)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 16385     
Total params: 13,496,213
Trainable params: 80,513
Non-trainable params: 13,415,700
_________________________________________________________________


In [18]:
# Compile model: binary cross-entropy loss, Adam optimizer, accuracy reported under the name 'acc'
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Since the model uses pre-trained word embeddings, it has very few trainable parameters and hence should train faster.

In [19]:
# Fit model
# NOTE(review): validation_data is the same held-out split that is evaluated afterwards, so the
# validation scores are not independent of the final test score. In the recorded run val_loss is
# lowest at epoch 2 and rises afterwards while training accuracy approaches 1.0 (overfitting) --
# consider fewer epochs or early stopping on a separate validation split.
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 40000 samples, validate on 10000 samples
Epoch 1/25
 - 44s - loss: 0.4578 - acc: 0.7852 - val_loss: 0.3574 - val_acc: 0.8460
Epoch 2/25
 - 44s - loss: 0.3036 - acc: 0.8730 - val_loss: 0.3355 - val_acc: 0.8566
Epoch 3/25
 - 44s - loss: 0.2326 - acc: 0.9056 - val_loss: 0.3540 - val_acc: 0.8470
Epoch 4/25
 - 44s - loss: 0.1727 - acc: 0.9355 - val_loss: 0.3710 - val_acc: 0.8482
Epoch 5/25
 - 43s - loss: 0.1153 - acc: 0.9625 - val_loss: 0.4428 - val_acc: 0.8396
Epoch 6/25
 - 43s - loss: 0.0743 - acc: 0.9787 - val_loss: 0.4675 - val_acc: 0.8428
Epoch 7/25
 - 43s - loss: 0.0467 - acc: 0.9915 - val_loss: 0.5203 - val_acc: 0.8419
Epoch 8/25
 - 43s - loss: 0.0263 - acc: 0.9972 - val_loss: 0.5872 - val_acc: 0.8349
Epoch 9/25
 - 44s - loss: 0.0148 - acc: 0.9994 - val_loss: 0.6147 - val_acc: 0.8371
Epoch 10/25
 - 44s - loss: 0.0089 - acc: 0.9999 - val_loss: 0.6438 - val_acc: 0.8397
Epoch 11/25
 - 44s - loss: 0.0059 - acc: 0.9999 - val_loss: 0.6740 - val_acc: 0.8415
Epoch 12/25
 - 44s - los

<keras.callbacks.callbacks.History at 0x7f033643feb8>

In [20]:
# Evaluate model on the held-out split; the two returned values are unpacked as loss (`score`)
# and the compiled 'acc' metric (`accuracy`)
score, accuracy = model.evaluate(X_test_pad, y_test, batch_size=128)



In [21]:
# Report the held-out accuracy as a percentage with two decimals
print("Accuracy: {0:.2%}".format(accuracy))

Accuracy: 83.81%
