In [None]:
# Install additional packages
#!pip install gensim

In [22]:
import pandas as pd
import numpy as np
from random import shuffle
import pickle
import re
from collections import defaultdict
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle as sk_shuffle
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional,\
                         concatenate, Flatten, dot

In [2]:
# Load the movie-plot dataset; the "Plot" column is what gets tokenized below.
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

**TODO:**
1. Create tf-idf vectorization of documents
2. Create generator
    * Sample N documents
    * Sample keywords (1 to n_max) from each document (selection prob. based on tf-idf)
    * Sample k negative documents for each set of search terms
    
3. Model architecture
    * Embedding layer for document (trained during model training)
    * Embedding layer for keywords (use pre-trained word2vec vectors)
    * Concatenate embeddings
    * Output layer of 1 unit w/ binary crossentropy
    


**Model Architecture**

<img src="img/model_architecture.jpg" width=400 align="left">

In [3]:
def tokenize_text(x):
    """Split raw text into sentences of lower-cased word tokens.

    The text is lower-cased, split into sentences with `sent_tokenize`, and
    each sentence is split into tokens with a `TreebankWordTokenizer`. Tokens
    that do not start with an ASCII letter (punctuation, numbers, ...) are
    dropped.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    list of list of str
        One inner list of tokens per sentence. An inner list may be empty
        when every token of a sentence is filtered out.
    """
    # Build the tokenizer and the compiled pattern once per call instead of
    # once per sentence / per token.
    word_tokenizer = TreebankWordTokenizer()
    starts_with_letter = re.compile("[A-Za-z]").match

    sentences = sent_tokenize(x.lower())
    return [
        [w for w in word_tokenizer.tokenize(s) if starts_with_letter(w) is not None]
        for s in sentences
    ]

In [4]:
# Flatten every plot into its sentences: each entry of X_plots is a
# (list of tokens, movie index) tuple, where the movie index is the plot's
# position in df["Plot"].
X_plots = [
    (sentence_tokens, movie_idx)
    for movie_idx, plot in enumerate(df["Plot"])
    for sentence_tokens in tokenize_text(plot)
]

In [5]:
# Separate plots from movie index: `x` becomes a tuple of token lists (one per
# sentence), while the matching movie indices are discarded into `_`.
x, _ = zip(*X_plots)

In [6]:
# Train word2vec model on the tokenized sentences (100-dimensional vectors)
# and persist it to disk.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm against the installed version.
wv_model = word2vec.Word2Vec(x, size=100)
wv_model.save("../models/word2vec.model")

In [7]:
# Sanity check: nearest neighbours of "monster" in the trained embedding space.
wv_model.wv.most_similar("monster")

[('creature', 0.9068116545677185),
 ('beast', 0.8441286087036133),
 ('werewolf', 0.8003013730049133),
 ('demon', 0.7928799986839294),
 ('giant', 0.7789997458457947),
 ('alien', 0.7764308452606201),
 ('mummy', 0.7652592062950134),
 ('vampire', 0.7628523111343384),
 ('whale', 0.7531956434249878),
 ('sphere', 0.749419093132019)]

In [8]:
# Sanity check: the analogy query king - man + woman.
wv_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.8515051007270813),
 ('princess', 0.8008507490158081),
 ('prince', 0.7408854365348816),
 ('empress', 0.7261396050453186),
 ('countess', 0.7169984579086304),
 ('count', 0.7018733620643616),
 ('goddess', 0.6616137623786926),
 ('emperor', 0.6534422636032104),
 ('crown', 0.6426087021827698),
 ('consort', 0.6280940771102905)]

In [9]:
# Vocabulary lookup table: index 0 is reserved for unknown words, and the
# word2vec vocabulary follows in its original order, shifted by one.
word2index = {"<UNK>": 0}
word2index.update(
    (token, position)
    for position, token in enumerate(wv_model.wv.index2word, start=1)
)

# Embedding matrix aligned with word2index: an all-zero row for <UNK>
# stacked on top of the trained word vectors.
unk_vector = np.zeros((1, wv_model.wv.vectors.shape[1]))
word_vectors = np.concatenate([unk_vector, wv_model.wv.vectors], axis=0)

In [10]:
# Both figures include the extra <UNK> entry, so the vocabulary size should
# equal the number of rows of the embedding matrix.
print("Vocabulary Size: %i" %len(word2index))
print("Embedding matrix shape: %s" %str(word_vectors.shape))

Vocabulary Size: 64131
Embedding matrix shape: (64131, 100)


In [11]:
def index_lookup(x):
    """Return the vocabulary index of token `x`.

    Tokens that are not keys of `word2index` map to the index stored under
    "<UNK>".
    """
    return word2index[x] if x in word2index else word2index["<UNK>"]

In [12]:
# Replace every token by its vocabulary index (unknown tokens -> <UNK>),
# keeping the movie index next to each sentence.
X_plots_ind = [([index_lookup(x) for x in sentence], i) for sentence, i in X_plots]
# Each row is (variable-length list, int), i.e. a ragged structure. Asking for
# dtype=object explicitly yields the (n_sentences, 2) object array the
# generator indexes with [:, 0] / [:, 1]; without it NumPy >= 1.19 warns and
# NumPy >= 1.24 raises ValueError.
X_plots_ind = np.array(X_plots_ind, dtype=object)

Create a generator that yields training batches for the neural network.

In [71]:
def generator(samples, batch_size = 32, n_neg = 5, max_len = 100):
    """Yield an endless stream of shuffled (sentence, movie) training batches.

    Every batch contains the positive pairs taken from `samples` plus `n_neg`
    negative pairs per positive. A negative pair reuses the movie index of a
    positive and combines it with a sentence drawn uniformly at random from
    all of `samples`, so a negative may occasionally be a sentence of that
    same movie.

    Parameters
    ----------
    samples : np.ndarray, shape (num_samples, 2)
        Column 0 holds a sentence as a list of word indices, column 1 the
        movie index the sentence belongs to.
    batch_size : int
        Number of positive pairs per batch; the last batch of a pass over
        `samples` may be smaller.
    n_neg : int
        Number of negative pairs generated per positive pair.
    max_len : int
        Length every sentence is padded / truncated to by `pad_sequences`.

    Yields
    ------
    tuple
        `([X_sent, X_movie], y)` where, with n = batch_n * (1 + n_neg),
        `X_sent` has shape (n, max_len), `X_movie` has shape (n, 1) and `y`
        has shape (n,) with 1 for positive and 0 for negative pairs. The rows
        of all three arrays are shuffled together.

    Raises
    ------
    ValueError
        If `samples` is empty (the original loop would spin forever without
        ever yielding).
    """
    num_samples = samples.shape[0]
    if num_samples == 0:
        raise ValueError("generator() requires at least one sample")

    ind = np.arange(num_samples)

    while True:
        # New pass over the data: reshuffle so batches differ between passes.
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):

            # Positive examples: sentences with their own movie.
            batch_samples = samples[offset:offset + batch_size]
            batch_n = len(batch_samples)  # Size of current batch
            pos_sentences = batch_samples[:, 0]
            pos_movies = batch_samples[:, 1]

            # Negative examples: random sentences paired with the batch's
            # movies. Sample without replacement when possible; fall back to
            # sampling with replacement only when more negatives are requested
            # than there are samples (which used to raise).
            total_neg = batch_n * n_neg
            keep_indx = np.random.choice(ind, total_neg,
                                         replace=total_neg > num_samples)
            neg_sentences = samples[keep_indx][:, 0]
            neg_movies = np.repeat(pos_movies, n_neg)

            # Pad zeros
            X_sent = pad_sequences(list(pos_sentences) + list(neg_sentences),
                                   maxlen=max_len)
            X_movie = np.array(list(pos_movies) + list(neg_movies)).reshape(-1, 1)
            y_out = np.array([1]*batch_n + [0]*total_neg)

            # Shuffle inputs AND labels with the same permutation. Shuffling
            # only the inputs (as before) detaches every row from its label.
            X_sent, X_movie, y_out = sk_shuffle(X_sent, X_movie, y_out)

            yield ([X_sent, X_movie], y_out)

In [89]:
BATCH_SIZE = 128  # positive pairs per batch
NEG_SAMPLE = 5  # negative pairs generated per positive pair
# max_len is left at the generator default (100); it has to equal the MAX_LEN
# used for the model's query input.
train_generator = generator(X_plots_ind, batch_size = BATCH_SIZE, n_neg=NEG_SAMPLE)

In [90]:
# Pull one batch (this consumes it from the generator) to check the shapes:
# BATCH_SIZE positives plus BATCH_SIZE * NEG_SAMPLE negatives = 768 rows.
X, y = next(train_generator)

X[0].shape, X[1].shape, y.shape

((768, 100), (768, 1), (768,))

In [83]:
# Padded length of a query sentence; must match the generator's max_len.
MAX_LEN = 100

# Define query embedding layer
# d1 = number of rows of the embedding matrix (vocabulary incl. <UNK>),
# d2 = embedding dimensionality.
d1, d2 = word_vectors.shape

# Initialised from the word2vec matrix and frozen during training.
query_embedding = Embedding(d1, d2, weights = [word_vectors],
               input_length = MAX_LEN,
               trainable=False)


# Define movie embedding layer
# One trainable d2-dimensional vector per row of df; the movie indices fed to
# it are the row positions produced when X_plots was built.
movie_embedding = Embedding(df.shape[0], d2,
                            input_length = 1,
                            trainable = True)

In [84]:
# Two inputs: a padded sentence of word indices and a single movie index.
in1 = Input(shape=(MAX_LEN,))
in2 = Input(shape=(1,))

# Query branch: frozen word embeddings -> BiLSTM -> projection into the movie
# embedding space. The projection width is d2 (not a literal 100) so that it
# always matches movie_embedding's output size, which the dot product requires.
q = query_embedding(in1)
q = Bidirectional(LSTM(20))(q)
q = Dense(d2)(q)

# Movie branch: look up the movie vector and drop the length-1 sequence axis.
m = movie_embedding(in2)
m = Flatten()(m)

# Cosine similarity between query and movie vectors (normalize=True),
# squashed into a match probability by a single sigmoid unit.
c = dot([q, m], axes = 1, normalize=True)
#c = average([q, m])
out = Dense(1, activation="sigmoid")(c)

model = Model([in1, in2], out)

In [85]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 100, 100)     6413100     input_27[0][0]                   
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
bidirectional_13 (Bidirectional (None, 40)           19360       embedding_5[0][0]                
__________________________________________________________________________________________________
embedding_

In [91]:
# One epoch = one pass over all positive pairs. np.ceil returns a float, but
# Keras documents steps_per_epoch as an integer, so cast explicitly.
STEPS_PER_EPOCH = int(np.ceil(X_plots_ind.shape[0] / BATCH_SIZE))
NUM_EPOCHS = 5

model.compile(optimizer = "adam", loss = "binary_crossentropy")
model.fit_generator(train_generator, steps_per_epoch=STEPS_PER_EPOCH, 
                    epochs = NUM_EPOCHS)

Epoch 1/5
 161/5773 [..............................] - ETA: 41:14 - loss: 0.6636

KeyboardInterrupt: 

In [None]:
# Persist the model (architecture and weights) to disk.
model.save("../models/keras_model.hdf5")

In [None]:
# Export the movie embedding matrix: the layer's only weight array, one row
# per movie index.
movie_vectors = movie_embedding.get_weights()[0]
np.save("../models/movie_vectors", movie_vectors)