In [1]:
import pandas as pd
import numpy as np
from random import shuffle
import pickle
import re
from collections import defaultdict
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle as sk_shuffle
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional,\
                                     concatenate, Flatten, dot

Using TensorFlow backend.


In [2]:
# Load the movie-plot dataset; the "Plot" column holds the plot text used below.
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


**Model Architecture**

<img src="img/model_architecture.jpg" width=400 align="left">

In [4]:
def tokenize_text(x):
    """Split a text into lower-cased sentences of word tokens.

    Parameters
    ----------
    x : str
        Raw text (here: one movie plot).

    Returns
    -------
    list of list of str
        One inner list per sentence found by ``sent_tokenize``. Only tokens
        whose first character is an ASCII letter are kept, so tokens that
        start with punctuation or a digit are dropped. A sentence without any
        such token yields an empty list.
    """
    # Build the word tokenizer once per call instead of once per sentence.
    word_tokenizer = TreebankWordTokenizer()
    sentences = sent_tokenize(x.lower())
    # re.match anchors at the start of the token, so only its first character is tested.
    return [
        [w for w in word_tokenizer.tokenize(s) if re.match("[A-Za-z]", w) is not None]
        for s in sentences
    ]

In [5]:
# Tokenize each movie plot into sentences, each a list of tokens.
# X_plots collects one (list of tokens, movie index) tuple per sentence, where the
# movie index is the position of the plot in df["Plot"].
# Sentences that end up with no tokens (e.g. only punctuation or digits) are skipped:
# they carry no words to learn from and would later turn into all-padding inputs.
X_plots = [
    (tokens, movie_idx)
    for movie_idx, plot in enumerate(df["Plot"])
    for tokens in tokenize_text(plot)
    if tokens
]

In [7]:
# Keep only the token lists (dropping the movie indices) as the Word2Vec corpus.
# The comprehension keeps `_` local, so IPython's `_` (last output) is not overwritten
# at notebook scope, and an empty X_plots gives an empty list instead of an unpacking error.
x = [tokens for tokens, _ in X_plots]

In [8]:
# Train 200-dimensional word vectors on the tokenized plot sentences and save the model to disk.
# NOTE(review): `size` and `iter` look like the pre-4.0 gensim argument names — confirm the installed version.
wordvec = Word2Vec(x, size=200, iter=10)
wordvec.save("../models/word2vec")

In [9]:
# Sanity check: list the vocabulary words most similar to "monster".
wordvec.wv.most_similar("monster")

[('creature', 0.8121660947799683),
 ('demon', 0.7041882276535034),
 ('beast', 0.6551673412322998),
 ('monstrous', 0.6548070907592773),
 ('sphere', 0.6453431248664856),
 ('dinosaur', 0.6338607668876648),
 ('mummy', 0.6314750909805298),
 ('werewolf', 0.6305549740791321),
 ('whale', 0.6272330284118652),
 ('giant', 0.6252624988555908)]

In [10]:
# Analogy check: words most similar to "king" and "woman" while dissimilar to "man".
wordvec.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.6824626922607422),
 ('princess', 0.6508470177650452),
 ('countess', 0.5387574434280396),
 ('emperor', 0.5184895992279053),
 ('empress', 0.5171738266944885),
 ('prince', 0.49930691719055176),
 ('consort', 0.49101826548576355),
 ('goddess', 0.4869394898414612),
 ('dowager', 0.4809301197528839),
 ('clementianna', 0.47523051500320435)]

In [11]:
# Map every vocabulary term to an integer index; index 0 is reserved for "<UNK>"
word2index = {"<UNK>": 0}
word2index.update(
    (term, position) for position, term in enumerate(wordvec.wv.index2word, start=1)
)

# Embedding matrix aligned with word2index: an all-zero row for "<UNK>" on top of the Word2Vec vectors
word_vectors = np.vstack([np.zeros((1, wordvec.wv.vectors.shape[1])), wordvec.wv.vectors])

In [12]:
# The two counts should agree: the embedding matrix has one row per word2index entry.
print("Vocabulary Size: %i" %len(word2index))
print("Embedding matrix shape: %s" %str(word_vectors.shape))

Vocabulary Size: 59607
Embedding matrix shape: (59607, 200)


In [13]:
def index_lookup(x):
    """Return the index of token ``x`` in ``word2index``, or the "<UNK>" index if it is absent."""
    if x in word2index:
        return word2index[x]
    return word2index["<UNK>"]

In [14]:
# Replace every token with its vocabulary index, keeping the movie index alongside.
# Sentences differ in length, so the (n_sentences, 2) object array is filled explicitly:
# column 0 holds the list of word indices, column 1 the movie index. Letting np.array()
# infer this from ragged nested data is deprecated / rejected by newer NumPy releases.
X_plots_ind = np.empty((len(X_plots), 2), dtype=object)
for row, (sentence, movie_idx) in enumerate(X_plots):
    X_plots_ind[row, 0] = [index_lookup(token) for token in sentence]
    X_plots_ind[row, 1] = movie_idx

Create a generator that yields batches of positive and randomly sampled negative (sentence, movie) pairs for training the neural network.

In [15]:
def generator(samples, batch_size = 32, n_neg = 5, max_len = 100):
    """Endlessly yield training batches of positive and negative (sentence, movie) pairs.

    Parameters
    ----------
    samples : 2-D array with one row per sentence
        Column 0 holds the sentence as a list of word indices, column 1 the
        index of the movie the sentence belongs to.
    batch_size : int
        Number of positive pairs per batch (the last batch of a pass may be smaller).
    n_neg : int
        Number of negative pairs generated per positive pair.
    max_len : int
        ``maxlen`` passed to ``pad_sequences`` for the sentence input.

    Yields
    ------
    ([X_sent, X_movie], y)
        ``X_sent`` is the padded sentence batch, ``X_movie`` the movie indices
        reshaped to one column and ``y`` the labels. The first ``batch_n``
        rows are the positive pairs (label 1), followed by ``batch_n * n_neg``
        negative pairs (label 0).

    Raises
    ------
    ValueError
        If ``samples`` has no rows (raised when the first batch is requested).
    """
    num_samples = samples.shape[0]
    if num_samples == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("samples must contain at least one (sentence, movie index) row")

    while True:
        # Reshuffle at the start of every pass over the data.
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            # Positive examples: each sentence paired with its own movie.
            batch_samples = samples[offset:offset + batch_size]
            batch_n = len(batch_samples)  # Size of current batch
            pos_sentences = batch_samples[:, 0]
            pos_movies = batch_samples[:, 1]

            # Negative examples: every positive movie is paired with n_neg random sentences.
            total_neg = batch_n * n_neg
            neg_movies = np.repeat(pos_movies, n_neg)
            # Draw without replacement when possible; fall back to drawing with
            # replacement when more negatives are requested than rows exist.
            neg_rows = np.random.choice(num_samples, total_neg,
                                        replace=total_neg > num_samples)
            # A random sentence may belong to the very movie it is paired with, which
            # would label a true pair as negative. Redraw such rows a few times
            # (bounded, so data containing a single movie cannot loop forever).
            for _ in range(3):
                clashes = np.flatnonzero(samples[neg_rows, 1] == neg_movies)
                if clashes.size == 0:
                    break
                neg_rows[clashes] = np.random.choice(num_samples, clashes.size)
            neg_sentences = samples[neg_rows, 0]

            X_sent = list(pos_sentences) + list(neg_sentences)
            X_movie = list(pos_movies) + list(neg_movies)
            y_out = [1] * batch_n + [0] * total_neg

            # Pad zeros
            X_sent = pad_sequences(X_sent, maxlen=max_len)

            yield [X_sent, np.array(X_movie).reshape(-1, 1)], np.array(y_out)

In [16]:
# Each batch holds BATCH_SIZE positive pairs plus NEG_SAMPLE negative pairs per positive.
BATCH_SIZE = 256
NEG_SAMPLE = 5
# NOTE: max_len is left at the generator's default (100); it has to equal the MAX_LEN used for the model input.
train_generator = generator(X_plots_ind, batch_size = BATCH_SIZE, n_neg=NEG_SAMPLE)

In [17]:
# Pull one batch from the generator and display the shapes of both inputs and the labels.
X, y = next(train_generator)

X[0].shape, X[1].shape, y.shape

((1536, 100), (1536, 1), (1536,))

In [18]:
# Number of word indices per sentence fed to the model
MAX_LEN = 100

# d1 = number of rows in the word-vector matrix (vocabulary incl. "<UNK>"), d2 = vector size
d1, d2 = word_vectors.shape

# Define query embedding layer: initialised with the Word2Vec matrix and not trained further
query_embedding = Embedding(d1, d2, 
                            input_length = MAX_LEN,
                            weights = [word_vectors],
                            trainable = False)


# Define movie embedding layer: one trainable d2-dimensional vector per row of df
movie_embedding = Embedding(df.shape[0], d2,
                            input_length = 1,
                            trainable = True)

In [19]:
# Define Model
# Inputs: a sentence of MAX_LEN word indices and a single movie index
in1 = Input(shape=(MAX_LEN,))
in2 = Input(shape=(1,))

# Query branch: word embeddings -> LSTM -> two dense layers.
# The last dense layer is sized with d2 (the movie-embedding width) instead of a
# hard-coded 200, because the dot product below needs both branches to have equal width.
q = query_embedding(in1)
q = LSTM(50)(q)
q = Dense(500)(q)
q = Dense(d2)(q)

# Movie branch: look up the movie vector and flatten away the length-1 axis
m = movie_embedding(in2)
m = Flatten()(m)

c = dot([q, m], axes = 1, normalize=True) # Cosine similarity

# Map the similarity to a probability that the sentence belongs to the movie
out = Dense(1, activation="sigmoid")(c)

model = Model([in1, in2], out)

query_out = Model([in1], q)  # Generates query embedding

In [20]:
# Print the layer-by-layer summary of the full model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 200)     11921400    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 50)           50200       embedding_1[0][0]                
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
dense_1 (D

In [21]:
# Batches per epoch: the number of full BATCH_SIZE batches in the sentence set
# (floor division, so a trailing partial batch is not counted).
STEPS_PER_EPOCH = X_plots_ind.shape[0] // BATCH_SIZE
NUM_EPOCHS = 50

# Labels are 1 (positive pair) / 0 (negative pair), matched by a binary cross-entropy loss on the sigmoid output.
model.compile(optimizer = "adam", loss = "binary_crossentropy")
model.fit_generator(train_generator, steps_per_epoch=STEPS_PER_EPOCH, 
                    epochs = NUM_EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2edcb88cf28>

In [22]:
# Save models: the full training model and the sub-model that outputs the query embedding
model.save("../models/keras_model.hdf5")
query_out.save("../models/query_embedding.hdf5")

In [23]:
# Export the first weight array of the movie embedding layer as tab-separated text.
# NOTE(review): presumably one row per movie, in the same order as df — confirm.
movie_vectors = movie_embedding.get_weights()[0]
np.savetxt('../models/movie_embeddings.tsv', movie_vectors, delimiter='\t')

In [24]:
# Build the metadata table written next to the exported movie embeddings.
# Titles are formatted as "Title (Year)". The two columns are zipped directly rather
# than formatted row by row with apply(axis=1) and integer keys: integer keys on a
# label-indexed row depend on positional fallback, which newer pandas versions
# deprecate, and zipping avoids building a Series per row.
titles = pd.Series(
    ["%s (%i)" % (title, year) for title, year in zip(df["Title"], df["Release Year"])],
    index=df.index,
)
meta = pd.DataFrame({"Title": titles, "Genre": df["Genre"], "Director": df["Director"]})
meta.to_csv("../models/movie_embeddings_meta.tsv", sep="\t", index=False)