In [1]:
import pandas as pd
import numpy as np
from random import shuffle
import pickle
import re
from collections import defaultdict
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle as sk_shuffle
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional,\
                         concatenate, Flatten

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the movie-plot dataset; its "Plot" column is the text tokenized below.
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

**TODO:**
1. Create tf-idf vectorization of documents
2. Create generator
    * Sample N documents
    * Sample keywords (1 to n_max) from each document (selection prob. based on tf-idf)
    * Sample k negative documents for each set of search terms
    
3. Model architecture
    * Embedding layer for document (trained during model training)
    * Embedding layer for keywords (use pre-trained word2vec vectors)
    * Concatenate embeddings
    * Output layer of 1 unit w/ binary crossentropy
    


**Model Architecture**

<img src="img/model_architecture.jpg" width=400 align="left">

In [3]:
def tokenize_text(x):
    """Split raw text into sentences of lower-cased word tokens.

    The text is lower-cased, split into sentences with ``sent_tokenize`` and
    each sentence is tokenized with ``TreebankWordTokenizer``. Tokens that do
    not start with an ASCII letter (numbers, punctuation, ...) are dropped.

    Parameters
    ----------
    x : str
        Raw text to tokenize.

    Returns
    -------
    list of list of str
        One inner list of kept tokens per detected sentence.
    """
    # Build the tokenizer and the compiled pattern once per call instead of
    # once per sentence / per token.
    word_tokenizer = TreebankWordTokenizer()
    starts_with_letter = re.compile("[A-Za-z]").match

    sentences = sent_tokenize(x.lower())
    return [
        [w for w in word_tokenizer.tokenize(s) if starts_with_letter(w)]
        for s in sentences
    ]

In [4]:
# Break every movie plot into tokenized sentences and flatten the result into
# a single list of (sentence tokens, movie index) pairs, where the movie index
# is the plot's position in df["Plot"].
X_plots = [
    (sentence_tokens, movie_idx)
    for movie_idx, plot in enumerate(df["Plot"])
    for sentence_tokens in tokenize_text(plot)
]

In [5]:
# Separate the tokenized sentences from their movie indices: `x` becomes a
# tuple of token lists (one per sentence); the movie indices are unpacked
# into `_` and not used here.
x, _ = zip(*X_plots)

In [6]:
# Train a word2vec model on the tokenized sentences (200-dimensional vectors)
# and save it to ../models/word2vec.model.
wv_model = word2vec.Word2Vec(x, size=200)
wv_model.save("../models/word2vec.model")

In [7]:
# Sanity check: list the words closest to "monster" in the embedding space.
wv_model.wv.most_similar("monster")

[('creature', 0.8760976791381836),
 ('demon', 0.7963001132011414),
 ('beast', 0.7954497337341309),
 ('giant', 0.7572686672210693),
 ('whale', 0.7563628554344177),
 ('alien', 0.7514498233795166),
 ('monstrous', 0.7437711954116821),
 ('werewolf', 0.7436423301696777),
 ('sphere', 0.7419789433479309),
 ('dinosaur', 0.7387843132019043)]

In [8]:
# Analogy check: words closest to "king" + "woman" - "man".
wv_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7539695501327515),
 ('princess', 0.7349125742912292),
 ('empress', 0.6151232719421387),
 ('prince', 0.6134809851646423),
 ('countess', 0.6115014553070068),
 ('emperor', 0.5759602189064026),
 ('goddess', 0.5578703880310059),
 ('caliph', 0.529577374458313),
 ('crown', 0.52640300989151),
 ('count', 0.5248473286628723)]

In [9]:
# Map every vocabulary term to an integer index. Index 0 is reserved for the
# "<UNK>" placeholder, so the word2vec vocabulary starts at index 1.
word2index = {"<UNK>": 0}
word2index.update(
    {term: position for position, term in enumerate(wv_model.wv.index2word, start=1)}
)

# Embedding matrix aligned with word2index: an all-zero row for "<UNK>"
# stacked on top of the trained word2vec vectors.
unk_row = np.zeros((1, wv_model.wv.vectors.shape[1]))
word_vectors = np.vstack([unk_row, wv_model.wv.vectors])

In [10]:
# Report the vocabulary size (including "<UNK>") and the embedding matrix shape.
print("Vocabulary Size: {:d}".format(len(word2index)))
print("Embedding matrix shape: {}".format(str(word_vectors.shape)))

Vocabulary Size: 64131
Embedding matrix shape: (64131, 200)


In [11]:
def index_lookup(x):
    """Return the vocabulary index of token ``x``.

    Tokens that are not keys of ``word2index`` are mapped to the index
    stored under ``"<UNK>"``.
    """
    if x in word2index:
        return word2index[x]
    return word2index["<UNK>"]

In [12]:
# Replace every token by its vocabulary index (unknown tokens map to "<UNK>"),
# keeping the movie index next to each sentence.
X_plots_ind = [([index_lookup(x) for x in sentence], i) for sentence, i in X_plots]
# Sentences have different lengths, so the rows are ragged. Request an object
# array explicitly: without dtype=object recent NumPy versions raise
# ValueError on ragged input (older ones emit a deprecation warning).
# Column 0 holds the index lists, column 1 the movie indices.
X_plots_ind = np.array(X_plots_ind, dtype=object)

Create a generator that yields training batches for the neural network.

In [13]:
def generator(samples, batch_size = 32, n_neg = 5, max_len = 100):
    """Yield endless training batches of (sentence, movie) pairs with labels.

    Every batch contains the positive pairs of one slice of the shuffled
    ``samples`` (label 1) followed by ``n_neg`` negative pairs per positive
    (label 0). A negative pair combines a movie index from the batch with a
    sentence drawn at random from all samples. Inputs and labels are shuffled
    together before being yielded.

    Parameters
    ----------
    samples : np.ndarray
        2-D array; column 0 holds the index-encoded sentences and column 1
        the movie index of each sentence.
    batch_size : int
        Number of positive pairs per batch (the final slice of a pass may be
        shorter).
    n_neg : int
        Number of negative pairs generated per positive pair.
    max_len : int
        Length every sentence is padded/truncated to by ``pad_sequences``.

    Yields
    ------
    tuple
        ``([X_sent, X_movie], y)`` where ``X_sent`` has shape
        ``(n, max_len)``, ``X_movie`` has shape ``(n, 1)`` and ``y`` has shape
        ``(n,)`` with ``n = n_pos * (1 + n_neg)``.
    """
    num_samples = samples.shape[0]

    while True:
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):

            # Positive examples. The last slice of a pass can be shorter than
            # batch_size, so every count below uses the real slice length.
            batch_samples = samples[offset:offset + batch_size]
            n_pos = batch_samples.shape[0]
            num_neg = n_pos * n_neg
            pos_sentences = batch_samples[:, 0]
            pos_movies = batch_samples[:, 1]

            # Negative examples: random sentences paired with the batch's
            # movie indices (each movie repeated n_neg times). Fall back to
            # sampling with replacement when there are not enough rows.
            # NOTE(review): a random sentence can come from the same movie it
            # is paired with and is still labelled 0.
            neg_rows = np.random.choice(num_samples, num_neg,
                                        replace=num_neg > num_samples)
            neg_sentences = samples[neg_rows, 0]
            neg_movies = np.repeat(pos_movies, n_neg)

            X_sent = list(pos_sentences) + list(neg_sentences)
            X_movie = list(pos_movies) + list(neg_movies)
            y_out = np.array([1] * n_pos + [0] * num_neg)

            # Pad zeros
            X_sent = pad_sequences(X_sent, maxlen=max_len)
            X_movie = np.array(X_movie).reshape(-1, 1)

            # Shuffle inputs AND labels with the same permutation; shuffling
            # only the inputs would detach every row from its label.
            X_sent, X_movie, y_out = sk_shuffle(X_sent, X_movie, y_out)

            yield ([X_sent, X_movie], y_out)

In [14]:
# Instantiate the batch generator with its default arguments
# (batch_size=32, n_neg=5, max_len=100).
train_generator = generator(X_plots_ind)

In [15]:
# Pull one batch from the generator to inspect it.
X, y = next(train_generator)

# Shapes of the sentence input, the movie input and the label array.
X[0].shape, X[1].shape, y.shape

((192, 100), (192, 1), (192,))

In [16]:
# Number of token positions per (padded) query sentence.
MAX_LEN = 100

# Query (sentence) embedding: initialised from the word2vec matrix and kept
# frozen during training.
d1, d2 = word_vectors.shape

query_embedding = Embedding(
    input_dim=d1,
    output_dim=d2,
    weights=[word_vectors],
    input_length=MAX_LEN,
    trainable=False,
)

# Movie embedding: one trainable vector per row of df, with the same width
# as the word vectors.
movie_embedding = Embedding(
    input_dim=df.shape[0],
    output_dim=d2,
    input_length=1,
    trainable=True,
)

In [17]:
# Two inputs: a padded sequence of MAX_LEN word indices and a single movie index.
in1 = Input(shape=(MAX_LEN,))
in2 = Input(shape=(1,))

# Query branch: frozen word embeddings -> LSTM with 10 units -> 200-unit ReLU layer.
q = query_embedding(in1)
q = LSTM(10)(q)
q = Dense(200, activation="relu")(q)

# Movie branch: look up the trainable movie embedding and flatten it to a vector.
m = movie_embedding(in2)
m = Flatten()(m)

# Join both branches into one feature vector.
c = concatenate([q, m])

# Classification head: 50-unit ReLU layer followed by a single sigmoid output.
c = Dense(50, activation = "relu")(c)
out = Dense(1, activation="sigmoid")(c)

model = Model([in1, in2], out)

In [18]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 200)     12826200    input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 10)           8440        embedding_1[0][0]                
__________________________________________________________________________________________________
embedding_

In [19]:
BATCH_SIZE = 512
# Keras expects a whole number of steps; round up instead of passing a float.
# NOTE(review): the step count is derived from the number of movies
# (df.shape[0]), not from the number of sentence samples, so one "epoch" is
# not a full pass over X_plots_ind - confirm this is intended.
STEPS_PER_EPOCH = int(np.ceil(df.shape[0] / BATCH_SIZE))
NUM_EPOCHS = 100

# Rebuild the generator so that it really emits BATCH_SIZE positives per step;
# the generator created earlier uses the default batch_size of 32.
train_generator = generator(X_plots_ind, batch_size=BATCH_SIZE, max_len=MAX_LEN)

model.compile(optimizer = "adam", loss = "binary_crossentropy")
model.fit_generator(train_generator, steps_per_epoch=STEPS_PER_EPOCH,
                    epochs = NUM_EPOCHS)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x170a86eb8>

In [20]:
# Save the trained Keras model to ../models/keras_model.hdf5.
model.save("../models/keras_model.hdf5")

In [21]:
# Extract the learned movie embedding matrix (the layer's first weight array)
# and pickle it. The context manager guarantees the file is flushed and
# closed, which a bare open() passed to pickle.dump does not.
movie_vectors = movie_embedding.get_weights()[0]
with open("../models/movie_vectors.pkl", "wb") as f:
    pickle.dump(movie_vectors, f)