In [1]:
import pandas as pd
import numpy as np
from random import shuffle
import re
from collections import defaultdict
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle as sk_shuffle
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the movie-plot CSV; later cells read its "Plot" column.
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

**TODO:**
1. Create tf-idf vectorization of documents
2. Create generator
    * Sample N documents
    * Sample keywords (1 to n_max) from each document (selection prob. based on tf-idf)
    * Sample k negative documents for each set of search terms
    
3. Model architecture
    * Embedding layer for document (trained during model training)
    * Embedding layer for keywords (use pre-trained word2vec vectors)
    * Concatenate embeddings
    * Output layer of 1 unit w/ binary crossentropy
    


**Model Architecture**

<img src="img/model_architecture.jpg" width=400 align="left">

In [3]:
def tokenize_text(x):
    """Split a text into lower-cased sentences of word tokens.

    Parameters
    ----------
    x : str
        Raw text, e.g. one movie plot.

    Returns
    -------
    list of list of str
        One inner list per sentence, containing only the tokens whose first
        character is an ASCII letter (numeric and punctuation tokens are dropped).
    """
    # Build the tokenizer once per call instead of once per sentence.
    word_tokenizer = TreebankWordTokenizer()
    # `match` anchors at the start of the token, so only the first character is tested.
    starts_with_letter = re.compile("[A-Za-z]").match

    sentences = sent_tokenize(x.lower())
    return [
        [w for w in word_tokenizer.tokenize(s) if starts_with_letter(w) is not None]
        for s in sentences
    ]

In [4]:
# Flatten all plots into (sentence_tokens, movie_index) pairs: one entry per
# sentence, tagged with the row position of the movie it was taken from.
X_plots = [
    (sentence_tokens, movie_idx)
    for movie_idx, plot in enumerate(df["Plot"])
    for sentence_tokens in tokenize_text(plot)
]

In [5]:
# Keep only the token lists (dropping the movie index) as the word2vec corpus.
# A comprehension replaces `x, _ = zip(*X_plots)` so that the notebook does not
# rebind `_`, which IPython uses for the most recent cell output, and so that an
# unused tuple of movie indices is not materialised.
x = tuple(sentence_tokens for sentence_tokens, _movie_idx in X_plots)

In [6]:
# Train a word2vec model on the tokenized sentences; `size=200` sets the
# vector dimensionality (the embedding matrix built below has 200 columns).
wv_model = word2vec.Word2Vec(x, size=200)
# Persist the trained model to disk.
wv_model.save("../models/word2vec.model")

In [7]:
# Sanity check: nearest neighbours of "monster" among the learned word vectors.
wv_model.wv.most_similar("monster")

[('creature', 0.8966418504714966),
 ('beast', 0.8422876596450806),
 ('demon', 0.7798564434051514),
 ('giant', 0.7623004913330078),
 ('mummy', 0.7556557655334473),
 ('werewolf', 0.7538514733314514),
 ('entity', 0.7503894567489624),
 ('sphere', 0.7459837794303894),
 ('alien', 0.7420704364776611),
 ('wolf', 0.7223528623580933)]

In [8]:
# Analogy check: king - man + woman is expected to rank "queen" near the top.
wv_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7693840861320496),
 ('princess', 0.7362165451049805),
 ('empress', 0.6411824226379395),
 ('prince', 0.6059414148330688),
 ('countess', 0.591460645198822),
 ('goddess', 0.5856000781059265),
 ('emperor', 0.5829663872718811),
 ('count', 0.5658907890319824),
 ('crown', 0.5383313298225403),
 ('caliph', 0.5350780487060547)]

In [9]:
# Map every vocabulary term to its row in the embedding matrix; row 0 is
# reserved for the "<UNK>" placeholder, so real terms start at 1.
word2index = {"<UNK>": 0}
word2index.update(
    (term, position)
    for position, term in enumerate(wv_model.wv.index2word, start=1)
)

# Put an all-zero "<UNK>" vector on top of the trained word vectors so that
# row numbers line up with the values stored in `word2index`.
unk_vector = np.zeros((1, wv_model.wv.vectors.shape[1]))
embedding = np.vstack([unk_vector, wv_model.wv.vectors])

In [10]:
# The two numbers should agree: one embedding row per entry in `word2index`.
print("Vocabulary Size: %i" % len(word2index))
print("Embedding matrix shape: %s" % (embedding.shape,))

Vocabulary Size: 64131
Embedding matrix shape: (64131, 200)


In [11]:
def index_lookup(x):
    """Return the vocabulary index of token `x`, or the "<UNK>" index if `x` is unknown."""
    return word2index.get(x, word2index["<UNK>"])

In [12]:
# Convert every sentence to vocabulary indices, keeping its movie index alongside.
# The rows are ragged (sentences differ in length), so the (n_sentences, 2)
# object array is allocated explicitly: calling `np.array` on the ragged list is
# deprecated in NumPy 1.19+ and raises a ValueError from NumPy 1.24 on.
X_plots_ind = np.empty((len(X_plots), 2), dtype=object)
for row, (sentence, movie_idx) in enumerate(X_plots):
    X_plots_ind[row, 0] = [index_lookup(token) for token in sentence]
    X_plots_ind[row, 1] = movie_idx

Create a generator for training the neural network.

In [13]:
def generator(samples, batch_size=32, n_neg=5, max_len=100):
    """Yield shuffled (sentences, movie indices, labels) training batches forever.

    Each batch pairs up to `batch_size` sentences with their own movie index
    (label 1) and, for every such positive, `n_neg` randomly drawn sentences
    paired with that same movie index (label 0).

    Parameters
    ----------
    samples : np.ndarray
        2-D array whose column 0 holds a sentence (sequence of word indices)
        and whose column 1 holds the index of the movie the sentence is from.
    batch_size : int
        Number of positive examples per batch; the last batch of an epoch
        may hold fewer.
    n_neg : int
        Number of negative examples generated per positive example.
    max_len : int
        Length every sentence is zero-padded or truncated to.

    Yields
    ------
    list of np.ndarray
        `[X_sent, X_movie, y]`, jointly shuffled, with shapes
        `(n, max_len, 1)`, `(n,)` and `(n,)`, where
        `n = n_positives * (1 + n_neg)`.

    Raises
    ------
    ValueError
        On first iteration, if `samples` is empty.
    """
    num_samples = samples.shape[0]
    if num_samples == 0:
        # Without this guard the `while True` loop would spin forever without yielding.
        raise ValueError("generator() needs at least one sample")

    # Upper bound on re-draws of negatives that collide with their paired movie.
    max_resample_rounds = 10

    while True:
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            # Positive examples: each sentence with the movie it belongs to.
            batch_samples = samples[offset:offset + batch_size]
            pos_sentences = batch_samples[:, 0]
            pos_movies = batch_samples[:, 1]
            # The last batch of an epoch can be shorter than `batch_size`, so all
            # counts below are derived from the actual number of positives.
            n_pos = len(batch_samples)

            # Negative examples: random sentences paired with the positives' movies.
            n_negatives = n_pos * n_neg
            paired_movies = np.repeat(pos_movies, n_neg)
            # Draw distinct rows when possible; fall back to sampling with
            # replacement when more negatives are needed than samples exist.
            neg_rows = np.random.choice(
                num_samples, n_negatives, replace=n_negatives > num_samples
            )
            # A randomly drawn sentence may come from the very movie it is paired
            # with, which would mislabel a true match as a negative. Re-draw such
            # rows a bounded number of times (re-drawn rows may repeat other rows);
            # any still clashing afterwards are kept as they are.
            for _attempt in range(max_resample_rounds):
                clashes = [
                    k for k, row in enumerate(neg_rows)
                    if samples[row, 1] == paired_movies[k]
                ]
                if not clashes:
                    break
                neg_rows[clashes] = np.random.randint(num_samples, size=len(clashes))
            neg_sentences = samples[neg_rows, 0]

            X_sent = list(pos_sentences) + list(neg_sentences)
            X_movie = list(pos_movies) + list(paired_movies)
            y_out = [1] * n_pos + [0] * n_negatives

            # Zero-pad (or truncate) every sentence to exactly `max_len` indices.
            X_sent = pad_sequences(X_sent, maxlen=max_len)

            yield sk_shuffle(X_sent.reshape(-1, max_len, 1), np.array(X_movie), np.array(y_out))

In [14]:
# Instantiate the batch generator with its default batch_size, n_neg and max_len.
t = generator(X_plots_ind)

In [15]:
# Pull one batch and inspect its shapes: 32 positives + 32 * 5 negatives = 192 rows.
X_sent, X_movies, y = next(t)

X_sent.shape, X_movies.shape, y.shape

((192, 100, 1), (192,), (192,))