In [161]:
import pandas as pd
import numpy as np
from random import shuffle
import re
from collections import defaultdict
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle as sk_shuffle
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional

In [3]:
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

**TODO:**
1. Create tf-idf vectorization of documents
2. Create generator
    * Sample N documents
    * Sample keywords (1 to n_max) from each document (selection prob. based on tf-idf)
    * Sample k negative documents for each set of search terms
    
3. Model architecture
    * Embedding layer for document
    * Embedding layer for keywords
    * Concatenate embeddings
    * Output layer of 1 unit w/ binary crossentropy

In [4]:
def tokenize_text(x):
    """Split a document into lower-cased sentences of word tokens.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    list of list of str
        One inner list per sentence found by ``sent_tokenize``. Each inner
        list holds the Treebank word tokens of that sentence whose first
        character is an ASCII letter. A sentence without any such token is
        kept as an empty list.
    """
    # Build the word tokenizer once per call instead of once per sentence.
    word_tokenizer = TreebankWordTokenizer()
    # ``match`` anchors at the start of the token, so only the first character
    # is tested: "st." is kept, "42" and "," are dropped.
    starts_with_letter = re.compile("[A-Za-z]").match
    return [
        [w for w in word_tokenizer.tokenize(sentence) if starts_with_letter(w)]
        for sentence in sent_tokenize(x.lower())
    ]

class Tokenizer(object):
    """Callable that turns a document into lower-cased, letters-only tokens."""

    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        """Tokenize ``doc`` and return its cleaned tokens as a list of str.

        A token is kept when it contains at least one ASCII letter; the value
        returned for it is the lower-cased token with every character outside
        ``A-Za-z`` removed.
        """
        strip_non_letters = re.compile("[^A-Za-z]").sub
        cleaned = []
        for token in self.tokenizer.tokenize(doc):
            # The keep/skip decision is made on the token as written ...
            if strip_non_letters("", token) == '':
                continue
            # ... while the emitted value is derived from its lower-cased form.
            cleaned.append(strip_non_letters("", token.lower()))
        return cleaned

In [289]:
# Split every movie plot into sentences and pair each sentence's token list
# with the position of the plot it came from: (list of tokens, movie index).
X_plots = [
    (sentence_tokens, movie_idx)
    for movie_idx, plot in enumerate(df["Plot"])
    for sentence_tokens in tokenize_text(plot)
]

In [521]:
# Keep only the token lists (one per sentence) and drop the movie index.
# A generator expression is used rather than unpacking into `_`, so IPython's
# `_` (last output) is left alone and an empty X_plots yields an empty tuple
# instead of an unpacking error.
x = tuple(tokens for tokens, _movie_idx in X_plots)

In [303]:
# Train a word2vec model with 200-dimensional vectors on the tokenized sentences.
# NOTE(review): `size` is the gensim < 4.0 keyword (4.x renamed it to
# `vector_size`) — confirm the installed gensim version before re-running.
wv_model = word2vec.Word2Vec(x, size=200)

In [304]:
wv_model.wv.most_similar("monster")

[('creature', 0.8925873637199402),
 ('beast', 0.7964785099029541),
 ('demon', 0.7758796215057373),
 ('monstrous', 0.7614763975143433),
 ('giant', 0.7488950490951538),
 ('werewolf', 0.7399545907974243),
 ('alien', 0.7360496520996094),
 ('whale', 0.7253955602645874),
 ('sphere', 0.7226647734642029),
 ('vampire', 0.7220095992088318)]

In [305]:
wv_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7663202285766602),
 ('princess', 0.7231383323669434),
 ('empress', 0.6324298977851868),
 ('countess', 0.5969557166099548),
 ('prince', 0.5929268598556519),
 ('emperor', 0.5760850310325623),
 ('goddess', 0.5467470288276672),
 ('consort', 0.5342263579368591),
 ('count', 0.5263806581497192),
 ('crown', 0.5165952444076538)]

In [306]:
# Map every vocabulary term to an integer index. Index 0 is reserved for the
# "<UNK>" placeholder, so word2vec's own ordering is shifted up by one.
word2index = {"<UNK>": 0}
word2index.update(
    (term, position)
    for position, term in enumerate(wv_model.wv.index2word, start=1)
)

# Row 0 of the embedding matrix is an all-zero vector for "<UNK>"; the trained
# word vectors follow in the same order as the indices above.
unk_row = np.zeros((1, wv_model.wv.vectors.shape[1]))
embedding = np.vstack([unk_row, wv_model.wv.vectors])

In [307]:
print("Vocabulary Size: %i" %len(word2index))
print("Embedding matrix shape: %s" %str(embedding.shape))

Vocabulary Size: 64131
Embedding matrix shape: (64131, 200)


In [308]:
def index_lookup(x):
    """Return the ``word2index`` entry for ``x``, or the ``"<UNK>"`` entry if ``x`` is absent."""
    if x in word2index:
        return word2index[x]
    return word2index["<UNK>"]

In [510]:
# Replace every token by its vocabulary index, keeping the movie index next to
# it. Sentences differ in length, so the rows are ragged: the (n, 2) object
# array is allocated explicitly and filled cell by cell, because NumPy >= 1.24
# raises ValueError when asked to infer an array from ragged nested sequences.
X_plots_ind = np.empty((len(X_plots), 2), dtype=object)
for row, (sentence, movie_idx) in enumerate(X_plots):
    X_plots_ind[row, 0] = [index_lookup(w) for w in sentence]
    X_plots_ind[row, 1] = movie_idx

In [596]:
def generator(samples, batch_size = 32, n_neg = 5, max_len = 100):
    """Yield shuffled batches of (sentence, movie, label) triples forever.

    Every sentence in the current slice is paired with its own movie index and
    labelled 1. For each of those positives, ``n_neg`` sentences are drawn at
    random from all samples, paired with the positive's movie index and
    labelled 0.

    Parameters
    ----------
    samples : np.ndarray
        2-D array read as ``samples[:, 0]`` (sequence of word indices) and
        ``samples[:, 1]`` (movie index).
    batch_size : int, default 32
        Number of positive rows per batch; the last batch of a pass may hold
        fewer.
    n_neg : int, default 5
        Negative rows drawn per positive row.
    max_len : int, default 100
        Length every sentence is padded/truncated to by ``pad_sequences``.

    Yields
    ------
    X_sent, X_movie, y
        Three aligned arrays, jointly shuffled by ``sk_shuffle``: sentences of
        shape ``(rows, max_len, 1)``, movie indices of shape ``(rows,)`` and
        0/1 labels of shape ``(rows,)``, where
        ``rows == positives * (1 + n_neg)``.

    Raises
    ------
    ValueError
        If ``samples`` has no rows (otherwise the loop would spin forever
        without ever yielding).
    """
    num_samples = samples.shape[0]
    if num_samples == 0:
        raise ValueError("generator() needs at least one sample")

    ind = np.arange(num_samples)

    while True:
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]
            # Size everything from the slice actually obtained: the final
            # slice of a pass is shorter than batch_size whenever num_samples
            # is not a multiple of it.
            n_pos = batch_samples.shape[0]
            n_negatives = n_pos * n_neg

            movie_indices = batch_samples[:, 1]

            # Draw without replacement when the population allows it, and fall
            # back to replacement only when more negatives are requested than
            # there are samples.
            # NOTE(review): a drawn sentence can belong to the very movie it
            # is paired with and would still be labelled 0 — confirm this
            # label noise is acceptable.
            keep_indx = np.random.choice(
                ind, n_negatives, replace=n_negatives > num_samples
            )
            neg_samples = samples[keep_indx]

            X_sent = list(batch_samples[:, 0]) + list(neg_samples[:, 0])
            # Each positive's movie index is repeated n_neg times so that the
            # negatives line up as [m0]*n_neg, [m1]*n_neg, ...
            X_movie = list(movie_indices) + list(np.repeat(movie_indices, n_neg))
            y_out = [1] * n_pos + [0] * n_negatives

            # Pad zeros
            X_sent = pad_sequences(X_sent, maxlen=max_len)

            yield sk_shuffle(X_sent.reshape(-1, max_len, 1), np.array(X_movie), np.array(y_out))

In [597]:
t = generator(X_plots_ind)

In [598]:
X_sent, X_movies, y = next(t)

X_sent.shape, X_movies.shape, y.shape

((192, 100, 1), (192,), (192,))

tfidf = TfidfVectorizer(tokenizer=Tokenizer(), min_df = 10)

X_tfidf = tfidf.fit_transform(df["Plot"])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever accessor this installation provides.
if hasattr(tfidf, "get_feature_names_out"):
    f = np.array(tfidf.get_feature_names_out())
else:
    f = np.array(tfidf.get_feature_names())

def get_movie_terms(x, top_n=50):
    """Print a movie's title, director, genre and highest tf-idf terms.

    ``x`` is looked up in the module-level ``df`` by exact ``Title`` match and
    the vocabulary ``f`` is ranked by that movie's row of ``X_tfidf``. When
    several rows share the title, only the first one is reported.

    Parameters
    ----------
    x : str
        Movie title to look up.
    top_n : int, default 50
        Maximum number of terms to print. Fewer are printed when the plot
        contains fewer distinct vocabulary terms.

    Returns
    -------
    None
        All results are printed. "Movie Title Not Found" is printed when no
        row has the requested title.
    """
    # Work with row *positions*, so df.iloc and X_tfidf stay aligned even when
    # df does not carry the default 0..n-1 index.
    matches = np.flatnonzero((df["Title"] == x).values)
    if matches.size == 0:
        print("Movie Title Not Found")
        return

    i = matches[0]
    movie = df.iloc[i]
    print(movie["Title"])
    print("Director: %s" %movie["Director"])
    print("Genre: %s" %movie["Genre"])
    print("\nTop Terms")

    weights = np.asarray(X_tfidf[i].todense()).ravel()
    ranked = weights.argsort()[::-1]
    # A zero weight means the term does not occur in this plot; such terms
    # must never be listed as "top terms", whatever top_n asks for.
    n_present = int(np.count_nonzero(weights))
    print(f[ranked[:min(top_n, n_present)]])
    print("\n")

get_movie_terms("Get Out", 100)

In [32]:
df[10:42]["Plot"]

10    On May 26, 1987, Jenna Rink, a gawky girl, yea...
11    The film opens with a clown (Buscemi) whose wi...
12    Ex-convicts Emil Slovak (Karel Roden) and Oleg...
13    The movie starts with Dylan Branson (Michiel H...
14    Real estate agent Frank Mollard won't admit it...
15    The film starts with Diana Watts (Lindsay Burd...
16    In the late 22nd century, rising sea levels fr...
17    Senior college student Katie Burke (Holmes) is...
18    The film follows Curtis Clemins (Clint Palmer)...
19    When a widowed reporter is informed by an FBI ...
20    The film begins in an unspecified year of the ...
21    Adam Raki (Dancy) is a young man with Asperger...
22    Master thief Max Burdett (Pierce Brosnan) and ...
23    After a reckless lie sets off a catastrophic c...
24    Micha and Charlotte are a couple who have rece...
25    Angst tells the story of a group of horror fil...
26    When high-flying 27-year-old[2] Melbourne base...
27    While escaping a political prison on a min

In [179]:
s = tokenize_text(df.iloc[0]["Plot"])

In [180]:
np.random.choice(s)

['neal',
 'asks',
 'one',
 'of',
 'the',
 'policemen',
 'who',
 'he',
 'was',
 'and',
 'replies',
 'that',
 'he',
 'was',
 'a',
 'st.',
 'louis',
 'law',
 'school',
 'student',
 'who',
 'went',
 'insane',
 'and',
 'murdered',
 'his',
 'own',
 'father']

In [61]:
list(t)

ValueError: a must be 1-dimensional

In [115]:
def generator(samples, batch_size = 32, angle_offset = 0.2):
    """Endlessly yield shuffled (images, angles) batches built from ``samples``.

    Each sample is indexed as ``[0]``, ``[1]``, ``[2]`` for the center, left
    and right image paths and ``[3]`` for the angle. Only the part of a path
    after the last backslash is used, and it is read from ``./IMG/``. Every
    sample contributes six entries, in this order: center, center flipped,
    left, left flipped, right, right flipped. The left angle is the recorded
    angle plus ``angle_offset``, the right angle is the recorded angle minus
    ``angle_offset``, and a flipped image gets the negated angle of its source.

    ``samples`` is shuffled in place at the start of every pass.

    NOTE(review): ``cv2`` is not among the imports visible at the top of this
    notebook, and this definition reuses the name ``generator`` — confirm the
    cell is still needed.
    """
    num_samples = len(samples)
    while 1:
        shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            images = []
            angles = []
            for batch_sample in samples[offset:offset + batch_size]:
                recorded_angle = float(batch_sample[3])
                # (image path, angle) per camera: center, left, right.
                per_camera = (
                    (batch_sample[0], recorded_angle),
                    (batch_sample[1], recorded_angle + angle_offset),
                    (batch_sample[2], recorded_angle - angle_offset),
                )
                for path, angle in per_camera:
                    name = "./IMG/" + path.split("\\")[-1]
                    image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                    images.append(image)
                    angles.append(angle)

                    # Mirrored copy with the opposite angle
                    images.append(np.fliplr(image))
                    angles.append(-angle)

            X_train = np.array(images)
            y_train = np.array(angles)
            yield sk_shuffle(X_train, y_train)

Get Out
Jordan Peele


array(['chris', 'rose', 'georgina', 'missy', 'logan', 'rod', 'walter',
       'the', 'jeremy', 'black', 'dean', 'white', 'to', 'hudson', 'and',
       'a', 'he', 'sunken', 'his', 's', 'awakens', 'hypnosis', 'deer',
       'estate', 'but', 'flash', 'roman', 'phone', 'possessed', 'him',
       'in', 'chair', 'of', 'photo', 'house', 'jim', 'family', 'with',
       'goes', 'hypnotherapy', 'antlers', 'behavior', 'neurosurgeon',
       'strange', 'car', 'into', 'people', 'gettogether', 'contradicting',
       'unplugs'], dtype='<U20')

In [70]:
l = [1,2,3]

In [79]:
l[::-1]

[3, 2, 1]

In [37]:
d = tokenize_text(x)

In [41]:
d[1]

['she',
 'has',
 'her',
 'mind',
 'set',
 'on',
 'finding',
 'a',
 'tall',
 'strong',
 'man',
 'to',
 'marry',
 'one',
 'that',
 'can',
 'wear',
 'a',
 'trojan',
 'shirt',
 'with',
 'a',
 'neck',
 'size']

In [56]:
# Tokenized sentences of the first ten plots only (one list of sentences per plot).
X_plots = [tokenize_text(plot) for plot in df["Plot"].iloc[:10]]
    
    

In [72]:
d=np.array(X_plots)

In [73]:
dir(d)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_e

In [77]:
d.ravel().tolist().shape

AttributeError: 'list' object has no attribute 'shape'