In [1]:
import tensorflow as tf
import pandas as pd 
import re
import collections
import os
import numpy as np
import math
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from six.moves import xrange
from tempfile import gettempdir

In [2]:
movies = pd.read_csv("wiki_movie_plots.csv")

Step one: Convert plot to vectorized sequence of words.

Step two: Set up LSTM to generate predictions.

Step three: Interpret predictions.

Ideas:
- Predictions at every word (or sentence, or paragraph), see where they change

In [3]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
genres_to_consider = ["drama", "comedy", "horror", "action", "thriller", "romance", "western"]
movies = movies[movies['Genre'].isin(genres_to_consider)]

In [5]:
movies.groupby('Genre').count().sort_values("Title", ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
drama,5964,5964,5964,5964,5841,5964,5964
comedy,4379,4379,4379,4379,4347,4379,4379
horror,1167,1167,1167,1167,1124,1167,1167
action,1098,1098,1098,1098,1087,1098,1098
thriller,966,966,966,966,955,966,966
romance,923,923,923,923,918,923,923
western,865,865,865,865,864,865,865


In [6]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [7]:
movies = movies.sample(frac=1) # Shuffles the data 

In [8]:
N_train = 600
N_test = 200

train = None
test = None

train = movies[movies['Genre'] == 'drama'][:N_train]
test = movies[movies['Genre'] == 'drama'][N_train:]

for genre in genres_to_consider[1:]:
    tr = movies[movies['Genre'] == genre][:N_train]
    te = movies[movies['Genre'] == genre][N_train:]
    train = pd.concat([train, tr])
    test = pd.concat([test, te])
    

In [9]:
train.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
16447,2013,Big Sur,American,Michael Polish,"Josh Lucas, Jean-Marc Barr, Radha Mitchell",drama,https://en.wikipedia.org/wiki/Big_Sur_(film),"Jack Kerouac, coming off the recent success of..."
17591,1989,Bangkok Hilton,Australian,Ken Cameron,"Nicole Kidman, Denholm Elliott, Hugo Weaving, ...",drama,https://en.wikipedia.org/wiki/Bangkok_Hilton,Bangkok Hilton begins as Hal Stanton (Denholm ...
6106,1954,Hell and High Water,American,Samuel Fuller,"Richard Widmark, Bella Darvi",drama,https://en.wikipedia.org/wiki/Hell_and_High_Wa...,"In 1953, renowned French scientist Professor M..."
299,1919,True Heart Susie,American,D.W. Griffith,"Lillian Gish, Bobby Harron",drama,https://en.wikipedia.org/wiki/True_Heart_Susie,"As described in a film magazine,[2] ""True Hear..."
32066,2000,Kalisundam Raa,Telugu,Udayasankar,"Venkatesh, Simran",drama,https://en.wikipedia.org/wiki/Kalisundam_Raa,Raghaviah (K. Vishwanath) and Ram Mohan Rao (R...


### Word Embeddings Using Word2Vec on Wikipedia Corpus

In [10]:
d = {
    "(" : " ( ",
    ")" : " ) ",
    "-" : " - ",
    "," : " , ",
    "\n" : "",
    "\r" : "",
    "\"" : " \" ",
    "'" : " ' ",
    "." : " . ",
    ";" : " ; ",
    ":" : " : ",
    "ENDOFARTICLE": ""
}

### Function to replace various characters in a single pass over text

In [11]:
def multiple_replace(d, text):
    
    regex = re.compile("(%s)" % "|".join(map(re.escape, d.keys())))
    
    return regex.sub(lambda x: d[x.string[x.start():x.end()]], text) 

## Parses (k) available wikipedia files 

In [12]:
filename = "wiki"
words = []
k = 100

files_in_directory = os.listdir(filename)
print("Files in directory: " + str(len(files_in_directory)))
print("Files being used: " + str(k))

for file in files_in_directory[:k]:
    f = open(filename + "/" + file, 'r', encoding = "ISO-8859-1")
    f = f.read()
    f = multiple_replace(d, f)
    f = re.sub("<doc.{20,150}>", "", f) # Gets rid of intro 
    f = re.sub("</doc>", "", f) # Gets rid of end
    f = re.sub("\[[0-9]+\]", "", f) # Gets rid of reference pointers 
    f = re.sub(" [A-Z]{1}[A-Z]+ ", " ", f)
    all_words = f.split(" ")
    for word in all_words:
        words.append(word.lower())


Files in directory: 164
Files being used: 100


## Function to generate vocabulary instances 

In [13]:
def build_batch(words, n_words):
    word_count = [["UNK", -1]]
    word_count.extend(collections.Counter(words).most_common(n_words - 1))
    
    d = {}
    for w, _ in word_count:
        d[w] = len(d)
        
    data = []
    num_unks = 0
    for w in words:
        index = d.get(w, 0)
        if index == 0:
            num_unks += 1
        data.append(index)
            
    word_count[0][1] = num_unks
    
    reversed_dictionary = dict(zip(d.values(), d.keys()))
    
    return data, word_count, d, reversed_dictionary

In [14]:
n_words = 50000 # Subject to change 
data, word_count, vocab_dictionary, reversed_dictionary = build_batch(words, n_words)

In [None]:
word_index = len(words)

## Generates a batch of wikipedia data to be used for the Word2Vec model

In [None]:
data_index = 0
def generate_batch(batch_size, data, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
            
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

## Parameters for the Word2Vec Model

In [None]:
batch_size = 256
embedding_size = 128
skip_window = 1
num_skips = 2
num_sampled = 64

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [None]:
graph = tf.Graph()

with graph.as_default():
    
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
            tf.random_uniform([n_words, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    with tf.name_scope("weights"):
        
        nce_weights = tf.Variable(tf.truncated_normal([n_words, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        
    with tf.name_scope('biases'):
        
        nce_biases = tf.Variable(tf.zeros([n_words]))
        
    with tf.name_scope('loss'):
      loss = tf.reduce_mean(
          tf.nn.nce_loss(
              weights=nce_weights,
              biases=nce_biases,
              labels=train_labels,
              inputs=embed,
              num_sampled=num_sampled,
              num_classes=n_words))
    
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()
    

# Run model to obtain embeddings for each word 

In [None]:
n_steps = 250001 # Can increase 
log_dir = "182/LSTMsAndInterpretability"

with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    average_loss = 0
    for step in range(n_steps):
        batch_inputs, batch_labels = generate_batch(batch_size,
                                                  data,
                                                  num_skips,
                                                  skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        run_metadata = tf.RunMetadata()
    
        _, summary, loss_val = session.run([optimizer, merged, loss],
                                         feed_dict=feed_dict,
                                         run_metadata=run_metadata)
    
        average_loss += loss_val
    
        if step % 5000 == 0:
            if step > 0:
                average_loss /= 5000
            # The average loss is an estimate of the loss over the last 5000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            
    final_embeddings = normalized_embeddings.eval()
        
    with open(log_dir + '/metadata.tsv', 'w') as f:
        for i in range(n_words):
            f.write(reversed_dictionary[i] + '\n')
        
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

## Computes cosine similarity between two vectors

In [None]:
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

## Function finds the closest word embeddings to a given word and returns their indices

In [None]:
def find_closest(embeddings, word, index_to_word, word_to_index, n_words, count):
    assert word in word_to_index, 'Unknown word'
    
    print("Word occurs: " + str(count[word_to_index[word]][1]) + " times.\n")
    
    word_embedding = embeddings[word_to_index.get(word)]
    
    distances = np.sum((embeddings - word_embedding) ** 2, axis=1)
    
    indices = np.argsort(distances)[:n_words]
    for i in indices:
        print(index_to_word.get(i, "UNK"))
    
    return indices

In [None]:
find_closest(final_embeddings, "jesus", reversed_dictionary, vocab_dictionary, 20, word_count)

## Plots TSNE for most common words

In [None]:
def plot_with_labels(low_dim_embs, labels, filename):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy = (x, y), xytext = (5, 2), textcoords = 'offset points', ha='right', va='bottom')
        
try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(
        perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reversed_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(),
                                                        'tsne.png'))

except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(ex)

In [None]:
# Set up LSTM

In [None]:
train.head()

## Function to encode the genre to be used for LSTM classification

In [None]:
def encodeLabel(df, col, label_col="Label"):
    df[col] = df[col].astype('category')
    df[label_col] = df[col].cat.codes
    return df

In [None]:
train = encodeLabel(train, "Genre")

In [None]:
train_plot_words = train['Plot'].tolist()
train_labels = train['Label'].tolist()
train_labels = np.array(train_labels)

## Maps each word in a given plot to its embedding 

In [None]:
def get_embeddings(plot_list, final_embeddings):
    plot_embeddings = []
    
    for plot in plot_list:
    
        embeddings = []
    
        p = multiple_replace(d, plot)
    
        all_words = p.split(" ")
    
        for word in all_words:
        
            index = vocab_dictionary.get(word, 0)
        
            embedding = final_embeddings[index]
            embeddings.append(embedding)
            
        plot_embeddings.append(embeddings)
        
    return plot_embeddings

In [None]:
train_plot_embeddings = get_embeddings(train_plot_words, final_embeddings)
train_plot_embeddings = np.array(train_plot_embeddings)

## Generates a batch of movies to be used for LSTM model

In [None]:
def generate_batch_movies(batch_size, plots, labels):
    # Assert ndarrays 
    
    total = len(plots)
    
    indices = np.random.choice(total, batch_size, replace=False)
    
    batch_plots = np.take(plots, indices)
    batch_labels = np.take(labels, indices)
    
    return batch_plots, batch_labels

In [None]:
batch_inputs, batch_labels = generate_batch_movies(20, train_plot_embeddings, train_labels)

In [None]:
class LSTMModel():
    
    def __init__(self, rnn_size, output_size, learning_rate=1e-4):

        self.inputs = tf.placeholder(tf.float32, shape=[None, None, embedding_size])
        self.labels = tf.placeholder(tf.int32, shape=[None, 1])
    
        lm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
    
        outputs, states = tf.nn.dynamic_rnn(lm_cell, self.inputs, dtype=tf.float32)
    
        self.output_logits = tf.layers.dense(outputs, output_size)
    
        self.loss = tf.losses.sparse_softmax_cross_entropy(self.labels, self.output_logits)
        
        optimizer = tf.train.AdamOptimizer(learning_rate)
        
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = optimizer.minimize(self.loss)
        self.saver = tf.train.Saver()


In [76]:
tf.reset_default_graph() # This is so that when you debug, you reset the graph each time you run this, in essence, cleaning the board
model = LSTMModel(256, 7, 1e-3)

Instructions for updating:
Use tf.cast instead.


In [None]:
# example
batch_size = 20
for i in range(10):
    
    batch_inputs, batch_labels = generate_batch_movies(batch_size, train_plot_embeddings, train_labels)
    
    feed_dict = {model.inputs: batch_inputs, model.labels: batch_labels}
    
    loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
    
    print(loss)

In [15]:
import json

In [16]:
with open('vocab_dictionary.json', 'w') as fp:
    json.dump(vocab_dictionary, fp)
with open('reversed_dictionary.json', 'w') as fp:
    json.dump(reversed_dictionary, fp)
with open('wordcount.json', 'w') as fp:
    json.dump(word_count, fp)

In [23]:
dict(word_count)

{'UNK': 15430244,
 '': 59041086,
 'the': 19802872,
 '.': 19606110,
 ',': 18339344,
 'of': 10250348,
 'and': 8018020,
 'in': 7361301,
 ';': 7097066,
 'a': 6166615,
 '-': 5854060,
 'to': 5755138,
 ')': 4616133,
 '(': 4612980,
 '"': 3735775,
 'is': 3350666,
 'was': 2942168,
 "'": 2693478,
 'for': 2378036,
 ':': 2270358,
 'as': 2242207,
 's': 2194936,
 'on': 2022034,
 'by': 2009159,
 'with': 1920241,
 'he': 1628851,
 'that': 1622900,
 'from': 1526793,
 'it': 1426551,
 'at': 1380414,
 'his': 1378255,
 'an': 1101848,
 'are': 1027211,
 'were': 887042,
 'this': 886377,
 'or': 875241,
 'which': 852307,
 'be': 851776,
 'also': 845684,
 'has': 742065,
 'one': 657517,
 'had': 614012,
 'not': 600311,
 'first': 589432,
 'but': 583710,
 'new': 579807,
 'their': 575491,
 'who': 551742,
 'they': 548684,
 'have': 535416,
 'its': 507148,
 'after': 487937,
 'other': 471892,
 'all': 464436,
 'her': 459248,
 '1': 454320,
 'two': 443031,
 'there': 430888,
 'she': 427902,
 'when': 427703,
 'been': 415549,
 '2

In [21]:
word_count[0] = (word_count[0][0], word_count[0][1])

In [24]:
with open('wordcount.json', 'w') as fp:
    json.dump(word_count, fp)

In [26]:
with open('dataloc.txt', 'w') as f:
    for item in data:
        f.write("%s\n" % item)

In [25]:
data

[711,
 27651,
 1,
 13,
 293,
 216,
 4,
 1,
 9441,
 1,
 10,
 1,
 261,
 501,
 4,
 1,
 4903,
 12,
 1,
 16,
 31,
 1,
 246,
 3025,
 3,
 2,
 70,
 332,
 5,
 137,
 27651,
 4,
 1,
 4608,
 5,
 5529,
 6,
 2643,
 5,
 4210,
 4,
 1,
 25,
 16,
 2747,
 29,
 14116,
 6,
 2033,
 264,
 4,
 1,
 1528,
 4,
 1,
 5146,
 7,
 8372,
 3,
 1,
 144,
 11,
 2,
 214,
 4,
 1,
 25,
 16053,
 18,
 69,
 75,
 22,
 2,
 1528,
 2555,
 8,
 1,
 44,
 30,
 13514,
 33,
 2075,
 4,
 1,
 6,
 59,
 4,
 1,
 22,
 30,
 357,
 17,
 21,
 268,
 7,
 5116,
 4,
 1,
 25,
 5365,
 9,
 242,
 1862,
 7,
 11159,
 4,
 1,
 25,
 908,
 436,
 74,
 8476,
 11,
 1672,
 813,
 3,
 1,
 25,
 41,
 366,
 2047,
 24,
 2,
 6855,
 197,
 5,
 3233,
 6,
 5342,
 47,
 433,
 2,
 9104,
 203,
 4,
 1,
 9,
 2109,
 11,
 36,
 25,
 12911,
 30,
 4565,
 11,
 2,
 119,
 10,
 2239,
 6,
 2384,
 607,
 5,
 3909,
 5,
 7353,
 8,
 1,
 44,
 325,
 85,
 114,
 7,
 1354,
 436,
 3,
 1,
 1,
 25,
 16,
 4,
 1,
 113,
 4,
 1,
 31,
 997,
 6307,
 5,
 78,
 346,
 3059,
 10,
 1,
 10,
 998,
 5,
 26,
 36,
 1275,
