In [195]:
import tensorflow as tf
import pandas as pd 
import re
import collections
import os
import numpy as np
import math
import random

In [7]:
movies = pd.read_csv("wiki_movie_plots.csv")

Step one: Convert plot to vectorized sequence of words.

Step two: Set up LSTM to generate predictions.

Step three: Interpret predictions.

Ideas:
- Predictions at every word (or sentence, or paragraph), see where they change

In [8]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [9]:
genres_to_consider = ["drama", "comedy", "horror", "action", "thriller", "romance", "western"]
movies = movies[movies['Genre'].isin(genres_to_consider)]

In [10]:
movies.groupby('Genre').count().sort_values("Title", ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
drama,5964,5964,5964,5964,5841,5964,5964
comedy,4379,4379,4379,4379,4347,4379,4379
horror,1167,1167,1167,1167,1124,1167,1167
action,1098,1098,1098,1098,1087,1098,1098
thriller,966,966,966,966,955,966,966
romance,923,923,923,923,918,923,923
western,865,865,865,865,864,865,865


In [11]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [12]:
movies = movies.sample(frac=1) # Shuffles the data 

In [214]:
N_train = 600
N_test = 200

train = None
test = None

train = movies[movies['Genre'] == 'drama'][:N_train]
test = movies[movies['Genre'] == 'drama'][N_train:]

for genre in genres_to_consider[1:]:
    tr = movies[movies['Genre'] == genre][:N_train]
    te = movies[movies['Genre'] == genre][N_train:]
    train = pd.concat([train, tr])
    test = pd.concat([test, te])
    

In [215]:
train.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
15084,2006,Stephanie Daley,American,Hilary Brougher,"Amber Tamblyn, Tilda Swinton, Timothy Hutton",drama,https://en.wikipedia.org/wiki/Stephanie_Daley,Sixteen-year-old Stephanie Daley collapses in ...
17686,2000,"Dish, TheThe Dish",Australian,Rob Sitch,Sam Neill\r\nPatrick Warburton\r\nTom Long\r\n...,drama,https://en.wikipedia.org/wiki/The_Dish,The radio telescope at Parkes (Parkes Observat...
7075,1958,The Missouri Traveler,American,Jerry Hopper,"Brandon deWilde, Lee Marvin",drama,https://en.wikipedia.org/wiki/The_Missouri_Tra...,Brandon deWilde leads a cast lengthy in charac...
25002,1977,Bhumika,Bollywood,Shyam Benegal,"Smita Patil, Naseeruddin Shah, Amrish Puri",drama,https://en.wikipedia.org/wiki/Bhumika_(1977_film),"Bhumika tells the life story of an actress, Us..."
12575,1995,To the Limit,American,Raymond Martino,"Anna Nicole Smith, Michael Nouri",drama,https://en.wikipedia.org/wiki/To_the_Limit_(19...,"Anna Nicole Smith plays Vickie Lynn, an ex-CIA..."


### Word Embeddings Using Word2Vec on Wikipedia Corpus

In [15]:
d = {
    "(" : "( ",
    ")" : " )",
    "-" : " - ",
    "," : " ,",
    "\n" : "",
    "\r" : "",
    "\"" : " \" ",
    "'" : " ' ",
    "." : " . ",
    "ENDOFARTICLE": ""
}

### Function to replace various characters in a single pass over text

In [16]:
def multiple_replace(d, text):
    
    regex = re.compile("(%s)" % "|".join(map(re.escape, d.keys())))
    
    return regex.sub(lambda x: d[x.string[x.start():x.end()]], text) 

In [184]:
filename = "wiki"
words = []

files_in_directory = os.listdir(filename)
print("Files in directory: " + str(len(files_in_directory)))
print("Files being used: " + str(10))

for file in files_in_directory[:10]:
    f = open(filename + "/" + file, 'r', encoding = "ISO-8859-1")
    f = f.read()
    f = multiple_replace(d, f)
    f = re.sub("<doc.{20,150}>", "", f)
    f = re.sub("</doc>", "", f)
    all_words = f.split(" ")
    for word in all_words:
        words.append(word)


Files in directory: 164
Files being used: 10


In [157]:
def build_batch(words, n_words):
    word_count = [["UNK", -1]]
    word_count.extend(collections.Counter(words).most_common(n_words - 1))
    
    d = {}
    for w, _ in word_count:
        d[w] = len(d)
        
    data = []
    num_unks = 0
    for w in words:
        index = d.get(w, 0)
        if index == 0:
            num_unks += 1
        data.append(index)
            
    word_count[0][1] = num_unks
    
    reversed_dictionary = dict(zip(d.values(), d.keys()))
    
    return data, word_count, d, reversed_dictionary

In [186]:
n_words = 100000 # Subject to change 
data, word_count, vocab_dictionary, reversed_dictionary = build_batch(words, n_words)

In [187]:
word_index = len(words)

In [188]:
data_index = 0
def generate_batch(batch_size, data, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
            
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [141]:
# Model to train embeddings

In [189]:
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
num_sampled = 64

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [190]:
graph = tf.Graph()

with graph.as_default():
    
    with tf.name_scope('inputs'):
      train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
      train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
      valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
            tf.random_uniform([n_words, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    with tf.name_scope("weights"):
        
        nce_weights = tf.Variable(tf.truncated_normal([n_words, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        
    with tf.name_scope('biases'):
        
        nce_biases = tf.Variable(tf.zeros([n_words]))
        
    with tf.name_scope('loss'):
      loss = tf.reduce_mean(
          tf.nn.nce_loss(
              weights=nce_weights,
              biases=nce_biases,
              labels=train_labels,
              inputs=embed,
              num_sampled=num_sampled,
              num_classes=n_words))
    
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
      optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()
    

# Run model to obtain embeddings for each word 

In [196]:
n_steps = 100001 # Can increase 
log_dir = "182/LSTMsAndInterpretability"

with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    average_loss = 0
    for step in range(n_steps):
        batch_inputs, batch_labels = generate_batch(batch_size,
                                                  data,
                                                  num_skips,
                                                  skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        run_metadata = tf.RunMetadata()
    
        _, summary, loss_val = session.run([optimizer, merged, loss],
                                         feed_dict=feed_dict,
                                         run_metadata=run_metadata)
    
        average_loss += loss_val
    
        if step % 5000 == 0:
            if step > 0:
                average_loss /= 5000
            # The average loss is an estimate of the loss over the last 5000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            
    final_embeddings = normalized_embeddings.eval()
        
    with open(log_dir + '/metadata.tsv', 'w') as f:
        for i in range(n_words):
            f.write(reversed_dictionary[i] + '\n')
        
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

Initialized
Average loss at step  0 :  267.1926574707031
Average loss at step  2000 :  150.12939685058595
Average loss at step  4000 :  87.6103204126358
Average loss at step  6000 :  62.80463714456558
Average loss at step  8000 :  49.22242328047752
Average loss at step  10000 :  39.331454894900325
Average loss at step  12000 :  36.64724566411972
Average loss at step  14000 :  27.990575132846832
Average loss at step  16000 :  23.667969947099685
Average loss at step  18000 :  19.935047299623488
Average loss at step  20000 :  17.869345844388008
Average loss at step  22000 :  16.380260180354117
Average loss at step  24000 :  13.991367206811905
Average loss at step  26000 :  12.450434812963008
Average loss at step  28000 :  12.40612879383564
Average loss at step  30000 :  10.860905973792075
Average loss at step  32000 :  10.349412953615188
Average loss at step  34000 :  9.506500377058982
Average loss at step  36000 :  9.001470514655113
Average loss at step  38000 :  8.640172551155091
Averag

In [225]:
cowboy = final_embeddings[vocab_dictionary['cowboy']]
gun = final_embeddings[vocab_dictionary['gun']]
random_word = final_embeddings[vocab_dictionary['building']]
print(np.linalg.norm(cowboy - gun))
print(np.linalg.norm(cowboy - random_word))

1.148621
1.317841


In [None]:
# Save model embeddings so this part does not need to be run again 

In [None]:
# Set up LSTM

In [206]:
train.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
15084,2006,Stephanie Daley,American,Hilary Brougher,"Amber Tamblyn, Tilda Swinton, Timothy Hutton",drama,https://en.wikipedia.org/wiki/Stephanie_Daley,Sixteen-year-old Stephanie Daley collapses in ...
17686,2000,"Dish, TheThe Dish",Australian,Rob Sitch,Sam Neill\r\nPatrick Warburton\r\nTom Long\r\n...,drama,https://en.wikipedia.org/wiki/The_Dish,The radio telescope at Parkes (Parkes Observat...
7075,1958,The Missouri Traveler,American,Jerry Hopper,"Brandon deWilde, Lee Marvin",drama,https://en.wikipedia.org/wiki/The_Missouri_Tra...,Brandon deWilde leads a cast lengthy in charac...
25002,1977,Bhumika,Bollywood,Shyam Benegal,"Smita Patil, Naseeruddin Shah, Amrish Puri",drama,https://en.wikipedia.org/wiki/Bhumika_(1977_film),"Bhumika tells the life story of an actress, Us..."
12575,1995,To the Limit,American,Raymond Martino,"Anna Nicole Smith, Michael Nouri",drama,https://en.wikipedia.org/wiki/To_the_Limit_(19...,"Anna Nicole Smith plays Vickie Lynn, an ex-CIA..."


In [216]:
def encodeLabel(df, col, label_col="Label"):
    df[col] = df[col].astype('category')
    df[label_col] = df[col].cat.codes
    return df

In [217]:
train = encodeLabel(train, "Genre")

In [238]:
train_plot_words = train['Plot'].tolist()
train_labels = train['Label'].tolist()
train_labels = np.array(train_labels)

In [230]:
# Encode plots with word embeddings 

In [288]:
def get_embeddings(plot_list, final_embeddings):
    plot_embeddings = []
    
    for plot in plot_list:
    
        embeddings = []
    
        p = multiple_replace(d, plot)
    
        all_words = p.split(" ")
    
        for word in all_words:
        
            index = vocab_dictionary.get(word, 0)
        
            embedding = final_embeddings[index]
            embeddings.append(embedding)
            
        plot_embeddings.append(embeddings)
        
    return plot_embeddings

In [289]:
train_plot_embeddings = get_embeddings(train_plot_words, final_embeddings)
train_plot_embeddings = np.array(train_plot_embeddings)

KeyboardInterrupt: 

In [None]:
train_plot_embeddings

In [240]:
def generate_batch_movies(batch_size, plots, labels):
    # Assert ndarrays 
    
    total = len(plots)
    
    indices = np.random.choice(total, batch_size, replace=False)
    
    batch_plots = np.take(plots, indices)
    batch_labels = np.take(labels, indices)
    
    return batch_plots, batch_labels

In [241]:
batch_plots, batch_labels = generate_batch_movies(20, train_plot_embeddings, train_labels)

In [254]:
print(batch_plots.shape)

(20,)


In [290]:
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [252]:
class LSTMModel():
    
    def __init__(self, rnn_size, output_size, learning_rate=1e-4):

        self.inputs = tf.placeholder(tf.float32, shape=[None, None, embedding_size])
        self.labels = tf.placeholder(tf.float32, shape=[None, 1])
    
        lm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
    
        outputs, states = tf.nn.dynamic_rnn(lm_cell, inputs, dtype=tf.float32)
    
        self.output_logits = tf.layers.dense(outputs, output_size)
    
        self.loss = tf.losses.sparse_softmax_cross_entropy(self.labels, self.output_logits)
        
        optimizer = tf.train.AdamOptimizer(learning_rate)
        
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = optimizer.minimize(self.loss)
        self.saver = tf.train.Saver()


In [253]:
tf.reset_default_graph() # This is so that when you debug, you reset the graph each time you run this, in essence, cleaning the board
model = LSTMModel(256, 7, 1e-3)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


ValueError: Expected input tensor Tensor("Placeholder:0", shape=(128,), dtype=int32) to have rank at least 2