In [1]:
import tensorflow as tf
import pandas as pd 
import re
import collections
import os
import numpy as np
import math
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import json

In [2]:
def get_dicts():
    """Load the three JSON artefacts this notebook depends on.

    Reads, in this order and relative to the current working directory:
    'vocab_dictionary.json', 'reversed_dictionary.json' and 'embeddings.json'.

    Returns:
        A 3-tuple ``(vocab_dictionary, reversed_dictionary, final_embeddings)``
        holding the decoded JSON content of each file. JSON object keys are
        always strings, which is why lookups into the embeddings elsewhere in
        the notebook go through ``str(index)``.

    Raises:
        OSError: if any of the files cannot be opened.
        json.JSONDecodeError: if any of the files is not valid JSON.
    """
    def _read_json(path):
        # One small reader instead of three copy-pasted `with` stanzas.
        with open(path, 'r') as handle:
            return json.load(handle)

    file_names = (
        'vocab_dictionary.json',
        'reversed_dictionary.json',
        'embeddings.json',
    )
    vocab, reversed_vocab, embeddings = (_read_json(name) for name in file_names)
    return vocab, reversed_vocab, embeddings

In [3]:
# Load the vocabulary lookup tables and the embeddings from the JSON files
# read by get_dicts(); these are module-level globals used by later cells.
vocab_dictionary, reversed_dictionary, final_embeddings = get_dicts()

In [4]:
# Read the movie-plot dataset into a DataFrame (path is relative to the
# notebook's working directory).
movies = pd.read_csv("wiki_movie_plots.csv")

In [5]:
# Preview the first rows of the raw dataset to check the available columns.
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [55]:
# Genres the classifier is trained on; every other genre is dropped below.
genres_to_consider = [
    "drama",
    "comedy",
    "horror",
    "action",
    "thriller",
    "romance",
    "western",
]
# One output unit per genre.
output_size = len(genres_to_consider)

# Keep only the rows whose Genre is one of the selected genres.
is_selected_genre = movies['Genre'].isin(genres_to_consider)
movies = movies.loc[is_selected_genre]

In [7]:
# Non-null count of every column per genre, largest genre (by Title count)
# first -- shows how imbalanced the selected genres are.
movies.groupby('Genre').count().sort_values("Title", ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
drama,5964,5964,5964,5964,5841,5964,5964
comedy,4379,4379,4379,4379,4347,4379,4379
horror,1167,1167,1167,1167,1124,1167,1167
action,1098,1098,1098,1098,1087,1098,1098
thriller,966,966,966,966,955,966,966
romance,923,923,923,923,918,923,923
western,865,865,865,865,864,865,865


In [8]:
# Preview the rows that remain after the genre filter.
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [9]:
# Shuffle the rows so the per-genre slices taken for train/test are random.
# A fixed random_state makes the shuffle (and therefore the train/test
# membership) reproducible across kernel restarts.
movies = movies.sample(frac=1, random_state=42)

In [10]:
# Number of movies taken per genre for each split.
N_train = 600
N_test = 200

# Build balanced splits: for every genre the first N_train (shuffled) rows go
# to train and the next N_test rows go to test. Previously N_test was never
# used, so the test set received *all* remaining rows and was dominated by
# the largest genres.
train_parts = []
test_parts = []

for genre in genres_to_consider:
    genre_movies = movies[movies['Genre'] == genre]
    train_parts.append(genre_movies[:N_train])
    test_parts.append(genre_movies[N_train:N_train + N_test])

# Concatenate once instead of growing the frames inside the loop.
train = pd.concat(train_parts)
test = pd.concat(test_parts)

## Function to encode the genre to be used for LSTM classification

In [11]:
def encodeLabel(df, col, label_col="Label", categories=None):
    """Add an integer label column derived from a categorical column.

    The frame is modified in place (``col`` is converted to a categorical
    dtype and ``label_col`` is added) and also returned for convenience.

    Args:
        df: DataFrame holding the column to encode.
        col: Name of the column whose values are to be encoded.
        label_col: Name of the integer code column to create.
        categories: Optional ordered collection of category values. When
            given, the code of each value is its position in this collection,
            so several frames (e.g. train and test) encoded with the same
            ``categories`` are guaranteed to share one mapping. Values not
            listed get the code -1. When omitted, categories are inferred
            from the values present in ``df[col]`` (original behaviour), which
            only yields matching codes across frames if every frame contains
            exactly the same set of values.

    Returns:
        The same DataFrame object, with ``label_col`` added.
    """
    if categories is None:
        df[col] = df[col].astype('category')
    else:
        df[col] = pd.Categorical(df[col], categories=list(categories))
    df[label_col] = df[col].cat.codes
    return df

In [12]:
# Add the integer "Label" column to both splits.
# NOTE(review): each frame is encoded independently, so the codes only line
# up between train and test if both contain the same set of genres -- verify.
train = encodeLabel(train, "Genre")
test = encodeLabel(test, "Genre")

In [13]:
# Pull the raw plot texts out of each split as plain Python lists.
train_plot_words, test_plot_words = (
    frame['Plot'].tolist() for frame in (train, test)
)

# Integer genre labels of each split as NumPy arrays.
train_labels, test_labels = (
    np.array(frame['Label'].tolist()) for frame in (train, test)
)

In [14]:
# Replacement table applied to every plot by multiple_replace() before the
# text is split on single spaces in get_embeddings().
d = {
    # Surround punctuation with spaces so each mark becomes its own token.
    "(" : " ( ",
    ")" : " ) ",
    "-" : " - ",
    "," : " , ",
    # Line breaks are removed outright.
    # NOTE(review): because they are replaced by "" rather than " ", two words
    # separated only by a line break get joined into one token -- confirm
    # this is intended.
    "\n" : "",
    "\r" : "",
    "\"" : " \" ",
    "'" : " ' ",
    "." : " . ",
    ";" : " ; ",
    ":" : " : ",
    # Literal marker stripped from the text.
    "ENDOFARTICLE": ""
}

In [15]:
def multiple_replace(d, text):
    """Replace every occurrence of each key of ``d`` in ``text`` in one pass.

    Args:
        d: Mapping from literal substrings to their replacement strings.
        text: The string to process.

    Returns:
        ``text`` with every key occurrence replaced by its mapped value.
        Replacements are not re-scanned, so one replacement never triggers
        another. An empty mapping returns ``text`` unchanged.
    """
    # An empty alternation would compile to a pattern matching the empty
    # string everywhere and then fail on the lookup d[""].
    if not d:
        return text

    # Try longer keys first so a key that is a prefix of another key
    # (e.g. "a" and "ab") cannot pre-empt the longer match.
    keys = sorted(d, key=len, reverse=True)
    regex = re.compile("|".join(map(re.escape, keys)))

    return regex.sub(lambda match: d[match.group(0)], text)

## Maps each word in a given plot to its embedding 

In [16]:
# Length of one word vector; used for the zero padding vectors and for the
# last dimension of the model's input placeholder.
# assumes the vectors in embeddings.json have this length -- TODO confirm
embedding_size = 512
# Number of time steps every plot is padded to before being fed to the model.
max_len = 7000

In [17]:
def get_embeddings(plot_list, final_embeddings):
    """Convert each plot into the list of its word embeddings.

    Every plot is normalised with ``multiple_replace`` using the module-level
    replacement table ``d``, split on single spaces, and each lower-cased
    token is looked up in the module-level ``vocab_dictionary``. Tokens that
    are not in the vocabulary fall back to index 0.

    Args:
        plot_list: Iterable of plot strings.
        final_embeddings: Mapping from the *string* form of a vocabulary index
            to its embedding vector.

    Returns:
        A list with one entry per plot; each entry is the list of embedding
        vectors of that plot's tokens, in order.
    """
    def _embed(plot):
        # NOTE(review): splitting on " " (not on arbitrary whitespace) yields
        # empty-string tokens wherever the punctuation padding produces two
        # consecutive spaces; those are looked up like any other token.
        # Confirm that is intended.
        tokens = multiple_replace(d, plot).split(" ")
        return [
            final_embeddings[str(vocab_dictionary.get(token.lower(), 0))]
            for token in tokens
        ]

    return [_embed(plot) for plot in plot_list]

In [34]:
# Embed every training plot. Plots have different numbers of tokens, so the
# result is ragged; dtype=object stores one Python list per plot explicitly
# instead of relying on NumPy's implicit ragged-array creation (deprecated in
# NumPy 1.19 and an error from 1.24).
train_plot_embeddings = get_embeddings(train_plot_words, final_embeddings)
train_plot_embeddings = np.array(train_plot_embeddings, dtype=object)

In [19]:
# Embed every test plot. Plots have different numbers of tokens, so the
# result is ragged; dtype=object stores one Python list per plot explicitly
# instead of relying on NumPy's implicit ragged-array creation (deprecated in
# NumPy 1.19 and an error from 1.24).
test_plot_embeddings = get_embeddings(test_plot_words, final_embeddings)
test_plot_embeddings = np.array(test_plot_embeddings, dtype=object)

## Generates a batch of movies to be used for the LSTM model

In [71]:
def generate_batch(batch_size, values, labels):
    """Sample a random batch and pad every sequence to ``max_len`` steps.

    Args:
        batch_size: Number of examples to draw (without replacement); must not
            exceed ``len(values)``.
        values: Indexable collection of sequences, each a list of embedding
            vectors.
        labels: Array-like of labels aligned with ``values``.

    Returns:
        A tuple ``(batch_values, batch_labels)`` where ``batch_values`` is a
        list of ``batch_size`` sequences, each exactly ``max_len`` steps long
        (zero vectors of length ``embedding_size`` appended, or the sequence
        truncated if it was longer), and ``batch_labels`` is an array of
        shape ``(batch_size, 1)``.

    The sequences in ``values`` are never modified: each one is copied before
    padding. (Extending them in place would permanently pad the caller's
    stored data the first time an example is sampled.)
    """
    total = len(values)

    indices = np.random.choice(total, batch_size, replace=False)

    batch_labels = np.take(labels, indices)

    # A single zero vector shared by all padding steps (read-only use).
    pad_vector = [0] * embedding_size

    batch_values = []

    for index in indices:
        # Copy (and truncate) so the source sequence stays untouched and the
        # result never exceeds the placeholder's time dimension.
        sequence = list(values[index][:max_len])
        sequence.extend([pad_vector] * (max_len - len(sequence)))
        batch_values.append(sequence)

    return batch_values, np.reshape(batch_labels, (batch_size, 1))

In [72]:
# Quick sanity check of generate_batch: draw a single random training example.
vals, labels = generate_batch(1, train_plot_embeddings, train_labels)

In [84]:
class LSTMModel():
    """Single-layer LSTM classifier over padded sequences of word embeddings.

    Builds its ops in the current default TensorFlow graph.

    Attributes:
        inputs: float32 placeholder of shape [batch, max_len, embedding_size].
        labels: int32 placeholder of shape [batch, 1] holding class indices.
        sequence_length: int32 tensor [batch] with the number of non-padding
            steps of each example.
        output_logits: dense projection of the final LSTM hidden state to
            ``output_size`` logits.
        loss: sparse softmax cross-entropy between ``labels`` and the logits.
        global_step: global step variable, incremented by ``train_op``.
        train_op: Adam update op minimising ``loss``.
        saver: ``tf.train.Saver`` over the graph's variables.
    """

    def __init__(self, rnn_size, output_size, learning_rate=1e-4):
        """Create the graph.

        Args:
            rnn_size: Number of units in the LSTM cell.
            output_size: Number of target classes.
            learning_rate: Learning rate for the Adam optimizer.
        """
        self.inputs = tf.placeholder(tf.float32, shape=[None, max_len, embedding_size])
        self.labels = tf.placeholder(tf.int32, shape=[None, 1])

        # Padding steps are all-zero vectors, so a step is "real" iff it has
        # at least one non-zero component. Counting the real steps gives each
        # example's true length.
        # assumes no genuine word embedding is exactly all zeros -- TODO confirm
        is_real_step = tf.sign(tf.reduce_max(tf.abs(self.inputs), axis=2))
        self.sequence_length = tf.cast(tf.reduce_sum(is_real_step, axis=1), tf.int32)

        lm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)

        # With sequence_length the returned state is the one at the last real
        # step of each example, instead of the state after running over
        # thousands of padding vectors.
        _, states = tf.nn.dynamic_rnn(
            lm_cell,
            self.inputs,
            sequence_length=self.sequence_length,
            dtype=tf.float32,
        )

        # Debug aid: shows the (c, h) state tensors and their shapes.
        print(states)

        # states is an LSTMStateTuple (c, h); classify from the hidden state h.
        self.output_logits = tf.layers.dense(states[1], output_size)

        self.loss = tf.losses.sparse_softmax_cross_entropy(self.labels, self.output_logits)

        optimizer = tf.train.AdamOptimizer(learning_rate)

        self.global_step = tf.train.get_or_create_global_step()
        # Pass global_step so it is actually incremented on every update.
        self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
        self.saver = tf.train.Saver()

In [85]:
# Start from an empty graph so re-running this cell does not pile up
# duplicate ops, then build a model with 256 LSTM units, one logit per genre
# and a learning rate of 1e-3.
tf.reset_default_graph()
model = LSTMModel(256, output_size, 1e-3)

LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 256) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 256) dtype=float32>)


In [86]:
# Confirm the number of target classes (one per entry of genres_to_consider).
print(output_size)

7


In [None]:
# Sanity check: repeatedly fit one fixed batch and watch the loss go down.
# The batch is created here so the cell also works on a fresh kernel
# (previously `inputs` only existed as leftover kernel state).
inputs, labels = generate_batch(20, train_plot_embeddings, train_labels)

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # Named feed_dict rather than `d`, which is the punctuation replacement
    # table that get_embeddings() reads as a global.
    feed_dict = {model.inputs: inputs, model.labels: labels}

    for i in range(10):

        loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)

        print(loss)

1.9459101
1.9419363
1.9358057
1.9247086


In [None]:
# Sweep over LSTM sizes: train each model briefly and record the mean and the
# minimum of the per-batch training losses.
rnn_sizes = np.arange(128, 512, 4)
learning_rate = 1e-3
batch_size = 20
rnn_size_mean_error = []
rnn_size_min_error = []

for rnn_size in rnn_sizes:

    # Fresh graph per model so variables of earlier sizes do not accumulate.
    tf.reset_default_graph()

    with tf.Session() as sess:

        model = LSTMModel(int(rnn_size), output_size, learning_rate)
        sess.run(tf.global_variables_initializer())

        loss = []

        for i in range(10):

            # generate_batch returns (values, labels); the model defined in
            # this notebook has no label mask, so none is fed.
            inputs, labels = generate_batch(batch_size, train_plot_embeddings, train_labels)

            # Named feed_dict rather than `d`, which is the punctuation
            # replacement table that get_embeddings() reads as a global.
            feed_dict = {model.inputs: inputs, model.labels: labels}

            # Several gradient steps on the same batch.
            for j in range(5):

                train_loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)

            # Only the loss of the last step on this batch is recorded.
            loss.append(train_loss)

        print("rnn size: " + str(rnn_size) + " loss: " + str(train_loss))

    rnn_size_mean_error.append(np.mean(loss))
    rnn_size_min_error.append(min(loss))