In [43]:
import tensorflow as tf
import pandas as pd 
import re
import collections
import os
import numpy as np

In [2]:
movies = pd.read_csv("wiki_movie_plots.csv")

Step one: Convert plot to vectorized sequence of words.

Step two: Set up LSTM to generate predictions.

Step three: Interpret predictions.

Ideas:
- Predictions at every word (or sentence, or paragraph), see where they change

In [3]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
genres_to_consider = ["drama", "comedy", "horror", "action", "thriller", "romance", "western"]
movies = movies[movies['Genre'].isin(genres_to_consider)]

In [5]:
movies.groupby('Genre').count().sort_values("Title", ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
drama,5964,5964,5964,5964,5841,5964,5964
comedy,4379,4379,4379,4379,4347,4379,4379
horror,1167,1167,1167,1167,1124,1167,1167
action,1098,1098,1098,1098,1087,1098,1098
thriller,966,966,966,966,955,966,966
romance,923,923,923,923,918,923,923
western,865,865,865,865,864,865,865


In [6]:
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [7]:
movies = movies.sample(frac=1) # Shuffles the data 

In [8]:
N_train = 600
N_test = 200

train = None
test = None

train = movies[movies['Genre'] == 'drama'][:N_train]
test = movies[movies['Genre'] == 'drama'][N_train:]

for genre in genres_to_consider[1:]:
    tr = movies[movies['Genre'] == genre][:N_train]
    te = movies[movies['Genre'] == genre][N_train:]
    pd.concat([train, tr])
    pd.concat([test, te])
    

In [9]:
train.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
21304,2012,The Most Fun You Can Have Dying,British,Kirstin Marcon,,drama,https://en.wikipedia.org/wiki/The_Most_Fun_You...,"The story begins in Hamilton, New Zealands fou..."
31693,1967,Sudigundaalu,Telugu,Adurthi Subba Rao,"Akkineni Nageswara Rao, Vijayachander, Ram Moh...",drama,https://en.wikipedia.org/wiki/Sudigundaalu,Judge Chandrasekharam is very kind and helpful...
2047,1936,Crack-Up,American,Malcolm St. Clair,"Peter Lorre, Brian Donlevy, Helen Wood",drama,https://en.wikipedia.org/wiki/Crack-Up_(1936_f...,"At the christening of the ""Wild Goose,"" an exp..."
15691,2009,Lymelife,American,Derick Martini,"Alec Baldwin, Emma Roberts, Rory Culkin, Kiera...",drama,https://en.wikipedia.org/wiki/Lymelife,"Set in 1979 Syosset, Long Island, New York, Ly..."
1157,1931,The Magnificent Lie,American,Berthold Viertel,"Ruth Chatterton, Ralph Bellamy",drama,https://en.wikipedia.org/wiki/The_Magnificent_Lie,"Recovering from a World War I head wound, sold..."


### Word Embeddings Using Word2Vec on Wikipedia Corpus

In [29]:
d = {
    "(" : "( ",
    ")" : " )",
    "-" : " - ",
    "," : " ,",
    "\n" : "",
    "\r" : "",
    "\"" : " \" ",
    "'" : " ' ",
    "." : " . ",
}

In [30]:
def multiple_replace(d, text):
    
    regex = re.compile("(%s)" % "|".join(map(re.escape, d.keys())))
    
    return regex.sub(lambda x: d[x.string[x.start():x.end()]], text) 

In [31]:
filename = "wiki"
words = []

files_in_directory = os.listdir(filename)
file = files_in_directory[0]
f = open(filename + "/" + file, 'r', encoding = "ISO-8859-1")
f = f.read()
f = multiple_replace(d, f)
all_words = f.split(" ")
for word in all_words:
    words.append(word)


<doc id="214730" title="Henry Hallam" nonfiltered="1" processed="1" dbindex="0">
Henry Hallam (July 9, 1777 - January 21, 1859) was an  English historian.

The only son of John Hallam, canon of Windso


Counter({'<doc': 1,
         'id=': 8792,
         '"': 132245,
         '214730': 1,
         '': 324582,
         'title=': 8782,
         'Henry': 525,
         'Hallam': 33,
         'nonfiltered=': 8782,
         '1': 4877,
         'processed=': 8782,
         'dbindex=': 8782,
         '0': 2792,
         '>Henry': 14,
         '(': 53967,
         'July': 1042,
         '9': 1293,
         ',': 201921,
         '1777': 26,
         '-': 68030,
         'January': 1203,
         '21': 723,
         '1859': 69,
         ')': 37852,
         'was': 32743,
         'an': 11753,
         'English': 1306,
         'historian': 114,
         '.': 225147,
         'The': 33524,
         'only': 3460,
         'son': 904,
         'of': 118333,
         'John': 2114,
         'canon': 37,
         'Windsor': 59,
         'and': 92198,
         'dean': 15,
         'Bristol': 62,
         'he': 10959,
         'educated': 117,
         'at': 14181,
         'Eton': 9,
         'Christ': 

In [37]:
def build_batch(words, n_words):
    word_count = [["UNK", -1]]
    word_count.extend(collections.Counter(words).most_common(n_words - 1))
    
    d = {}
    for w, _ in word_count:
        d[w] = len(d)
        
    data = []
    num_unks = 0
    for w in words:
        index = d.get(w)
        if index == 0:
            num_unks += 1
        data.append(index)
            
    word_count[0][1] = num_unks
    
    return data, word_count, d

In [38]:
n_words = 10000
data, word_count, vocab_dictionary = build_batch(words, n_words)

In [40]:
word_index = len(words)

In [45]:
# Sets up n-gram model
def generate_batch(batch_size, data, window):
    
    span = 2 * window + 1
    
    batch_labels = []
    batch_data = []
    
    for i in range(batch_size):
        
        index = np.random.randint(word_index) # Gets a random start place in index of all words 
        
        context_nums = [w + index for w in range(span) if w != window] # Gets context words place in data
        center_num = index + window # Gets center word place in data
        
        context_words = [words[w] for w in context_nums] # Gets actual context words
        center_word = words[center_num] # Gets actual center word
        
        context_words_ind = [vocab_dictionary[w] for w in context_words if w in vocav_dictionary else 1] # 0?
        center_word_ind = vocab_dictionary[center_word] if center_word in vocab_dictionary else 1 # 0?
        
        batch_labels.append(context_words_ind)
        batch_data.append(center_word_ind)
        
        print(context_words)
        print(center_word)
        
    return batch_data, batch_labels
        

In [46]:
generate_batch(1, data, 2)

[1778170, 1778171, 1778173, 1778174]
1778172


In [None]:
# Model to train embeddings

In [None]:
# Run model to obtain embeddings for each word 

In [None]:
# Save model embeddings so this part does not need to be run again 

In [None]:
# Set up LSTM