## Train Doc2Vec

In [1]:
import pandas as pd
import numpy as np
import re
from random import shuffle
import time
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [2]:
df = pd.read_csv("../data/wiki_movie_plots.csv")

In [3]:
# Do some minor preprocessing of Genre and Director
def genre_clean(x, tokenizer=TreebankWordTokenizer):
    x = x.lower()
    x = [w for w in tokenizer().tokenize(x) if re.match("[A-Za-z]", w) is not None]
    x = ", ".join(x)
    return x

df["Genre"].fillna("Unknown", inplace=True)
df["Genre"] = df["Genre"].apply(lambda x: genre_clean(x))

df["Director"].fillna("Uknown", inplace=True)

In [4]:
# Top 20 Genres by Frequency
df["Genre"].value_counts()[:20]

drama               313
comedy              209
unknown              58
western              37
adventure            33
romantic, comedy     30
romance              30
comedy, drama        27
musical              26
musical, comedy      25
melodrama            18
mystery              16
crime                13
crime, drama         12
historical           10
romance, drama        9
horror                9
swashbuckler          8
war                   8
comedy, short         8
Name: Genre, dtype: int64

In [5]:
# Lowercase, Stem, and Tokenize Plot
def stem_tokenize(x, stemmer = PorterStemmer, word_tokenizer=TreebankWordTokenizer, 
                  sent_tokenizer=sent_tokenize):
    
    x = x.lower()
    sent_tokens = [word_tokenizer().tokenize(s) for s in sent_tokenizer(x)]
    stemmed_tokens = [[stemmer().stem(w) for w in s if re.match("[A-Za-z]", w) is not None] 
                      for s in sent_tokens]
    return stemmed_tokens

In [6]:
df["Plot_tokens"] = df["Plot"].apply(lambda x: stem_tokenize(x))

In [9]:
"""
Create a list of tagged documents for Doc2Vec training, with each sentence treated as a document

Each sentence tag contains a unique index for the subject movie, the director, genre, and origin/ethnicity.
The model will find embeddings for all of these tag values

"""
tagged_docs = []

for i, doc in enumerate(df["Plot_tokens"]):
    
    genre, origin, director = df.iloc[i][["Genre", "Origin/Ethnicity", "Director"]]
    
    movie_indx = "movie index: %i" %i
    genre = "genre: %s" %genre.lower()
    origin = "origin: %s" %origin
    director = "director: %s" %director
    
    for w in doc:
        if len(w) > 0:
            tagged_docs.append(TaggedDocument(tags = [movie_indx, director, genre, origin], words = w))
        else:
            pass

In [14]:
# Define model
n_epochs = 50
vec_size = 200
alpha = 0.025

model = Doc2Vec(vector_size = vec_size, 
                alpha = alpha,
                min_count = 50, 
                dm=1)

model.build_vocab(tagged_docs)

In [16]:
# Train model
for epoch in np.arange(n_epochs):
    
    t0 = time.time()
    print("Iteration %i" %epoch)
    model.train(tagged_docs, total_examples= model.corpus_count,
                epochs= model.epochs)
    
    model.alpha -= 0.0002
    model.min_alpha = model.alpha
    
    shuffle(tagged_docs)
    print("Completed in %i seconds\n" %(time.time() - t0))

Iteration 0
Completed in 2 seconds

Iteration 1
Completed in 2 seconds

Iteration 2
Completed in 2 seconds

Iteration 3
Completed in 2 seconds

Iteration 4
Completed in 2 seconds



In [17]:
model.save("../models/doc2vec.model")