In [2]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# The dump is newline-delimited JSON: one movie record per line.
# Decode explicitly as UTF-8 (titles and names contain non-ASCII characters, and
# the platform default encoding is not UTF-8 everywhere) and skip blank lines,
# which json.loads would otherwise reject.
with open('data/wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [4]:
# Count every outgoing link (third field of each movie record) over all movies,
# then show the ten most frequent link targets.
link_counts = Counter(link for movie in movies for link in movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [5]:
movies[1][0]

'The Revenant (2015 film)'

In [6]:
movies[1][1]

{'image': 'The Revenant 2015 film poster.jpg',
 'name': 'The Revenant',
 'cinematography': 'Emmanuel Lubezki',
 'language': 'English',
 'distributor': '20th Century Fox',
 'caption': 'Theatrical release poster',
 'gross': '$533 million',
 'country': 'United States',
 'director': 'Alejandro G. Iñárritu',
 'budget': '$135 million',
 'editing': 'Stephen Mirrione',
 'runtime': '156 minutes'}

In [7]:
movies[1][2]

['Alejandro González Iñárritu',
 'Arnon Milchan',
 'Steve Golin',
 'Mary Parent',
 'Keith Redmon',
 'James W. Skotchdopole',
 'Mark L. Smith',
 'The Revenant (novel)',
 'Michael Punke',
 'Leonardo DiCaprio',
 'Tom Hardy',
 'Domhnall Gleeson',
 'Will Poulter',
 'Ryuichi Sakamoto',
 'Alva Noto',
 'Emmanuel Lubezki',
 'Stephen Mirrione',
 '20th Century Fox',
 'TCL Chinese Theatre',
 'British Board of Film Classification',
 'TheWrap',
 'Box Office Mojo',
 'Regency Enterprises',
 'RatPac-Dune Entertainment',
 'Anonymous Content',
 'Appian Way Productions',
 'Biographical film',
 'Business Insider',
 'Western (genre)',
 'Alejandro González Iñárritu',
 'Mark L. Smith',
 'Michael Punke',
 'The Revenant (novel)',
 'frontiersman',
 'Hugh Glass',
 'Leonardo DiCaprio',
 'Tom Hardy',
 'Domhnall Gleeson',
 'Will Poulter',
 'Akiva Goldsman',
 'Principal photography',
 'TCL Chinese Theatre',
 'limited release',
 'wide release',
 'cinematography',
 'Golden Globe Awards',
 'BAFTA Awards',
 '88th Academy

In [8]:
# Vocabulary of links: keep only links counted at least three times overall.
top_links = [name for name, count in link_counts.items() if count >= 3]
link_to_idx = dict(zip(top_links, range(len(top_links))))
idx_to_link = {position: name for name, position in link_to_idx.items()}

# Movies are indexed by their position in `movies`, keyed by title (first field).
movie_to_idx = {record[0]: position for position, record in enumerate(movies)}
idx_to_movie = {position: title for title, position in movie_to_idx.items()}

# One (link index, movie index) pair per occurrence of a vocabulary link on a
# movie page; links outside the vocabulary are dropped. Repeated links on the
# same page produce repeated pairs, which `pairs_set` collapses.
pairs = [
    (link_to_idx[name], movie_to_idx[record[0]])
    for record in movies
    for name in record[2]
    if name in link_to_idx
]
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [9]:
pairs[1]

idx_to_link[1]
idx_to_movie[0]

(1, 0)

'Simon Kinberg'

'Deadpool (film)'

In [10]:
pairs[1000]
idx_to_link[564]
idx_to_movie[3]

(564, 3)

'Hoyte van Hoytema'

'Spectre (2015 film)'

In [12]:
pairs[:10]

[(0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (9, 0)]

In [11]:
len(pairs_set)

671403

In [None]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes one link index and one movie index, looks each up in its
    own embedding table and outputs the normalized dot product (cosine
    similarity) of the two embedding vectors.

    The table sizes come from the module-level ``top_links`` and
    ``movie_to_idx``.

    Args:
        embedding_size: Dimensionality of both embedding tables.

    Returns:
        A Keras ``Model`` with inputs named ``'link'`` and ``'movie'``,
        compiled with the ``nadam`` optimizer and mean squared error loss.
    """
    # Both inputs carry a single integer index per sample.
    link_input = Input(shape=(1,), name='link')
    movie_input = Input(shape=(1,), name='movie')

    # The layer names are used later to pull the trained weights back out.
    link_table = Embedding(input_dim=len(top_links),
                           output_dim=embedding_size,
                           name='link_embedding')
    movie_table = Embedding(input_dim=len(movie_to_idx),
                            output_dim=embedding_size,
                            name='movie_embedding')
    link_vector = link_table(link_input)
    movie_vector = movie_table(movie_input)

    # normalize=True L2-normalizes both vectors before the dot product.
    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vector, movie_vector])
    # Flatten the dot-product output to one value per sample.
    score = Reshape((1,))(similarity)

    model = Model(inputs=[link_input, movie_input], outputs=[score])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

In [None]:
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    """Generate an endless stream of batches of positive and negative samples.

    Every batch contains ``positive_samples`` (link, movie) pairs drawn without
    replacement from ``pairs`` with label 1, plus
    ``positive_samples * negative_ratio`` randomly drawn (link, movie) pairs
    that are not in the module-level ``pairs_set`` with label -1. Rows are
    shuffled before the batch is yielded.

    Link and movie indices are drawn from ``range(len(top_links))`` and
    ``range(len(movie_to_idx))`` (module-level names).

    All randomness comes from the ``random`` module, so ``random.seed`` makes
    the stream reproducible. Every yielded batch owns its memory: a consumer
    that holds on to earlier batches (for example a prefetch queue) never sees
    them overwritten by later ones.

    Args:
        pairs: Sequence of (link index, movie index) positive pairs.
        positive_samples: Number of positive pairs per batch.
        negative_ratio: Number of negative pairs per positive pair.

    Yields:
        A tuple ``(inputs, labels)`` where ``inputs`` is a dict with the keys
        ``'link'`` and ``'movie'`` (1-D arrays of indices) and ``labels`` is a
        1-D array of 1 / -1 values, all of length
        ``positive_samples * (1 + negative_ratio)``.

    Raises:
        ValueError: If ``positive_samples`` exceeds ``len(pairs)`` (raised by
            ``random.sample`` when the first batch is requested).
    """
    batch_size = positive_samples * (1 + negative_ratio)
    # Generator yields batches
    while True:
        # Scratch buffer for this batch: columns are link, movie, label
        batch = np.zeros((batch_size, 3))
        # Sample random positive samples
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            # The labels for these samples are positive (1)
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples

        # Continue adding negative samples until batch size is reached
        while idx < batch_size:
            # Sample a random movie
            movie_id = random.randrange(len(movie_to_idx))
            # Sample a random link
            link_id = random.randrange(len(top_links))
            # If the link is not on the movie page, this is a negative sample
            if (link_id, movie_id) not in pairs_set:
                # Set the label as negative (-1)
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        # Shuffle the rows with the `random` module (not numpy's separately
        # seeded RNG) so random.seed() controls the whole batch. Indexing with
        # the permutation also copies the rows into a new array.
        order = random.sample(range(batch_size), batch_size)
        shuffled = batch[order]
        # Yield the links, movies, and labels
        yield {'link': shuffled[:, 0], 'movie': shuffled[:, 1]}, shuffled[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

In [None]:
positive_samples_per_batch = 512

# Fit the model to samples from the generator
model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    # Number of batches to grab every epoch
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

In [None]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    """Print the ten movies most similar to ``movie`` and return all similarities.

    Similarity is the dot product between rows of the module-level
    ``normalized_movies`` matrix and the row belonging to ``movie``.

    Args:
        movie: Title of the query movie; must be a key of ``movie_to_idx``.

    Returns:
        1-D array holding the similarity of every movie to the query movie,
        in ``movies`` order.

    Raises:
        KeyError: If ``movie`` is not a known title.
    """
    query_vector = normalized_movies[movie_to_idx[movie]]
    dists = np.dot(normalized_movies, query_vector)
    # argsort is ascending: take the last ten and flip them so the best match
    # comes first.
    top_ten = np.argsort(dists)[-10:][::-1]
    for c in top_ten:
        record = movies[c]
        # 'Rating' shows the last field of the movie record.
        print(c, 'Movie:', record[0], 'Rating:', record[-1], 'Distance:', dists[c])
    return dists

d = similar_movies('Rogue One')

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components = 2, perplexity = 10, verbose = 1).fit_transform(normalized_movies)

In [None]:
ratings = [int(r[3][:-1]) if r[3] is not None else np.nan for r in movies]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns

# Two t-SNE coordinates per movie, in `movies` order.
xs = tsne[:, 0]
ys = tsne[:, 1]

plt.figure(figsize = (10, 8))
# Color every point by the movie's rating.
plt.scatter(xs, ys, c = ratings)

# Label a random handful of movies. random.sample needs a sequence: passing a
# dict view is rejected with TypeError on Python 3.11+, so materialize the
# titles as a list first (same order, hence the same sample for a given seed).
plot_movies = random.sample(list(movie_to_idx), 20)

for m in plot_movies:
    index = movie_to_idx[m]
    plt.text(x = tsne[index, 0], y = tsne[index, 1], s = m, fontsize = 10);
    
plt.colorbar();

In [None]:
link_counts

In [None]:
movies[1]

In [None]:
list(reversed(np.argsort(d)[-10:]))

In [None]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]

# Find magnitudes
link_lengths = np.linalg.norm(link_weights, axis=1)
# Divide by magnitudes (normalization)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link: str) -> None:
    """Print the ten links whose embeddings are most similar to ``link``.

    Similarity is the dot product between rows of the module-level
    ``normalized_links`` matrix and the row belonging to ``link``. Each output
    line shows the link index, the link name and its similarity, best first.

    Args:
        link: Name of the query link; must be a key of ``link_to_idx``.

    Raises:
        KeyError: If ``link`` is not in the link vocabulary.
    """
    query_vector = normalized_links[link_to_idx[link]]
    dists = np.dot(normalized_links, query_vector)
    # argsort is ascending: take the last ten and flip them so the best match
    # comes first.
    top_ten = np.argsort(dists)[-10:][::-1]
    for c in top_ten:
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

In [None]:
similar_links('Category:Films based on American novels')

In [None]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

In [None]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

In [None]:
# Signed distance of every movie embedding to the SVM's separating hyperplane;
# larger values are further on the side of the class labelled 1.
estimated_movie_ratings = clf.decision_function(normalized_movies)
# Movie indices ordered from lowest to highest score. This deliberately does
# not reuse the name `best`: that is the list of titles the classifier was
# trained on, and overwriting it would break re-running the training-data cell.
ranking = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(ranking[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in ranking[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


In [None]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [None]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

In [None]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
def gross(movie):
    """Parse a movie's infobox ``gross`` field into millions of dollars.

    Accepted values look like ``'$533 million'`` or ``'$1.5 billion'``: a
    dollar amount and a unit separated by whitespace. Any run of whitespace
    (including non-breaking spaces) is accepted as the separator, surrounding
    whitespace is ignored and the unit is case-insensitive.

    Args:
        movie: A movie record whose second field is the infobox dict.

    Returns:
        The gross in millions of dollars as a float, or ``None`` when the
        field is missing, is not a string, or is not in the accepted format.
    """
    raw = movie[1].get('gross')
    if not isinstance(raw, str):
        return None
    # split(None, 1) separates on any whitespace run and ignores leading
    # whitespace; fewer than two parts means there is no unit.
    parts = raw.split(None, 1)
    if len(parts) != 2:
        return None
    amount, unit = parts
    # Factor that converts the given unit into millions.
    factor = {'million': 1, 'billion': 1000}.get(unit.strip().lower())
    if factor is None or not amount.startswith('$'):
        return None
    try:
        value = float(amount[1:])
    except ValueError:
        return None
    return value * factor

# Parsed gross per movie, index-aligned with `movies` (None where the gross is
# missing or unparseable). The list must stay aligned: dropping the None
# entries would shift positions, and those positions are used to look movies up.
movie_gross = [gross(m) for m in movies]
# Indices of the ten movies with the highest known gross, in ascending order.
highest = sorted(
    (idx for idx, gr in enumerate(movie_gross) if gr is not None),
    key=movie_gross.__getitem__,
)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

In [None]:
# Build the regression data from (movie, gross) pairs computed together, so
# every target is guaranteed to sit next to the embedding of the same movie.
# Movies without a usable (non-zero) gross are left out of both arrays.
movies_with_gross = [(movie, gross(movie)) for movie in movies]
gross_y = np.asarray([gr for _, gr in movies_with_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]]
                      for movie, gr in movies_with_gross if gr])

In [None]:
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

In [None]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)