# Understand and order data

# Understand and order data

In [4]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that the
# dataset stored there can be opened by the loading cell below
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# We'll train a network that learns whether a given link appears on a movie's Wikipedia page, using the outgoing links of each page as training data.
# Training this way produces embeddings for the movies (and for the links). These in turn let us recommend movies based on other movies: similar movies end up next to each other in the embedding space.
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm


In [6]:
# Load our dataset - Wikipedia movie pages together with their outgoing links.
# The file is newline-delimited JSON, i.e. one movie record per line.
# The encoding is passed explicitly because the records contain non-ASCII
# text (e.g. 'Stefan Kapičić') and open() otherwise uses a platform-dependent
# default. Blank lines are skipped so that an empty or trailing line cannot
# make json.loads() fail.
with open('/content/gdrive/MyDrive/ml/RecommendationSystem/wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [7]:
# Examining our dataset: print the container type, then display the first record in full
print('type: ',type(movies))
movies[0]

type:  <class 'list'>


['Deadpool (film)',
 {'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'budget': '$58 million',
  'caption': 'Theatrical release poster',
  'cinematography': 'Ken Seng',
  'country': 'United States',
  'director': 'Tim Miller',
  'distributor': '20th Century Fox',
  'editing': 'Julian Clarke',
  'gross': '$783.1 million',
  'image': 'Deadpool poster.jpg',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'name': 'Deadpool',
  'runtime': '108 minutes'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [8]:
# movie name: element 0 of a record is the title of the movie's Wikipedia page
movies[0][0]

'Deadpool (film)'

In [9]:
# movie metadata: element 1 of a record is a dict of descriptive fields
# such as director, budget and runtime
movies[0][1]

{'Software Used': 'Adobe Premier Pro',
 'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
 'budget': '$58 million',
 'caption': 'Theatrical release poster',
 'cinematography': 'Ken Seng',
 'country': 'United States',
 'director': 'Tim Miller',
 'distributor': '20th Century Fox',
 'editing': 'Julian Clarke',
 'gross': '$783.1 million',
 'image': 'Deadpool poster.jpg',
 'language': 'English',
 'music': 'Tom Holkenborg',
 'name': 'Deadpool',
 'runtime': '108 minutes'}

In [10]:
# movie links: element 2 of a record is the list of link titles found on the
# page; a title can occur more than once
movies[0][2]

['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Fabian Nicieza',
 'Rob Liefeld',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'Junkie XL',
 'Julian Clarke',
 'Marvel Entertainment',
 'Kinberg Genre',
 'Lauren Shuler Donner',
 'TSG Entertainment',
 '20th Century Fox',
 'Le Grand Rex',
 'Variety (magazine)',
 'Box Office Mojo',
 'superhero film',
 'Tim Miller (director)',
 'Rhett Reese',
 'Paul Wernick',
 'Marvel Comics',
 'Deadpool',
 'X-Men (film series)',
 'Ryan Reynolds',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'antihero',
 'New Line Cinema',
 '20th Century Fox',
 'X-Men Origins: Wolverine',
 'principal photography',
 'Vancouver',
 'IMAX',
 'Digital Light Processing',
 'D-Box Technologies',
 'List of accolades received by Deadpool (

In [11]:
# Tally how often every link title occurs across all movie pages, then
# show the ten most frequent titles
link_counts = Counter(
    link_title
    for record in movies
    for link_title in record[2]
)

link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [12]:
# Prepare data for the supervised machine learning task.
# Only links that occur at least 3 times over the whole dataset are kept.
top_links = [name for name, count in link_counts.items() if count >= 3]
print(len(top_links))
top_links[:20]  # not the last expression of the cell, so nothing is displayed

# Give every retained link a dense integer ID: its position in top_links
link_to_idx = dict(zip(top_links, range(len(top_links))))

# Give every movie title a dense integer ID: its position in movies
movie_to_idx = dict((record[0], position) for position, record in enumerate(movies))
movie_to_idx

66913


{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

In [13]:
# Given a pair (link, movie), we want the neural network to learn to predict whether it is a legitimate pair.
# For each movie we walk through the wikilinks on its page and record one
# (link index, movie index) tuple per link; links that were not kept in
# link_to_idx are skipped.
# The final list holds every retained (link, movie) pairing found in the dataset.
pairs = [
    (link_to_idx[link_title], movie_to_idx[record[0]])
    for record in movies
    for link_title in record[2]
    if link_title in link_to_idx
]

# The same pairs as a set, for fast membership tests when sampling negatives
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)
pairs_set

{(2454, 7796),
 (21670, 416),
 (2926, 6061),
 (4849, 255),
 (35937, 8392),
 (23746, 1366),
 (18849, 2311),
 (13688, 7765),
 (19206, 2373),
 (178, 2893),
 (11820, 8449),
 (42710, 5809),
 (965, 7725),
 (22, 1106),
 (1203, 8556),
 (33113, 1909),
 (43079, 8950),
 (321, 5887),
 (61712, 5361),
 (15937, 216),
 (1931, 369),
 (43991, 3682),
 (2988, 836),
 (402, 211),
 (13348, 3766),
 (23548, 3035),
 (433, 5722),
 (7609, 5065),
 (700, 4275),
 (22, 4515),
 (2966, 2139),
 (27551, 570),
 (943, 65),
 (22, 8142),
 (19893, 2249),
 (22873, 7291),
 (16570, 2666),
 (22698, 4247),
 (9868, 8215),
 (22, 9301),
 (24719, 4337),
 (56790, 4089),
 (24937, 481),
 (862, 6414),
 (55409, 3392),
 (51124, 6238),
 (4588, 6739),
 (25841, 2840),
 (1591, 3611),
 (11202, 2683),
 (66068, 8593),
 (60643, 5100),
 (10865, 5926),
 (690, 66),
 (53488, 9161),
 (35748, 996),
 (18468, 7543),
 (215, 2246),
 (7534, 6429),
 (19506, 310),
 (57410, 7048),
 (7381, 1666),
 (8904, 1043),
 (37124, 8908),
 (22144, 389),
 (31043, 1277),
 (464

# Build Neural Network & Train Model

In [14]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the two-input (link, movie) embedding model.

    Each input carries a single ID per sample that is looked up in its own
    embedding table. The model output is the normalized dot product of the
    two looked-up vectors, reshaped to one value per sample.

    Args:
        embedding_size: Width of both embedding tables.

    Returns:
        A Keras ``Model`` compiled with the 'nadam' optimizer and 'mse' loss.
    """
    # Table sizes follow the module-level vocabularies
    n_links = len(top_links)
    n_movies = len(movie_to_idx)

    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    link_vectors = Embedding(
        input_dim=n_links, output_dim=embedding_size, name='link_embedding'
    )(link_input)
    movie_vectors = Embedding(
        input_dim=n_movies, output_dim=embedding_size, name='movie_embedding'
    )(movie_input)

    # normalize=True L2-normalizes both vectors before the dot product
    similarity = Dot(name='dot_product', normalize=True, axes=2)(
        [link_vectors, movie_vectors]
    )
    output = Reshape((1,))(similarity)

    network = Model(inputs=[link_input, movie_input], outputs=[output])
    network.compile(optimizer='nadam', loss='mse')
    return network

# Instantiate the model with the default embedding size and print its layer summary
model = movie_embedding_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        3345650     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        500000      ['movie[0][0]']                  
                                                                                              

In [15]:
# Seed both random number generators used by batchifier: Python's `random`
# (sampling of positive and negative pairs) and NumPy's global generator
# (np.random.shuffle of each batch). Seeding only `random` leaves the row
# order of every batch different from run to run.
random.seed(5)
np.random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    """Endlessly yield training batches of positive and negative (link, movie) pairs.

    Every batch holds ``positive_samples`` pairs drawn without replacement
    from ``pairs`` (label 1) plus ``positive_samples * negative_ratio``
    randomly generated pairs that do not occur in the module-level
    ``pairs_set`` (label -1). Rows are shuffled before the batch is yielded.

    Args:
        pairs: Sequence of (link index, movie index) tuples that are real pairs.
        positive_samples: Number of real pairs per batch.
        negative_ratio: Number of negative pairs generated per positive pair.

    Yields:
        A tuple ``({'link': link_ids, 'movie': movie_ids}, labels)`` of three
        1-D float arrays of length ``positive_samples * (1 + negative_ratio)``.

    Raises:
        ValueError: Raised by ``random.sample`` when ``positive_samples`` is
            larger than ``len(pairs)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    # Upper bounds for the random negative IDs; they do not change per batch
    n_movies = len(movie_to_idx)
    n_links = len(top_links)
    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are
        # views into this buffer, so reusing one buffer would overwrite a
        # batch that a consumer (e.g. a prefetch queue) is still holding.
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection sampling: keep drawing random pairs until enough of them
        # are not known positives
        while idx < batch_size:
            movie_id = random.randrange(n_movies)
            link_id = random.randrange(n_links)
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        # Shuffle rows so positives and negatives are interleaved
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

# Smoke test: draw one small batch (3 positives + 6 negatives) to inspect its structure
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([20558., 31254., 32318., 22418.,  3801.,  1313., 32643., 48731.,
         13365.]),
  'movie': array([ 849., 5530., 7685., 1529., 5874., 7236., 7628., 1854., 6238.])},
 array([-1.,  1., -1.,  1., -1.,  1., -1., -1., -1.]))

In [16]:
# Train Model
# steps_per_epoch is chosen so that one epoch draws roughly len(pairs)
# positive examples. Model.fit accepts Python generators directly;
# Model.fit_generator is deprecated in favour of it.
positive_samples_per_batch = 512

model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=5,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=1
)

Epoch 1/5


  


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f83e0a3b990>

In [17]:
# Save our model to an HDF5 file ('.h5'); the relative path resolves against
# the current working directory
model.save('movie_embeddings.h5')

# Find Most Similar Movies

#### Now that we have the embeddings, we can use them to recommend movies that our model has learned are most similar to a given movie.
<br>


The function below takes a movie title and the matrix of normalized movie embeddings, and prints the 10 movies most similar to the query. It does this by computing the dot product between the query's embedding and every movie embedding. Because we normalized the embeddings, the dot product represents the [cosine similarity](http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/) between two vectors.
<div>
<img src="https://assets.datacamp.com/production/repositories/4375/datasets/c71ee9689f2a54b5273988240e8f198c8ff71f32/cosine_sim.png" width="400"/>
</div>

Once we have the dot products, we can sort the results to find the closest entities in the embedding space. With cosine similarity, higher numbers indicate entities that are closer together, with -1 the furthest apart and +1 closest together.

In [20]:
# Get the movie embedding layer by the name it was given in movie_embedding_model
movie = model.get_layer('movie_embedding')

# Get the weights: the embedding matrix with one row per movie
movie_weights = movie.get_weights()[0]

# Get the norm (length) of each movie's embedding vector, i.e. of each row
movie_lengths = np.linalg.norm(movie_weights, axis=1)

# Get the normalized movie embeddings: every row is divided by its own length.
# The transposes let the 1-D array of lengths broadcast across the rows.
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie, normalized_movies, n=10):
    """Print the ``n`` movies whose embeddings are closest to ``movie``.

    Args:
        movie: Title of the query movie; must be a key of ``movie_to_idx``.
        normalized_movies: 2-D array of movie embeddings, one row per movie,
            indexed by the IDs in ``movie_to_idx``. Rows are expected to be
            unit length so the dot product equals the cosine similarity.
        n: Number of results to print. Defaults to 10, the value that used
            to be hard-coded; ``n <= 0`` prints nothing.

    Raises:
        KeyError: If ``movie`` is not present in ``movie_to_idx``.

    One line is printed per result, most similar first: row index, title and
    similarity score. The function returns ``None``.
    """
    # This represents the cosine similarity of the query with every movie
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])

    # Sort ascending, flip to descending and keep the first n entries.
    # Slicing the flipped order (rather than [-n:]) keeps n == 0 from
    # selecting the whole array.
    closest = np.argsort(dists)[::-1][:max(n, 0)]
    for c in closest:
        print(c, movies[c][0], dists[c])

# Example query; the queried title itself ranks first with a similarity of about 1
similar_movies("Interstellar (film)", normalized_movies)

19 Interstellar (film) 0.9999999
85 Inception 0.9825766
101 Prometheus (2012 film) 0.9711494
6 The Martian (film) 0.9691768
181 Pacific Rim (film) 0.9676412
22 Jurassic World 0.96634257
784 Spider-Man 2 0.9628183
37 Avatar (2009 film) 0.96267784
182 The Amazing Spider-Man 2 0.9584531
29 Rogue One 0.95834017


In [19]:
# Second example query against the same normalized movie embeddings
similar_movies("Titanic (1997 film)", normalized_movies)

35 Titanic (1997 film) 1.0000001
84 Saving Private Ryan 0.942979
85 Inception 0.9202326
972 Big Fish 0.91959006
303 Raiders of the Lost Ark 0.9158712
1085 Planet of the Apes (2001 film) 0.9144706
321 Charlie and the Chocolate Factory (film) 0.9129013
531 Indiana Jones and the Kingdom of the Crystal Skull 0.90984404
245 Gravity (film) 0.9077564
155 Gladiator (2000 film) 0.90726584


# Find Most Similar Wikipedia links
We also have the embeddings of the Wikipedia links (which are themselves Wikipedia pages). We can take a similar approach to extract these and find the links most similar to a query page.

In [21]:
# Get the link embedding layer by the name it was given in movie_embedding_model
link = model.get_layer('link_embedding')
# The embedding matrix: one row per entry of top_links
link_weights = link.get_weights()[0]
# Norm (length) of each link's embedding vector, i.e. of each row
link_lengths = np.linalg.norm(link_weights, axis=1)
# Divide every row by its own length; the transposes let the 1-D array of
# lengths broadcast across the rows
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link, n=10):
    """Print the ``n`` links whose embeddings are closest to ``link``.

    Uses the module-level ``normalized_links`` matrix together with
    ``link_to_idx`` and ``top_links``.

    Args:
        link: Title of the query link; must be a key of ``link_to_idx``.
        n: Number of results to print. Defaults to 10, the value that used
            to be hard-coded; ``n <= 0`` prints nothing.

    Raises:
        KeyError: If ``link`` is not present in ``link_to_idx``.

    One line is printed per result, most similar first: row index, link title
    and similarity score. The function returns ``None``.
    """
    # Dot product of the query row with every row of the normalized matrix
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])

    # Sort ascending, flip to descending and keep the first n entries.
    # Slicing the flipped order (rather than [-n:]) keeps n == 0 from
    # selecting the whole array.
    closest = np.argsort(dists)[::-1][:max(n, 0)]
    for c in closest:
        print(c, top_links[c], dists[c])

# Example query; the queried link itself ranks first with a similarity of 1
similar_links('George Lucas')

127 George Lucas 1.0
3176 Star Wars (film) 0.96305823
3696 Raiders of the Lost Ark 0.95845854
2778 Lucasfilm 0.94576746
2919 Close Encounters of the Third Kind 0.9295887
8301 Academy Award for Visual Effects 0.9214847
2884 London Symphony Orchestra 0.9202094
976 Hugo Award for Best Dramatic Presentation 0.91857505
2810 film treatment 0.9152058
2984 Saturn Award for Best Science Fiction Film 0.9124589
