In [36]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

In [37]:
# Debug check: printing the module object shows which sklearn.svm installation was imported.
# NOTE(review): the printed repr contains the local install path; consider clearing this
# output before sharing the notebook.
print(svm)

<module 'sklearn.svm' from 'c:\\users\\kai ji\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\sklearn\\svm\\__init__.py'>


In [38]:
# The .ndjson file holds one JSON document per line; each line becomes one record.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (which is not UTF-8 on every system, e.g. Windows).
with open('./wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(l) for l in fin]
# Only the first 500 records are used by the rest of the notebook.
movies = movies[:500]

In [39]:
# Accumulate counts over index 2 of every movie record.
# Presumably movie[2] is the record's list of links, so each occurrence adds 1 —
# TODO confirm against the data file (Counter.update treats a mapping differently).
link_counts = Counter()
# NOTE(review): `count` is not read by any visible cell.
count = 0
for movie in movies:
    link_counts.update(movie[2])



Look at the 10 most common links:

In [40]:
# Keep only the links whose count in link_counts is at least 3.
top_links = [link for link, c in link_counts.items() if c >= 3]
# A notebook cell only displays its last expression, so the most-common
# listing goes last; placed first, its value was computed and discarded.
link_counts.most_common(10)

`top_links` keeps only the links whose count in the `link_counts` Counter is at least 3; rarer links are dropped.

In [41]:
# Dense integer ids: position in top_links for links, position in movies for titles
# (index 0 of a movie record is used as its title key).
link_to_idx = {name: position for position, name in enumerate(top_links)}
movie_to_idx = {record[0]: position for position, record in enumerate(movies)}

# One (link_id, movie_id) tuple per entry of movie[2] that survived the
# top_links filter, in movie order.
pairs = [
    (link_to_idx[link], movie_to_idx[movie[0]])
    for movie in movies
    for link in movie[2]
    if link in link_to_idx
]
# Set copy of the pairs for fast membership tests.
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(92427, 10281, 500)

In [42]:
movie_to_idx

{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

The function below builds a model with two embedding layers (one for links, one for movies), each of size 50 by default.
Think of an embedding as a short vector of numbers that squeezes high-dimensional data into fewer dimensions. We will use the embeddings to compute the similarity of movies.

In [43]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes two single-value inputs named 'link' and 'movie', looks
    each one up in its own embedding table, and outputs the normalized dot
    product of the two looked-up vectors, reshaped to one value per sample.

    Args:
        embedding_size: Width of both embedding tables.

    Returns:
        A Keras ``Model`` compiled with the 'nadam' optimizer and 'mse' loss.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    # One embedding row per entry of top_links / movie_to_idx.
    link_vectors = Embedding(
        input_dim=len(top_links),
        output_dim=embedding_size,
        name='link_embedding',
    )(link_input)
    movie_vectors = Embedding(
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
        name='movie_embedding',
    )(movie_input)

    # normalize=True L2-normalizes both vectors before the dot product.
    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vectors, movie_vectors])
    prediction = Reshape((1,))(similarity)

    net = Model(inputs=[link_input, movie_input], outputs=[prediction])
    net.compile(optimizer='nadam', loss='mse')
    return net

In [44]:

# Build the model with the default embedding size and print its layer table.
model = movie_embedding_model()
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        514050      link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        25000       movie[0][0]                      
____________________________________________________________________________________________

In [45]:
def batchifier(pairs, positive_samples=50, negative_ratio=10):
    """Endlessly yield shuffled training batches of positive and negative pairs.

    Each batch holds ``positive_samples`` pairs drawn from ``pairs`` (label 1)
    plus ``positive_samples * negative_ratio`` randomly generated
    (link_id, movie_id) combinations that are not in ``pairs_set`` (label -1).

    Args:
        pairs: Sequence of (link_id, movie_id) tuples to sample positives from.
        positive_samples: Number of positive rows per batch.
        negative_ratio: Number of negative rows generated per positive row.

    Yields:
        A tuple ``({'link': link_ids, 'movie': movie_ids}, labels)`` of three
        1-D arrays of length ``positive_samples * (1 + negative_ratio)``.

    Raises:
        ValueError: From ``random.sample`` if ``positive_samples`` exceeds
            ``len(pairs)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are views
        # into it, so reusing a single buffer would overwrite any batch the
        # consumer still holds (e.g. in a prefetch queue).
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection sampling: keep drawing random combinations until enough
        # of them are not known positive pairs.
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        # Shuffle rows so positives and negatives are interleaved.
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]
        
# Smoke test: pull one small batch (3 positives + 6 negatives = 9 rows) to inspect its shape.
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 5676.,  4561.,  1977.,  1645.,  3784.,   311., 10059.,   453.,
          4649.]),
  'movie': array([449., 182., 207., 346., 135., 479., 420.,  31.,  98.])},
 array([-1.,  1., -1., -1., -1.,  1., -1.,  1., -1.]))

In [46]:
# Positive pairs per batch; with negative_ratio=10 each batch has 512 * 11 rows.
positive_samples_per_batch = 512

model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    # The generator never ends, so the epoch length must be given explicitly:
    # one epoch draws about as many positive samples as there are pairs.
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
180/180 - 5s - loss: 0.7522
Epoch 2/15
180/180 - 5s - loss: 0.2787
Epoch 3/15
180/180 - 5s - loss: 0.2555
Epoch 4/15
180/180 - 5s - loss: 0.2459
Epoch 5/15
180/180 - 5s - loss: 0.2397
Epoch 6/15
180/180 - 6s - loss: 0.2373
Epoch 7/15
180/180 - 6s - loss: 0.2351
Epoch 8/15
180/180 - 5s - loss: 0.2341
Epoch 9/15
180/180 - 5s - loss: 0.2337
Epoch 10/15
180/180 - 5s - loss: 0.2333
Epoch 11/15
180/180 - 5s - loss: 0.2332
Epoch 12/15
180/180 - 5s - loss: 0.2332
Epoch 13/15
180/180 - 6s - loss: 0.2330
Epoch 14/15
180/180 - 5s - loss: 0.2320
Epoch 15/15
180/180 - 5s - loss: 0.2327


<keras.callbacks.History at 0x250dc9062b0>

In [47]:
# Pull the trained movie embedding matrix out of the model (first weight array of the layer).
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
# L2 norm of every row, then divide each row by its norm so all rows have unit
# length; dot products between rows are then cosine similarities.
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie, top_n=10):
    """Print the movies whose embeddings are most similar to ``movie``.

    Similarity is the dot product between rows of ``normalized_movies``.
    One line is printed per result, best match first: row index, title
    (index 0 of the movie record), and similarity score.

    Args:
        movie: Title to look up in ``movie_to_idx``.
        top_n: Number of results to print. Defaults to 10, the count that
            was previously hard-coded. Values <= 0 print nothing.

    Raises:
        KeyError: If ``movie`` is not a key of ``movie_to_idx``.
    """
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # Reverse the ascending argsort and slice from the front; slicing the
    # ascending order with [-top_n:] would return every row for top_n == 0.
    closest = np.argsort(dists)[::-1][:max(top_n, 0)]
    for c in closest:
        print(c, movies[c][0], dists[c])

# Example query; the title must be a key of movie_to_idx.
similar_movies('Rogue One')

29 Rogue One 1.0
25 Star Wars sequel trilogy 0.93790436
44 Return of the Jedi 0.916988
74 The Empire Strikes Back 0.9077035
421 Spaceballs 0.87386966
311 E.T. the Extra-Terrestrial 0.8565561
254 Passengers (2016 film) 0.8536506
22 Jurassic World 0.848452
86 Tomorrowland (film) 0.84841776
180 Labyrinth (film) 0.84579283


In [50]:
# Pull the trained link embedding matrix out of the model (first weight array of the layer).
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
# L2 norm of every row, then divide each row by its norm so all rows have unit
# length; dot products between rows are then cosine similarities.
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link, top_n=10):
    """Print the links whose embeddings are most similar to ``link``.

    Similarity is the dot product between rows of ``normalized_links``.
    One line is printed per result, best match first: row index, link name
    (from ``top_links``), and similarity score.

    Args:
        link: Link name to look up in ``link_to_idx``.
        top_n: Number of results to print. Defaults to 10, the count that
            was previously hard-coded. Values <= 0 print nothing.

    Raises:
        KeyError: If ``link`` is not a key of ``link_to_idx``.
    """
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    # Reverse the ascending argsort and slice from the front; slicing the
    # ascending order with [-top_n:] would return every row for top_n == 0.
    closest = np.argsort(dists)[::-1][:max(top_n, 0)]
    for c in closest:
        print(c, top_links[c], dists[c])

# Example query; the name must be a key of link_to_idx.
similar_links('George Lucas')

112 George Lucas 1.0000001
2085 Lucasfilm 0.96520656
2016 Harrison Ford 0.9515881
1912 Mark Hamill 0.92811567
2370 Star Wars (film) 0.9277312
1250 VHS 0.9248808
2040 Jaws (film) 0.9211479
2027 Star Wars 0.9205619
2060 Chewbacca 0.9172408
2127 Elstree Studios 0.9139655
