In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape, Dense
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Using TensorFlow backend.


In [2]:
books = []

# NDJSON (one JSON document per line) is UTF-8 encoded; state the encoding
# explicitly so the read does not depend on the platform's default locale.
with open('/data/books.ndjson', encoding='utf-8') as fin:
    # Iterate the file object directly: lines are streamed one at a time
    # instead of first being materialised into a list by readlines().
    for line in fin:
        # json.loads parses a JSON document held in a string (here: one line)
        books.append(json.loads(line))

len(books)

36552

In [3]:
# Tally how often each link occurs over the whole collection.
# NOTE(review): assumes book[2] is an iterable of link titles (one entry per
# link on the book's page) — TODO confirm against the data file.
link_counts = Counter()

for book in books:
    link_counts.update(book[2])
    
# Show the ten most frequent links together with their counts.
# NOTE(review): titles that differ only by case are counted separately
# (the output lists both 'Science fiction' and 'science fiction').
link_counts.most_common(10)

[('Hardcover', 7603),
 ('Paperback', 7451),
 ('Wikipedia:WikiProject Books', 6081),
 ('Wikipedia:WikiProject Novels', 6053),
 ('English language', 4259),
 ('The New York Times', 3909),
 ('United States', 3366),
 ('Science fiction', 3105),
 ('science fiction', 2648),
 ('Publishers Weekly', 2350)]

In [4]:
# Field 0 of a record: the book title (shown here for an example book)
books[10][0]

'Blade Runner 2: The Edge of Human'

In [5]:
# Field 1 of a record: a dict of attributes for the same example book
books[10][1]

{'name': 'Blade Runner 2: < br > The Edge of Human',
 'author': 'K. W. Jeter',
 'language': 'English',
 'country': 'United States',
 'genre': 'Science fiction',
 'isbn': '0-553-09979-5',
 '1': '< !-- See Wikipedia:WikiProject_Novels or Wikipedia:WikiProject_Books -- >',
 'image': 'File:Blade Runner 2 The Edge of Human KW Jeter cover.jpg',
 'series': 'Blade Runner',
 'release_date': 'October 1, 1995',
 'media_type': 'Print (Hardcover, Paperback)',
 'pages': '340',
 'dewey': '813/.54 20',
 'congress': 'PS3560.E85 B58 1995',
 'oclc': '32548543',
 'preceded_by': 'Do Androids Dream of Electric Sheep?',
 'followed_by': 'Replicant Night',
 'caption': 'Cover of the first edition'}

In [6]:
# Keep only the links that occur often enough to be worth embedding
top_links = [name for name, count in link_counts.items() if count > 4]

# Two-way lookup between a link title and its integer index
link_to_idx = dict(zip(top_links, range(len(top_links))))
idx_to_link = dict(enumerate(top_links))

# Two-way lookup between a book title (field 0 of a record) and its index
book_to_idx = {record[0]: position for position, record in enumerate(books)}
idx_to_book = {position: title for title, position in book_to_idx.items()}

print(f'There are {len(top_links)} links with more than 4 occurrences.')
print(f'There are {len(book_to_idx)} books.')

There are 38357 links with more than 4 occurrences.
There are 36552 books.


In [None]:
# Build the positive training examples: one (link index, book index) tuple
# for every retained link found on a book's page.
pairs = []
for book in books:
    # link_to_idx has exactly the entries of top_links as keys, so testing
    # membership against the dict is equivalent to `link in top_links` but is
    # a constant-time hash lookup instead of a linear scan of the whole list.
    pairs.extend((link_to_idx[link],
                  book_to_idx[book[0]]) for link in book[2] if link in link_to_idx)

len(pairs)
pairs[-10]

# Set form of the same pairs, for fast "is this a true pair?" checks
pairs_set = set(pairs)
len(pairs_set)

In [None]:
# Spot-check one pair: the raw (link index, book index) tuple, followed by
# the link title and the book title those indices decode to.
pairs[1500]
idx_to_link[pairs[1500][0]]
idx_to_book[pairs[1500][1]]

In [None]:
# Spot-check a second pair the same way: raw (link index, book index) tuple,
# then the decoded link title and book title.
pairs[2100]
idx_to_link[pairs[2100][0]]
idx_to_book[pairs[2100][1]]

In [None]:
def book_embedding_model(embedding_size=50):
    """Build and compile the link/book embedding classifier.

    The model takes a link index and a book index, looks each one up in its
    own embedding table, combines the two vectors with a normalized dot
    product and feeds that single score through one sigmoid unit.

    Args:
        embedding_size: length of every link and book embedding vector.

    Returns:
        A Keras ``Model`` with inputs ``link`` and ``book``, compiled with the
        Adam optimizer, binary cross-entropy loss and the accuracy metric.
        The embedding table sizes come from the notebook-level ``top_links``
        and ``book_to_idx``.
    """

    def embed(indices, layer_name, vocabulary_size):
        # Look the indices up, then reshape the result to
        # (batch_size, embedding_size)
        vectors = Embedding(name=layer_name,
                            input_dim=vocabulary_size,
                            output_dim=embedding_size)(indices)
        return Reshape([embedding_size])(vectors)

    # Both inputs are (batch_size, 1)
    link = Input(name='link', shape=[1])
    book = Input(name='book', shape=[1])

    link_vector = embed(link, 'link_embedding', len(top_links))
    book_vector = embed(book, 'book_embedding', len(book_to_idx))

    # Normalized dot product taken over the embedding axis
    similarity = Dot(name='dot_product', normalize=True,
                     axes=1)([link_vector, book_vector])
    prediction = Dense(1, activation='sigmoid', name='output')(similarity)

    model = Model(inputs=[link, book], outputs=[prediction])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# Build the model with the default embedding size (50) and list its layers
model = book_embedding_model()
model.summary()

In [None]:
def batcher(pairs, num_pos=1, neg_ratio=3, num_links=None, num_books=None,
            positives=None):
    """Endlessly yield shuffled batches of positive and negative samples.

    Every batch contains ``num_pos`` true (link, book) pairs drawn from
    ``pairs`` with label 1, plus ``num_pos * neg_ratio`` randomly drawn
    (link, book) combinations that are not true pairs, with label 0.

    Args:
        pairs: sequence of ``(link_idx, book_idx)`` tuples (the true pairs).
        num_pos: number of positive samples per batch.
        neg_ratio: number of negative samples generated per positive sample.
        num_links: exclusive upper bound for random link indices; defaults to
            ``len(top_links)`` from the notebook namespace.
        num_books: exclusive upper bound for random book indices; defaults to
            ``len(book_to_idx)`` from the notebook namespace.
        positives: optional set of the true pairs used to reject accidental
            positives; built from ``pairs`` when omitted.

    Yields:
        ``({'link': link_ids, 'book': book_ids}, labels)`` where each value is
        a 1-D array of length ``num_pos * (1 + neg_ratio)``.

    Note:
        Negative sampling retries until it finds a non-pair, so it does not
        terminate if every possible (link, book) combination is a true pair.
    """
    if num_links is None:
        num_links = len(top_links)
    if num_books is None:
        num_books = len(book_to_idx)
    if positives is None:
        positives = set(pairs)

    batch_size = num_pos * (1 + neg_ratio)
    while True:
        # A fresh array per batch: the yielded columns are views into it, so
        # reusing one buffer would overwrite batches the consumer still holds.
        batch = np.zeros((batch_size, 3))

        # Sample random positive samples
        for idx, (link_id, book_id) in enumerate(random.sample(pairs, num_pos)):
            batch[idx, :] = (link_id, book_id, 1)

        # Negatives fill the remaining rows (also well-defined for num_pos == 0)
        idx = num_pos
        while idx < batch_size:
            # Sample a random book and a random link
            book_id = random.randrange(num_books)
            link_id = random.randrange(num_links)

            # Pairs are stored as (link, book); use the same order for the
            # lookup and for the row so columns 0/1 always mean link/book.
            if (link_id, book_id) not in positives:
                # This is a negative sample
                batch[idx, :] = (link_id, book_id, 0)
                idx += 1

        np.random.shuffle(batch)

        # Inputs to model and output
        yield {'link': batch[:, 0], 'book': batch[:, 1]}, batch[:, 2]

In [None]:
# Pull a single batch to inspect the generator's output:
# 5 positives with 4 negatives each gives 5 * (1 + 4) = 25 rows.
next(batcher(pairs, num_pos = 5, neg_ratio = 4))

In [None]:
# Positive samples per batch; with 4 negatives per positive every batch has
# 256 * (1 + 4) = 1280 rows.
num_pos_samples = 256

# NOTE(review): both generators sample from the same `pairs`, so the
# validation metrics are not measured on held-out data.
train_gen = batcher(pairs, num_pos_samples, 4)
valid_gen = batcher(pairs, num_pos_samples, 4)

model.fit_generator(
        # Set the generator
        train_gen,
        epochs = 15,
        # Batches per epoch: about len(pairs) positive samples are drawn per epoch
        steps_per_epoch = len(pairs) // num_pos_samples,
        validation_data = valid_gen,
        # Validation batches per epoch: a tenth of the training batches
        validation_steps = (len(pairs) // num_pos_samples) // 10,
        verbose = 2)


In [None]:
# Pair every book's title (field 0) with the size of its link field (field 2)
book_link_counts = [(record[0], len(record[2])) for record in books]

In [None]:
# Preview the first ten (title, link count) tuples
book_link_counts[:10]