This notebook takes the best model configuration found by the grid search and evaluates it with stratified k-fold cross-validation.

In [1]:
# Best configuration found so far; the keys match the keyword arguments
# of build_model, so it can be unpacked with build_model(**params).
params = dict(
    input_shape=[41],
    l1_shape=96,
    l2_shape=160,
    l3_shape=384,
    l4_shape=512,
    d1_rate=0.0,
    d2_rate=0.25,
    distance='l1',
)


In [2]:
from time import time
from pickle import load

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, StratifiedKFold
from numpy.random import seed
seed(0)

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Lambda
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras.utils.vis_utils import plot_model
from tensorflow import set_random_seed
set_random_seed(0)

Using TensorFlow backend.


In [3]:
# Feature table: the first CSV column becomes the index, which is later
# used to look rows up by movie name (see create_pairs).
df = pd.read_csv(
    './dataset.csv',
    index_col=0,
)
df.head(n=5)

Unnamed: 0,color,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,Music,Romance,Documentary,Musical,Mystery,Horror,History,Animation,Film-Noir,Sport
Avatar,1.0,0.889163,0.529412,0.0,0.037174,0.001563,1.0,0.524453,0.007361,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,1.0,0.37069,0.501548,0.024478,0.043478,0.0625,0.40684,0.278865,0.073622,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,1.0,0.740148,0.436533,0.0,0.007,0.017188,0.26308,0.163256,0.017816,0.023256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,1.0,1.0,0.486068,0.956522,1.0,0.042188,0.589253,0.677216,0.162561,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
John Carter,1.0,0.567734,0.386997,0.020652,0.023043,0.001,0.096066,0.125579,0.002852,0.023256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Load the dict that maps each movie to its similar ('pos') and
# dissimilar ('neg') movies.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# a sims.pkl that was produced by a trusted source.
with open('./sims.pkl', mode='rb') as sims_file:
    movie_sims = load(sims_file)

In [5]:
def create_pairs(movie_sims, df, k):
    """Build labelled (anchor, other) pairs and stratified CV folds.

    For every movie in ``movie_sims`` its 'pos' and 'neg' lists are walked in
    lockstep (zip stops at the shorter list); each step yields one pair
    labelled 0 (anchor, positive) and one pair labelled 1 (anchor, negative).

    Args:
        movie_sims: mapping ``movie -> {'pos': [...], 'neg': [...]}`` whose
            keys and list entries are index labels of ``df``.
        df: DataFrame with one row per movie; rows are fetched with ``df.loc``.
        k: number of folds for ``StratifiedKFold``.

    Returns:
        Tuple ``(folds, pairs, labels)``: the list of (train_idx, val_idx)
        splits, the pairs as an ndarray and the labels as an ndarray.
    """
    pairs, labels = [], []

    for anchor_name, neighbours in movie_sims.items():
        # feature vector of the anchor movie
        anchor_vec = df.loc[anchor_name].values
        # feature vectors of its similar / dissimilar movies
        similar_vecs = [df.loc[name].values for name in neighbours['pos']]
        dissimilar_vecs = [df.loc[name].values for name in neighbours['neg']]
        # one positive and one negative pair per step keeps classes balanced
        for similar_vec, dissimilar_vec in zip(similar_vecs, dissimilar_vecs):
            pairs.append([anchor_vec, similar_vec])
            pairs.append([anchor_vec, dissimilar_vec])
            labels.extend((0, 1))

    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
    folds = list(splitter.split(pairs, labels))

    return folds, np.array(pairs), np.array(labels)

In [6]:
k = 10  # number of cross-validation folds
print('Creating pairs ...')
folds, x_train, y_train = create_pairs(movie_sims=movie_sims, df=df, k=k)
print('Pairs created!')

Creating pairs ...
Pairs created!


In [7]:
# loss function
def margin_loss(y_true, y_pred):
    """Margin-based pair loss with a fixed margin of 1.

    Label 0 contributes ``0.5 * y_pred`` (pushes the prediction down);
    label 1 contributes ``0.5 * max(0, margin - y_pred)`` (pushes the
    prediction up until it reaches the margin).
    """
    m = 1
    label0_term = 0.5*(1-y_true)*y_pred
    label1_term = 0.5*y_true*K.maximum(0.0, m - y_pred)
    return label0_term + label1_term

def compute_accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed 0.5 threshold.

    Predictions strictly greater than 0.5 count as label 1, everything else
    as label 0. Both arguments are flattened first, so column-shaped
    ``(n, 1)`` arrays and flat ``(n,)`` arrays can be mixed freely.

    Args:
        y_true: array-like of 0/1 labels.
        y_pred: array-like of model outputs, one per label.

    Returns:
        Fraction of predictions that match the labels.
    '''
    pred = np.asarray(y_pred).ravel() > 0.5
    # Flatten the labels too: comparing (n,) against (n, 1) would broadcast
    # to an (n, n) matrix and silently yield a meaningless accuracy.
    truth = np.asarray(y_true).ravel()
    return np.mean(pred == truth)

def build_model(input_shape,
                l1_shape,
                l2_shape,
                l3_shape,
                l4_shape,
                d1_rate,
                d2_rate,
                distance):
    """Build and compile a siamese network that scores a pair of vectors.

    Both inputs go through one shared base network (four Dense layers with
    optional Dropout after the first two). The element-wise distance between
    the two embeddings is then fed to a single sigmoid unit.

    Args:
        input_shape: shape of one input vector, forwarded to ``Input(shape=...)``.
        l1_shape: units of the first Dense layer (relu).
        l2_shape: units of the second Dense layer (relu).
        l3_shape: units of the third Dense layer (relu).
        l4_shape: units of the fourth Dense layer (sigmoid), i.e. the
            embedding size.
        d1_rate: dropout rate after the first Dense layer; a falsy value
            (e.g. 0.0) skips the Dropout layer entirely.
        d2_rate: dropout rate after the second Dense layer; same convention.
        distance: ``'l1'`` or ``'l2'``, the element-wise distance applied to
            the two embeddings.

    Returns:
        A Keras ``Model`` taking ``[input_a, input_b]``, compiled with
        ``margin_loss``, a fresh ``RMSprop`` optimizer and the accuracy metric.

    Raises:
        ValueError: if ``distance`` is neither ``'l1'`` nor ``'l2'``.
    """
    # Resolve the distance first so a bad value fails before any layer is
    # added to the backend graph.
    if distance == 'l1':
        d = lambda x: K.abs(x[0] - x[1])
    elif distance == 'l2':
        # sqrt(square(x)) has an unbounded derivative at x == 0, which turns
        # gradients into NaN whenever two embeddings coincide. Clamping the
        # squared difference to K.epsilon() keeps the gradient finite.
        d = lambda x: K.sqrt(K.maximum(K.square(x[0] - x[1]), K.epsilon()))
    else:
        raise ValueError('bad dist')

    def build_base_network(input_shape,
                           l1_shape,
                           l2_shape,
                           l3_shape,
                           l4_shape,
                           d1_rate,
                           d2_rate):
        """Embedding tower shared by both branches of the siamese model."""
        i = Input(shape=input_shape)
        x = Dense(l1_shape, activation='relu')(i)
        if d1_rate:
            x = Dropout(d1_rate)(x)
        x = Dense(l2_shape, activation='relu')(x)
        if d2_rate:
            x = Dropout(d2_rate)(x)
        x = Dense(l3_shape, activation='relu')(x)
        x = Dense(l4_shape, activation='sigmoid')(x)
        return Model(i, x)

    base_network = build_base_network(input_shape,
                                      l1_shape,
                                      l2_shape,
                                      l3_shape,
                                      l4_shape,
                                      d1_rate,
                                      d2_rate)

    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)

    # The same base_network instance is applied to both inputs, so the two
    # branches share their weights.
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    # The distance is element-wise, so the output keeps the embedding shape.
    dist = Lambda(d,
                  output_shape=lambda x: x[0])([processed_a, processed_b])
    pred = Dense(1, activation='sigmoid')(dist)
    # `inputs=` is the Keras 2 keyword; `input=` is the deprecated Keras 1 name.
    model = Model(inputs=[input_a, input_b], outputs=[pred])
    model.compile(loss=[margin_loss], optimizer=RMSprop(), metrics=['accuracy'])
    return model

In [8]:
# Stratified k-fold evaluation: a fresh model is trained on every fold and
# scored on that fold's held-out pairs with compute_accuracy.
scores = []
epochs = 30  # upper bound per fold; EarlyStopping may end training sooner

for j, (train_idx, val_idx) in enumerate(folds):
    print(f'\nFold {j}')
    x_train_cv = x_train[train_idx]
    y_train_cv = y_train[train_idx]
    x_valid_cv = x_train[val_idx]
    y_valid_cv = y_train[val_idx]

    # Axis 1 of the pair array holds the two members of each pair; the
    # siamese model takes them as two separate inputs.
    train_inputs = [x_train_cv[:, 0], x_train_cv[:, 1]]
    valid_inputs = [x_valid_cv[:, 0], x_valid_cv[:, 1]]

    # build_model compiles the network with its own optimizer, so no
    # optimizer needs to be created here.
    model = build_model(**params)
    # A new callback per fold so patience/best-weights state never carries over.
    early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
    history = model.fit(train_inputs, y_train_cv,
                        batch_size=128,
                        epochs=epochs,
                        validation_data=(valid_inputs, y_valid_cv),
                        callbacks=[early_stopping],
                        verbose=0)
    y_pred = model.predict(valid_inputs)
    te_acc = compute_accuracy(y_valid_cv, y_pred)
    print(f'Accuracy: {te_acc}')
    scores.append(te_acc)

print(f'Avg accuracy: {sum(scores)/len(scores)}')
    


Fold 0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/kuba/anaconda3/envs/lsh_tf36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-da02e3d4f690>", line 16, in <module>
    validation_data=([x_valid_cv[:, 0], x_valid_cv[:, 1]], y_valid_cv), callbacks=[early_stopping], verbose=0)
  File "/home/kuba/anaconda3/envs/lsh_tf36/lib/python3.6/site-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/home/kuba/anaconda3/envs/lsh_tf36/lib/python3.6/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/home/kuba/anaconda3/envs/lsh_tf36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/home/kuba/anaconda3/envs/lsh_tf36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched 

KeyboardInterrupt: 