In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
import random
import matplotlib.patheffects as PathEffects

from sklearn.metrics.pairwise import pairwise_distances

from random import sample

In [2]:
import tensorflow as tf

In [3]:
from keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, concatenate

from keras.models import Model, Sequential
from keras.regularizers import l2
from keras import backend as K
from keras.optimizers import SGD,Adam
from keras.losses import binary_crossentropy
import os
import pickle
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [4]:
from itertools import permutations
import seaborn as sns

from keras.datasets import mnist
from sklearn.manifold import TSNE

from sklearn.svm import SVC

In [5]:
import spacy
import en_core_web_sm

import re

from sklearn.decomposition import PCA

'ratings.csv' dataset consists of 
- the user who rated the movie (userID)
- the movie is rated (movieID)
- the rating given by the user for that particular movie (rating)
- the time at which the rating was recorded (timestamp)

In [6]:
df_ratings = pd.read_csv('ml-25m/ratings.csv')
df_ratings.shape

(25000095, 4)

'movies.csv' dataset consists of
- the movie id (movieID)
- the movie title (title)
- the genres (genres)

In [7]:
df_movie = pd.read_csv('ml-25m/movies.csv')
df_movie.shape

(62423, 3)

In [8]:
nlp = en_core_web_sm.load()

# import the list of stop words from the spacy library
from spacy.lang.en.stop_words import STOP_WORDS

def remove_stop_words(text):
    return ' '.join([word for word in text.split(' ') if word.lower() not in STOP_WORDS])

print(remove_stop_words('why is my dog on the drugs'))


dog drugs


In [9]:
## Genres:

# Break up the big genre string into a string array
df_movie['genres'] = df_movie['genres'].str.split('|')

# Convert genres to string value
df_movie['genres'] = df_movie['genres'].fillna("")

In [10]:
## Title:

# regular expression to extract year and title
p1 = re.compile(r'[(](.*?)[)]', re.S)

df_movie['title_l'] = df_movie['title'].apply(lambda x: re.findall(r"[\w']+", x))

In [11]:
def get_word_vec(l):
    return nlp(remove_stop_words(' '.join(l))).vector

# Combine movie and rating datasets
## Sample a small dataset of ratings

In [12]:
small_data = df_ratings.sample(frac = 0.001)
small_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 547145 to 16285937
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     25000 non-null  int64  
 1   movieId    25000 non-null  int64  
 2   rating     25000 non-null  float64
 3   timestamp  25000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 KB


In [13]:
small_data = pd.merge(small_data, df_movie, how = 'left', on = 'movieId')
print(small_data.shape)

print(len(set(small_data['movieId'])), len(set(small_data['userId'])))

(25000, 7)
5762 20078


In [14]:
small_data.drop(['timestamp'], inplace = True, axis = 1)

small_data.head()

Unnamed: 0,userId,movieId,rating,title,genres,title_l
0,3754,3478,4.0,"Bamba, La (1987)",[Drama],"[Bamba, La, 1987]"
1,97290,356,5.0,Forrest Gump (1994),"[Comedy, Drama, Romance, War]","[Forrest, Gump, 1994]"
2,40846,3967,4.0,Billy Elliot (2000),[Drama],"[Billy, Elliot, 2000]"
3,155034,1197,5.0,"Princess Bride, The (1987)","[Action, Adventure, Comedy, Fantasy, Romance]","[Princess, Bride, The, 1987]"
4,91517,1222,5.0,Full Metal Jacket (1987),"[Drama, War]","[Full, Metal, Jacket, 1987]"


In [15]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)

print(train_data.shape, test_data.shape)

(20000, 6) (5000, 6)


In [16]:
train_movie_data = train_data[['movieId', 'title', 'title_l', 'genres']].drop_duplicates('movieId')
train_movie_data = train_movie_data.sort_values(by = 'movieId').reset_index(drop = True)

print(train_movie_data.shape)

test_movie_data = test_data[['movieId', 'title', 'title_l', 'genres']].drop_duplicates('movieId')
test_movie_data = test_movie_data.sort_values(by = 'movieId').reset_index(drop = True)

print(test_movie_data.shape)

(5211, 4)
(2462, 4)


Individual movie content feature: title and genres, two word vectors

In [28]:
train_movie_title_vec = pd.DataFrame(0, index=train_movie_data['movieId'], 
                                   columns=range(96))
train_movie_genre_vec = pd.DataFrame(0, index=train_movie_data['movieId'], 
                                   columns=range(96))

test_movie_title_vec = pd.DataFrame(0, index=test_movie_data['movieId'], 
                                   columns=range(96))
test_movie_genre_vec = pd.DataFrame(0, index=test_movie_data['movieId'], 
                                   columns=range(96))

print(train_movie_title_vec.shape, test_movie_title_vec.shape)

# Training
for i in range(train_movie_data.shape[0]):
    train_movie_title_vec.loc[train_movie_data['movieId'][i], :] = pd.Series(get_word_vec(train_movie_data['title_l'][i]))
    train_movie_genre_vec.loc[train_movie_data['movieId'][i], :] = pd.Series(get_word_vec(train_movie_data['genres'][i]))
    
# Testing
for i in range(test_movie_data.shape[0]):
    test_movie_title_vec.loc[test_movie_data['movieId'][i], :] = pd.Series(get_word_vec(test_movie_data['title_l'][i]))
    test_movie_genre_vec.loc[test_movie_data['movieId'][i], :] = pd.Series(get_word_vec(test_movie_data['genres'][i]))


(5211, 96) (2462, 96)


In [29]:
train_movie_title_vec.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-2.355089,0.991829,0.153801,-0.195376,-0.939769,1.062713,-0.087003,0.965813,0.811997,0.087335,...,-1.589762,-0.810004,2.363398,1.016425,-1.534584,1.296966,0.531302,-3.913785,-1.275919,-0.685872
2,-1.777861,0.856493,-1.002494,0.656409,-0.826132,1.198098,-0.002911,-1.111856,0.393813,-0.333179,...,-1.41574,0.753537,1.89915,1.297991,-2.89357,1.988402,-0.386679,-2.99033,-0.755991,-0.047538
3,-0.28788,1.461926,0.178619,-0.025045,-1.432144,2.035658,-1.72665,-2.118157,2.140284,0.017525,...,-1.32566,-0.519488,1.25651,1.914781,-1.083571,1.464755,1.668898,-3.705204,-1.100169,-0.093394
4,-0.788493,1.661333,-1.323206,-0.349413,-1.822633,0.092523,-0.922329,-1.570007,0.695472,2.254215,...,-0.498176,-1.35496,0.883165,1.006218,-1.923185,-0.453417,-0.569848,-1.409097,-1.022384,-0.884098
5,-1.674872,1.723574,0.676185,-0.567428,-1.522431,0.200554,-1.261009,0.501906,0.974284,-1.412346,...,-0.83555,-1.15407,1.955808,1.720978,-1.634962,1.351146,0.846751,-2.949474,-0.592519,-0.887223


In [35]:
train_movie_data_vec = (train_movie_title_vec + train_movie_genre_vec)/2
test_movie_data_vec = (test_movie_title_vec + test_movie_genre_vec)/2

Calculate the similarity matrix based on the content features

In [36]:
movie_correlation = 1 - pairwise_distances(train_movie_data_vec, metric = 'correlation')
movie_correlation[np.isnan(movie_correlation)] = 0
print(movie_correlation.shape)

(5211, 5211)


In [37]:
movie_correlation[:4, :4]

array([[1.        , 0.85711685, 0.8821668 , 0.84976041],
       [0.85711685, 1.        , 0.75203811, 0.79207242],
       [0.8821668 , 0.75203811, 1.        , 0.86057745],
       [0.84976041, 0.79207242, 0.86057745, 1.        ]])

# Construct co-watched/rated graph for movies
An edge exists between two movies if many users rated both movies

In [38]:
train_user_data = train_data[['userId', 'movieId']]
train_user_data = train_user_data.sort_values(by = 'movieId').reset_index(drop = True)
train_user_data.head()

Unnamed: 0,userId,movieId
0,62216,1
1,60962,1
2,115090,1
3,96463,1
4,16367,1


In [39]:
len(set(train_user_data['movieId'])), train_user_data.shape[0]

(5211, 20000)

Construct a user dictionary where the key is the movieId and the values are the userIds who rated this movie

In [40]:
d_user = {}
# 
for i in range(train_user_data.shape[0]):
    if train_user_data['movieId'][i] not in d_user:
        d_user[train_user_data['movieId'][i]] = []
    else:
        d_user[train_user_data['movieId'][i]].append(train_user_data['userId'][i])

print(len(d_user))

5211


Create movie-to-movie user co-watched (rated) matrix
- cell_{i, j}: # of users who watched/rated both movie i and j

In [41]:
movie_cowatched = np.zeros((len(train_movie_data['movieId']), len(train_movie_data['movieId'])))

for r in range(movie_cowatched.shape[0]):
    for c in range(r+1, movie_cowatched.shape[1]):
        movie_cowatched[r, c] = len(set(d_user[train_movie_data['movieId'][r]]\
                                ).intersection(set(d_user[ train_movie_data['movieId'][c]])))

# Symmetric matrix for co-watched movie matrix
movie_cowatched.T.sum(), movie_cowatched.sum()
movie_cowatched_s = movie_cowatched + movie_cowatched.T

print(movie_cowatched_s.shape)

(5211, 5211)


In [42]:
movie_cowatched_s[:4, :4]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [43]:
movie_cowatched.sum(), movie_cowatched.shape

(1877.0, (5211, 5211))

# CDML: Triplet NN

### Get a list of all movie pairs with # of cowatched > 0

In [44]:
cowatched_list = []
# Getting the list of all pairs with # of cowatched > 0
for r in range(movie_cowatched_s.shape[0]):
    for c in range(r+1, movie_cowatched_s.shape[1]):
        if movie_cowatched_s[r, c] > 0:
            cowatched_list.append((r, c))
            
print(len(cowatched_list), cowatched_list[:5])

1873 [(0, 773), (0, 932), (0, 1158), (0, 1608), (0, 1660)]


### Construct a dictionary with key = movieId, and values = set of index who have # of cowatched = 0

In [45]:
zero_cowatched_dict = {}

for r in range(movie_cowatched_s.shape[0]):
    zero_cowatched_dict[r] = set(c for c in range(movie_cowatched_s.shape[0]) \
                              if movie_cowatched_s[r, c] == 0 and r != c)


# Data Preprocessing

In [46]:
x_train_movie_title = train_movie_title_vec.values
x_train_movie_genre = train_movie_genre_vec.values

x_test_movie_title = test_movie_title_vec.values
x_test_movie_genre = test_movie_genre_vec.values

print(x_train_movie_title.shape, x_test_movie_title.shape)

(5211, 96) (2462, 96)


In [47]:
x_train_movie_title[:4, :4]

array([[-2.35508943,  0.9918291 ,  0.15380089, -0.1953755 ],
       [-1.77786136,  0.85649323, -1.00249374,  0.65640938],
       [-0.28787977,  1.46192575,  0.17861867, -0.0250445 ],
       [-0.78849339,  1.66133308, -1.32320559, -0.34941271]])

### Generate triplets

In [52]:

def generate_triplet(x_train_movie, cowatched_list, zero_cowatched_dict, ap_pairs, an_pairs, testsize):
 
    #ap_pairs, an_pairs = 10, 10
    #testsize = 0.2 

    trainsize = 1 - testsize
    triplet_train_pairs = []
    triplet_test_pairs = []

    A_P_pairs = random.sample(cowatched_list, k = ap_pairs)
    Neg_idx = []
    for p in range(len(A_P_pairs)):
        Neg_idx.append(sample(zero_cowatched_dict[A_P_pairs[p][0]].intersection(zero_cowatched_dict[A_P_pairs[p][1]]), 1)[0])

    # Train
    A_P_len = len(A_P_pairs)
    Neg_len = len(Neg_idx)
    train_i = 0
    for ap in A_P_pairs[:int(A_P_len*trainsize)]:
        # print(ap, train_i)
        Anchor = x_train_movie[ap[0]]
        Positive = x_train_movie[ap[1]]
        Negative = x_train_movie[Neg_idx[train_i]]
        triplet_train_pairs.append([Anchor, Positive, Negative])
        train_i += 1

    # Test
    test_i = int(A_P_len*trainsize)
    for ap in A_P_pairs[int(A_P_len*trainsize):]:
        #print(ap, test_i)
        Anchor = x_train_movie[ap[0]]
        Positive = x_train_movie[ap[1]]
        Negative = x_train_movie[Neg_idx[test_i]]
        triplet_test_pairs.append([Anchor, Positive, Negative])
        test_i += 1
    
    return np.array(triplet_train_pairs), np.array(triplet_test_pairs)

In [53]:
X_train_title, X_test_title = generate_triplet(x_train_movie_title, cowatched_list, zero_cowatched_dict, \
                                   ap_pairs=1000, an_pairs=1000,testsize=0.2)

X_train_genre, X_test_genre = generate_triplet(x_train_movie_genre, cowatched_list, zero_cowatched_dict, \
                                   ap_pairs=1000, an_pairs=1000,testsize=0.2)

X_train_title.shape, X_test_title.shape

((800, 3, 96), (200, 3, 96))

## Optimize a ranking Triplet loss

In [54]:
def triplet_loss(y_true, y_pred, alpha = 0.4):
    """
    Implementation of the triplet loss function
    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
    Returns:
    loss -- real number, value of the loss
    """
    print('y_pred.shape = ',y_pred)
    
    total_lenght = y_pred.shape.as_list()[-1]
#     print('total_lenght=',  total_lenght)
#     total_lenght =12
    
    anchor = y_pred[:,0:int(total_lenght*1/3)]
    positive = y_pred[:,int(total_lenght*1/3):int(total_lenght*2/3)]
    negative = y_pred[:,int(total_lenght*2/3):int(total_lenght*3/3)]

    # distance between the anchor and the positive
    pos_dist = K.sum(K.square(anchor-positive),axis=1)

    # distance between the anchor and the negative
    neg_dist = K.sum(K.square(anchor-negative),axis=1)

    # compute loss
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)
 
    return loss

In [67]:
def create_base_network(in_dims):
    """
    Base network to be shared.
    """
    model = Sequential()
    model.add(Conv2D(128,(7,7),padding='same',input_shape=(in_dims[0],in_dims[1],in_dims[2],),activation='relu',name='conv1'))
    model.add(MaxPooling2D((2,2),(2,2),padding='same',name='pool1'))
    model.add(Conv2D(256,(5,5),padding='same',activation='relu',name='conv2'))
    model.add(MaxPooling2D((2,2),(2,2),padding='same',name='pool2'))
    model.add(Flatten(name='flatten'))
    model.add(Dense(96,name='embeddings')) # No activation on final dense layer
    # model.add(Lambda(lambda x: tf.math.l2_normalize(x, axis = 1))) # L2 normalize embeddings
    # model.add(Dense(600))
    
    return model

In [68]:
adam_optim = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999)

In [69]:
anchor_input = Input((96,1,1,), name='anchor_input')
positive_input = Input((96,1,1,), name='positive_input')
negative_input = Input((96,1,1,), name='negative_input')

# Shared embedding layer for positive and negative items
Shared_DNN = create_base_network([96,1,1,])
# Shared_DNN = create_base_network([12,8,1,])


encoded_anchor = Shared_DNN(anchor_input)
encoded_positive = Shared_DNN(positive_input)
encoded_negative = Shared_DNN(negative_input)

merged_vector = concatenate([encoded_anchor, encoded_positive, encoded_negative], axis=-1, name='merged_layer')

model = Model(inputs=[anchor_input,positive_input, negative_input], outputs=merged_vector)
model.compile(loss=triplet_loss, optimizer=adam_optim)

y_pred.shape =  Tensor("merged_layer_2/concat:0", shape=(None, 288), dtype=float32)


In [70]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 96)           1415776     anchor_input[0][0]               
                                                                 positive_input[0][0]       

In [71]:
# Time consuming
# (1) Movie title tower

Anchor = X_train_title[:,0,:].reshape(-1,96,1,1)
Positive = X_train_title[:,1,:].reshape(-1,96,1,1)
Negative = X_train_title[:,2,:].reshape(-1,96,1,1)
Anchor_test = X_test_title[:,0,:].reshape(-1,96,1,1)
Positive_test = X_test_title[:,1,:].reshape(-1,96,1,1)
Negative_test = X_test_title[:,2,:].reshape(-1,96,1,1)

Y_dummy = np.empty((Anchor.shape[0],300))
Y_dummy2 = np.empty((Anchor_test.shape[0],1))

model.fit([Anchor,Positive,Negative],y=Y_dummy,validation_data=([Anchor_test,Positive_test,Negative_test],Y_dummy2), batch_size=512, epochs=50)

Train on 800 samples, validate on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x1a64ee6f50>

In [74]:
trained_model1 = Model(inputs=anchor_input, outputs=encoded_anchor)

x_train_title_pred = trained_model1.predict(x_train_movie_title.reshape(-1, 96, 1, 1))
x_test_title_pred = trained_model1.predict(x_test_movie_title.reshape(-1, 96, 1, 1))

print(x_train_title_pred.shape, x_test_title_pred.shape)

# print(len(x_test_title_pred[0]), np.sqrt((x_test_title_pred[0]**2).sum())) # L2-norm

(5211, 96) (2462, 96)


In [75]:
model2 = Model(inputs=[anchor_input,positive_input, negative_input], outputs=merged_vector)
model2.compile(loss=triplet_loss, optimizer=adam_optim)

model2.summary()

y_pred.shape =  Tensor("merged_layer_2/concat:0", shape=(None, 288), dtype=float32)
Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 96)           1415776     anchor_input[0][0]               
        

In [76]:
# Time consuming
# Movie genre tower
Anchor = X_train_genre[:,0,:].reshape(-1,96,1,1)
Positive = X_train_genre[:,1,:].reshape(-1,96,1,1)
Negative = X_train_genre[:,2,:].reshape(-1,96,1,1)
Anchor_test = X_test_genre[:,0,:].reshape(-1,96,1,1)
Positive_test = X_test_genre[:,1,:].reshape(-1,96,1,1)
Negative_test = X_test_genre[:,2,:].reshape(-1,96,1,1)

Y_dummy = np.empty((Anchor.shape[0],300))
Y_dummy2 = np.empty((Anchor_test.shape[0],1))

model2.fit([Anchor,Positive,Negative],y=Y_dummy,validation_data=([Anchor_test,Positive_test,Negative_test],Y_dummy2), batch_size=512, epochs=50)

Train on 800 samples, validate on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x1a65297810>

In [77]:
trained_model2 = Model(inputs=anchor_input, outputs=encoded_anchor)

x_train_genre_pred = trained_model2.predict(x_train_movie_genre.reshape(-1, 96, 1, 1))
x_test_genre_pred = trained_model2.predict(x_test_movie_genre.reshape(-1, 96, 1, 1))

print(x_train_genre_pred.shape, x_test_genre_pred.shape)

# print(len(x_test_movie_pred[0]), np.sqrt((x_test_movie_pred[0]**2).sum())) # L2-norm

(5211, 96) (2462, 96)


Combine movie title embedding vector and genre embedding vector via element-wise multiplication, then apply L2 normalization

In [81]:
x_train_genre_pred[:4, :4], x_train_title_pred[:4, :4]

(array([[ 0.01920007,  0.02746626,  0.23426566, -0.34349748],
        [ 0.07480142,  0.05476695,  0.16217682, -0.20209655],
        [ 0.23435155,  0.1969838 ,  0.2182913 , -0.04982473],
        [ 0.01382559,  0.11908623,  0.36122996, -0.30534092]],
       dtype=float32),
 array([[ 0.16780746,  0.4363753 , -0.10361842,  0.21908543],
        [-0.03771356,  0.2611343 , -0.17115346,  0.10339521],
        [ 0.03289174,  0.46348557,  0.02858389,  0.2581252 ],
        [ 0.0230694 ,  0.32935804, -0.0991705 ,  0.04588554]],
       dtype=float32))

In [83]:
x_train_title_pred.shape

(5211, 96)

In [102]:
from sklearn.preprocessing import normalize

x_train_movie_pred = x_train_title_pred * x_train_genre_pred
x_train_movie_pred = normalize(x_train_movie_pred, axis = 1, norm = 'l2')
print(len(x_train_movie_pred[0]), np.sqrt((x_train_movie_pred[0]**2).sum())) # L2-norm

96 0.99999994


Calculate similary matrix based on the embedding movie features

In [103]:
movie_correlation_tripletNN = 1 - pairwise_distances(x_train_movie_pred, metric = 'correlation')
movie_correlation_tripletNN[np.isnan(movie_correlation_tripletNN)] = 0

In [104]:
movie_correlation[:4, :4]

array([[1.        , 0.85711685, 0.8821668 , 0.84976041],
       [0.85711685, 1.        , 0.75203811, 0.79207242],
       [0.8821668 , 0.75203811, 1.        , 0.86057745],
       [0.84976041, 0.79207242, 0.86057745, 1.        ]])

In [105]:
movie_correlation_tripletNN[:4, :4]

array([[1.        , 0.77347115, 0.80596001, 0.77832601],
       [0.77347115, 1.        , 0.6199472 , 0.6548749 ],
       [0.80596001, 0.6199472 , 1.        , 0.77751269],
       [0.77832601, 0.6548749 , 0.77751269, 1.        ]])

In [106]:
train_movie_data.head()

Unnamed: 0,movieId,title,title_l,genres
0,1,Toy Story (1995),"[Toy, Story, 1995]","[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Jumanji, 1995]","[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Grumpier, Old, Men, 1995]","[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Waiting, to, Exhale, 1995]","[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),"[Father, of, the, Bride, Part, II, 1995]",[Comedy]


In [107]:
# Build a 1-dimensional array with movie title
titles = train_movie_data[['title', 'genres']]
indices = pd.Series(train_movie_data.index, index = train_movie_data['title'])

# Function that get movie recommendations
# method: 'standard', 'tripletNN'
def movie_recommendations(title, movie_corr, movie_cowatched_s, method = 'tripletNN', k = 20):
    print(method)
    print(titles[titles['title'] == title])
    idx = indices[title]
    sim_scores = list(enumerate(movie_corr[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:(k+1)]
     
    rel_scores = [movie_cowatched_s[idx, j] for j in [i[0] for i in sim_scores]]
    print(rel_scores)

    DCG_k = sum([(2**i[1] - 1)/(np.log2((i[0]+1)+1)) \
                 for i in list(enumerate(rel_scores))])
    IDCG_k = sum([(2**i[1] - 1)/(np.log2((i[0]+1)+1)) \
                  for i in list(enumerate(sorted(rel_scores, reverse=True)))])
    NDCG_k = DCG_k/(IDCG_k+0.0001)
    print(NDCG_k)

    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices], NDCG_k

In [108]:
standard_RS_lists = movie_recommendations('Heat (1995)', \
                      movie_correlation, movie_cowatched_s, method = 'standard', k = 20)

standard
         title                     genres
5  Heat (1995)  [Action, Crime, Thriller]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0


In [109]:
standard_RS_lists[0].head(10)

Unnamed: 0,title,genres
4541,"Heat, The (2013)","[Action, Comedy, Crime]"
2015,Shaft (2000),"[Action, Crime, Thriller]"
464,"Rock, The (1996)","[Action, Adventure, Thriller]"
2004,Shaft (1971),"[Action, Crime, Drama, Thriller]"
1638,Body Heat (1981),"[Crime, Thriller]"
3076,"Punisher, The (2004)","[Action, Crime, Thriller]"
4411,Get the Gringo (2012),"[Action, Crime, Drama, Thriller]"
405,Batman (1989),"[Action, Crime, Thriller]"
1285,Ronin (1998),"[Action, Crime, Thriller]"
2930,"Rookie, The (1990)","[Action, Comedy, Thriller]"


In [110]:
tripletNN_RS_lists = movie_recommendations('Sicario (2015)', \
                      movie_correlation_tripletNN, movie_cowatched_s, method = 'tripletNN', k = 20)

tripletNN
               title                   genres
4871  Sicario (2015)  [Crime, Drama, Mystery]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0


In [111]:
tripletNN_RS_lists[0].head(10)

Unnamed: 0,title,genres
3789,Gone Baby Gone (2007),"[Crime, Drama, Mystery]"
3956,Galaxy Express 999 (Ginga tetsudô Three-Nine) ...,"[Adventure, Animation, Fantasy, Sci-Fi]"
3455,Fantastic Four (2005),"[Action, Adventure, Sci-Fi]"
4846,Star Trek Beyond (2016),"[Action, Adventure, Sci-Fi]"
2943,Mystic River (2003),"[Crime, Drama, Mystery]"
3291,Duel (1971),"[Action, Mystery, Thriller]"
3246,Alfie (2004),"[Comedy, Drama, Romance]"
4982,The Call Up (2016),"[Action, Adventure, Sci-Fi]"
1758,"Bat Whispers, The (1930)","[Crime, Drama, Mystery]"
3567,Lucky Number Slevin (2006),"[Crime, Drama, Mystery]"


In [112]:
titles[movie_cowatched.sum(axis = 1)> 2].shape

(229, 2)

In [113]:
tripletNN_RS_lists[0].iloc[5, 1]

['Action', 'Mystery', 'Thriller']