In [35]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
import random
import matplotlib.patheffects as PathEffects

from sklearn.metrics.pairwise import pairwise_distances

from random import sample

In [2]:
import tensorflow as tf

In [3]:
from keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, concatenate

from keras.models import Model, Sequential
from keras.regularizers import l2
from keras import backend as K
from keras.optimizers import SGD,Adam
from keras.losses import binary_crossentropy
import os
import pickle
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [4]:
from itertools import permutations
import seaborn as sns

from keras.datasets import mnist
from sklearn.manifold import TSNE

from sklearn.svm import SVC

In [5]:
import spacy
import en_core_web_sm

import re

from sklearn.decomposition import PCA

'ratings.csv' dataset consists of 
- the user who rated the movie (userID)
- the movie is rated (movieID)
- the rating given by the user for that particular movie (rating)
- the time at which the rating was recorded (timestamp)

In [6]:
df_ratings = pd.read_csv('ml-25m/ratings.csv')
df_ratings.shape

(25000095, 4)

'movies.csv' dataset consists of
- the movie id (movieID)
- the movie title (title)
- the genres (genres)

In [7]:
df_movie = pd.read_csv('ml-25m/movies.csv')
df_movie.shape

(62423, 3)

In [8]:
nlp = en_core_web_sm.load()

# import the list of stop words from the spacy library
from spacy.lang.en.stop_words import STOP_WORDS

def remove_stop_words(text):
    return ' '.join([word for word in text.split(' ') if word.lower() not in STOP_WORDS])

print(remove_stop_words('why is my dog on the drugs'))


dog drugs


In [9]:
## Genres:

# Break up the big genre string into a string array
df_movie['genres'] = df_movie['genres'].str.split('|')

# Convert genres to string value
df_movie['genres'] = df_movie['genres'].fillna("")

In [10]:
## Title:

# regular expression to extract year and title
p1 = re.compile(r'[(](.*?)[)]', re.S)

df_movie['title_l'] = df_movie['title'].apply(lambda x: re.findall(r"[\w']+", x))

In [11]:
def get_word_vec(l):
    return nlp(remove_stop_words(' '.join(l))).vector

# Combine movie and rating datasets
## Sample a small dataset of ratings

In [12]:
small_data = df_ratings.sample(frac = 0.001)
small_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 2962376 to 14277444
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     25000 non-null  int64  
 1   movieId    25000 non-null  int64  
 2   rating     25000 non-null  float64
 3   timestamp  25000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 KB


In [13]:
small_data = pd.merge(small_data, df_movie, how = 'left', on = 'movieId')
print(small_data.shape)

print(len(set(small_data['movieId'])), len(set(small_data['userId'])))

(25000, 7)
5786 19967


In [14]:
small_data.drop(['timestamp'], inplace = True, axis = 1)

small_data.head()

Unnamed: 0,userId,movieId,rating,title,genres,title_l
0,19534,3031,1.0,Repossessed (1990),[Comedy],"[Repossessed, 1990]"
1,62797,5013,5.0,Gosford Park (2001),"[Comedy, Drama, Mystery]","[Gosford, Park, 2001]"
2,123140,650,3.0,Moll Flanders (1996),[Drama],"[Moll, Flanders, 1996]"
3,8950,7317,2.5,EuroTrip (2004),"[Adventure, Comedy]","[EuroTrip, 2004]"
4,126523,8865,4.0,Sky Captain and the World of Tomorrow (2004),"[Action, Adventure, Sci-Fi]","[Sky, Captain, and, the, World, of, Tomorrow, ..."


In [15]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)

print(train_data.shape, test_data.shape)

(20000, 6) (5000, 6)


In [16]:
train_movie_data = train_data[['movieId', 'title', 'title_l', 'genres']].drop_duplicates('movieId')
train_movie_data = train_movie_data.sort_values(by = 'movieId').reset_index(drop = True)

print(train_movie_data.shape)

test_movie_data = test_data[['movieId', 'title', 'title_l', 'genres']].drop_duplicates('movieId')
test_movie_data = test_movie_data.sort_values(by = 'movieId').reset_index(drop = True)

print(test_movie_data.shape)

(5226, 4)
(2507, 4)


In [17]:
train_movie_data_vec = pd.DataFrame(0, index=train_movie_data['movieId'], 
                                   columns=range(96))

test_movie_data_vec = pd.DataFrame(0, index=test_movie_data['movieId'], 
                                   columns=range(96))

print(train_movie_data_vec.shape, test_movie_data_vec.shape)

for i in range(train_movie_data.shape[0]):
    tmp_content = train_movie_data['title_l'][i] + train_movie_data['genres'][i]
    train_movie_data_vec.loc[train_movie_data['movieId'][i], :] = pd.Series(get_word_vec(tmp_content))
    
for i in range(test_movie_data.shape[0]):
    tmp_content = test_movie_data['title_l'][i] + test_movie_data['genres'][i]
    test_movie_data_vec.loc[test_movie_data['movieId'][i], :] = pd.Series(get_word_vec(tmp_content))


(5226, 96) (2507, 96)


In [18]:
train_movie_data_vec.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.544457,1.982641,-1.609174,-0.832535,0.706813,0.843196,-1.082757,0.479867,0.455717,-1.350987,...,-1.389453,-0.612897,3.257624,1.612306,-0.675844,0.68078,0.795322,-2.220095,-1.02624,-0.928899
2,-0.251009,1.858588,-2.27401,-0.327125,1.16685,0.634994,-0.510081,-0.884999,0.006366,-1.454704,...,-1.407658,-0.058199,2.621021,1.749237,-0.923112,0.679678,0.854836,-1.344364,-0.602621,-1.191469
3,-1.025808,2.101699,-0.80808,-0.662055,0.155747,1.403549,-1.099139,-1.171176,1.717982,-0.008322,...,-0.931525,-0.423738,2.26852,2.035343,-1.238552,0.012303,1.313296,-2.841366,-0.80522,0.286188
4,-1.167611,2.240464,-2.007942,-1.945721,-0.125155,-0.174539,-1.665239,-0.648345,0.702949,0.82251,...,-1.039095,-0.407382,2.383182,1.146705,-1.876704,-1.051628,-0.027557,-1.542852,-0.727483,0.113651
5,-1.56313,2.395524,-0.576473,-1.212703,-1.081023,-0.059765,-1.371703,0.0855,0.818381,-1.087977,...,-1.169376,-0.157378,2.914577,2.291084,-1.752722,0.49484,0.904072,-2.267776,-0.628808,-0.172327


Calculate the similarity matrix based on the content features

In [20]:
movie_correlation = 1 - pairwise_distances(train_movie_data_vec, metric = 'correlation')
movie_correlation[np.isnan(movie_correlation)] = 0
print(movie_correlation.shape)

(5226, 5226)


In [21]:
movie_correlation[:4, :4]

array([[1.        , 0.89710642, 0.89283011, 0.86406222],
       [0.89710642, 1.        , 0.80089332, 0.81801318],
       [0.89283011, 0.80089332, 1.        , 0.87170862],
       [0.86406222, 0.81801318, 0.87170862, 1.        ]])

# Construct co-watched/rated graph for movies
An edge exists between two movies if many users rated both movies

In [22]:
train_user_data = train_data[['userId', 'movieId']]
train_user_data = train_user_data.sort_values(by = 'movieId').reset_index(drop = True)
train_user_data.head()

Unnamed: 0,userId,movieId
0,126892,1
1,39467,1
2,113178,1
3,47064,1
4,124628,1


In [23]:
len(set(train_user_data['movieId'])), train_user_data.shape[0]

(5226, 20000)

Construct a user dictionary where the key is the movieId and the values are the userIds who rated this movie

In [24]:
d_user = {}
# 
for i in range(train_user_data.shape[0]):
    if train_user_data['movieId'][i] not in d_user:
        d_user[train_user_data['movieId'][i]] = []
    else:
        d_user[train_user_data['movieId'][i]].append(train_user_data['userId'][i])

print(len(d_user))

5226


Create movie-to-movie user co-watched (rated) matrix
- cell_{i, j}: # of users who watched/rated both movie i and j

In [25]:
movie_cowatched = np.zeros((len(train_movie_data['movieId']), len(train_movie_data['movieId'])))

for r in range(movie_cowatched.shape[0]):
    for c in range(r+1, movie_cowatched.shape[1]):
        movie_cowatched[r, c] = len(set(d_user[train_movie_data['movieId'][r]]\
                                ).intersection(set(d_user[ train_movie_data['movieId'][c]])))

# Symmetric matrix for co-watched movie matrix
movie_cowatched.T.sum(), movie_cowatched.sum()
movie_cowatched_s = movie_cowatched + movie_cowatched.T

print(movie_cowatched_s.shape)

(5226, 5226)


In [26]:
movie_cowatched_s[:4, :4]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [27]:
movie_cowatched.sum(), movie_cowatched.shape

(1846.0, (5226, 5226))

# CDML: Triplet NN

### Get a list of all movie pairs with # of cowatched > 0

In [28]:
cowatched_list = []
# Getting the list of all pairs with # of cowatched > 0
for r in range(movie_cowatched_s.shape[0]):
    for c in range(r+1, movie_cowatched_s.shape[1]):
        if movie_cowatched_s[r, c] > 0:
            cowatched_list.append((r, c))
            
print(len(cowatched_list), cowatched_list[:5])

1845 [(0, 575), (0, 699), (0, 1654), (4, 889), (4, 1459)]


### Construct a dictionary with key = movieId, and values = set of index who have # of cowatched = 0

In [30]:
zero_cowatched_dict = {}

for r in range(movie_cowatched_s.shape[0]):
    zero_cowatched_dict[r] = set(c for c in range(movie_cowatched_s.shape[0]) \
                              if movie_cowatched_s[r, c] == 0 and r != c)


# Data Preprocessing

In [31]:
x_train_movie = train_movie_data_vec.values
x_test_movie = test_movie_data_vec.values

print(x_train_movie.shape, x_test_movie.shape)

(5226, 96) (2507, 96)


In [32]:
x_train_movie[:4, :4]

array([[-0.5444566 ,  1.9826405 , -1.60917366, -0.83253467],
       [-0.25100935,  1.8585875 , -2.27401018, -0.32712525],
       [-1.02580774,  2.10169935, -0.80807978, -0.6620546 ],
       [-1.167611  ,  2.24046421, -2.0079422 , -1.94572103]])

### Generate triplets

In [33]:

def generate_triplet(cowatched_list, zero_cowatched_dict, ap_pairs, an_pairs, testsize):
 
    #ap_pairs, an_pairs = 10, 10
    #testsize = 0.2 

    trainsize = 1 - testsize
    triplet_train_pairs = []
    triplet_test_pairs = []

    A_P_pairs = random.sample(cowatched_list, k = ap_pairs)
    Neg_idx = []
    for p in range(len(A_P_pairs)):
        Neg_idx.append(sample(zero_cowatched_dict[A_P_pairs[p][0]].intersection(zero_cowatched_dict[A_P_pairs[p][1]]), 1)[0])

    # Train
    A_P_len = len(A_P_pairs)
    Neg_len = len(Neg_idx)
    train_i = 0
    for ap in A_P_pairs[:int(A_P_len*trainsize)]:
        # print(ap, train_i)
        Anchor = x_train_movie[ap[0]]
        Positive = x_train_movie[ap[1]]
        Negative = x_train_movie[Neg_idx[train_i]]
        triplet_train_pairs.append([Anchor, Positive, Negative])
        train_i += 1

    # Test
    test_i = int(A_P_len*trainsize)
    for ap in A_P_pairs[int(A_P_len*trainsize):]:
        #print(ap, test_i)
        Anchor = x_train_movie[ap[0]]
        Positive = x_train_movie[ap[1]]
        Negative = x_train_movie[Neg_idx[test_i]]
        triplet_test_pairs.append([Anchor, Positive, Negative])
        test_i += 1
    
    return np.array(triplet_train_pairs), np.array(triplet_test_pairs)

In [36]:
X_train, X_test = generate_triplet(cowatched_list, zero_cowatched_dict, \
                                   ap_pairs=1000, an_pairs=1000,testsize=0.2)
X_train.shape, X_test.shape

((800, 3, 96), (200, 3, 96))

## Optimize a ranking Triplet loss

In [37]:
def triplet_loss(y_true, y_pred, alpha = 0.4):
    """
    Implementation of the triplet loss function
    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
    Returns:
    loss -- real number, value of the loss
    """
    print('y_pred.shape = ',y_pred)
    
    total_lenght = y_pred.shape.as_list()[-1]
#     print('total_lenght=',  total_lenght)
#     total_lenght =12
    
    anchor = y_pred[:,0:int(total_lenght*1/3)]
    positive = y_pred[:,int(total_lenght*1/3):int(total_lenght*2/3)]
    negative = y_pred[:,int(total_lenght*2/3):int(total_lenght*3/3)]

    # distance between the anchor and the positive
    pos_dist = K.sum(K.square(anchor-positive),axis=1)

    # distance between the anchor and the negative
    neg_dist = K.sum(K.square(anchor-negative),axis=1)

    # compute loss
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)
 
    return loss

In [38]:
def create_base_network(in_dims):
    """
    Base network to be shared.
    """
    model = Sequential()
    model.add(Conv2D(128,(7,7),padding='same',input_shape=(in_dims[0],in_dims[1],in_dims[2],),activation='relu',name='conv1'))
    model.add(MaxPooling2D((2,2),(2,2),padding='same',name='pool1'))
    model.add(Conv2D(256,(5,5),padding='same',activation='relu',name='conv2'))
    model.add(MaxPooling2D((2,2),(2,2),padding='same',name='pool2'))
    model.add(Flatten(name='flatten'))
    model.add(Dense(96,name='embeddings')) # No activation on final dense layer
    model.add(Lambda(lambda x: tf.math.l2_normalize(x, axis = 1)))
    # L2 normalize embeddings
    # model.add(Dense(600))
    
    return model

In [39]:
adam_optim = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999)

In [40]:
anchor_input = Input((96,1,1,), name='anchor_input')
positive_input = Input((96,1,1,), name='positive_input')
negative_input = Input((96,1,1,), name='negative_input')

# Shared embedding layer for positive and negative items
Shared_DNN = create_base_network([96,1,1,])
# Shared_DNN = create_base_network([12,8,1,])


encoded_anchor = Shared_DNN(anchor_input)
encoded_positive = Shared_DNN(positive_input)
encoded_negative = Shared_DNN(negative_input)

merged_vector = concatenate([encoded_anchor, encoded_positive, encoded_negative], axis=-1, name='merged_layer')

model = Model(inputs=[anchor_input,positive_input, negative_input], outputs=merged_vector)
model.compile(loss=triplet_loss, optimizer=adam_optim)

y_pred.shape =  Tensor("merged_layer/concat:0", shape=(None, 288), dtype=float32)


In [41]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 96, 1, 1)     0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 96)           1415776     anchor_input[0][0]               
                                                                 positive_input[0][0]       

In [42]:
# Time consuming

Anchor = X_train[:,0,:].reshape(-1,96,1,1)
Positive = X_train[:,1,:].reshape(-1,96,1,1)
Negative = X_train[:,2,:].reshape(-1,96,1,1)
Anchor_test = X_test[:,0,:].reshape(-1,96,1,1)
Positive_test = X_test[:,1,:].reshape(-1,96,1,1)
Negative_test = X_test[:,2,:].reshape(-1,96,1,1)

Y_dummy = np.empty((Anchor.shape[0],300))
Y_dummy2 = np.empty((Anchor_test.shape[0],1))

model.fit([Anchor,Positive,Negative],y=Y_dummy,validation_data=([Anchor_test,Positive_test,Negative_test],Y_dummy2), batch_size=512, epochs=50)

Train on 800 samples, validate on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x1b82036810>

In [43]:
trained_model = Model(inputs=anchor_input, outputs=encoded_anchor)

x_train_movie_pred = trained_model.predict(x_train_movie.reshape(-1, 96, 1, 1))
x_test_movie_pred = trained_model.predict(x_test_movie.reshape(-1, 96, 1, 1))

print(x_train_movie_pred.shape, x_test_movie_pred.shape)

print(len(x_test_movie_pred[0]), np.sqrt((x_test_movie_pred[0]**2).sum())) # L2-norm

(5226, 96) (2507, 96)
96 0.99999994


Calculate similary matrix based on the embedding movie features

In [44]:
movie_correlation_tripletNN = 1 - pairwise_distances(x_train_movie_pred, metric = 'correlation')
movie_correlation_tripletNN[np.isnan(movie_correlation_tripletNN)] = 0

In [45]:
movie_correlation[:4, :4]

array([[1.        , 0.89710642, 0.89283011, 0.86406222],
       [0.89710642, 1.        , 0.80089332, 0.81801318],
       [0.89283011, 0.80089332, 1.        , 0.87170862],
       [0.86406222, 0.81801318, 0.87170862, 1.        ]])

In [46]:
movie_correlation_tripletNN[:4, :4]

array([[1.        , 0.35811315, 0.49267136, 0.2441658 ],
       [0.35811315, 1.        , 0.16613979, 0.11602336],
       [0.49267136, 0.16613979, 1.        , 0.26052436],
       [0.2441658 , 0.11602336, 0.26052436, 1.        ]])

In [47]:
train_movie_data.head()

Unnamed: 0,movieId,title,title_l,genres
0,1,Toy Story (1995),"[Toy, Story, 1995]","[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Jumanji, 1995]","[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Grumpier, Old, Men, 1995]","[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Waiting, to, Exhale, 1995]","[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),"[Father, of, the, Bride, Part, II, 1995]",[Comedy]


In [48]:
# Build a 1-dimensional array with movie title
titles = train_movie_data[['title', 'genres']]
indices = pd.Series(train_movie_data.index, index = train_movie_data['title'])

# Function that get movie recommendations
# method: 'standard', 'tripletNN'
def movie_recommendations(title, movie_corr, movie_cowatched_s, method = 'tripletNN', k = 20):
    print(method)
    print(titles[titles['title'] == title])
    idx = indices[title]
    sim_scores = list(enumerate(movie_corr[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:(k+1)]
     
    rel_scores = [movie_cowatched_s[idx, j] for j in [i[0] for i in sim_scores]]
    print(rel_scores)

    DCG_k = sum([(2**i[1] - 1)/(np.log2((i[0]+1)+1)) \
                 for i in list(enumerate(rel_scores))])
    IDCG_k = sum([(2**i[1] - 1)/(np.log2((i[0]+1)+1)) \
                  for i in list(enumerate(sorted(rel_scores, reverse=True)))])
    NDCG_k = DCG_k/(IDCG_k+0.0001)
    print(NDCG_k)

    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices], NDCG_k

In [57]:
standard_RS_lists = movie_recommendations('Heat (1995)', \
                      movie_correlation, movie_cowatched_s, method = 'standard', k = 20)

standard
         title                     genres
5  Heat (1995)  [Action, Crime, Thriller]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0


In [59]:
standard_RS_lists[0].head(10)

Unnamed: 0,title,genres
2037,Shaft (2000),"[Action, Crime, Thriller]"
3076,"Punisher, The (2004)","[Action, Crime, Thriller]"
1279,Ronin (1998),"[Action, Crime, Thriller]"
399,Batman (1989),"[Action, Crime, Thriller]"
3350,Thursday (1998),"[Action, Crime, Thriller]"
3433,Hostage (2005),"[Action, Crime, Drama, Thriller]"
325,Judgment Night (1993),"[Action, Crime, Thriller]"
1661,Someone to Watch Over Me (1987),"[Action, Crime, Thriller]"
4854,Kite (1998),"[Action, Animation, Crime, Thriller]"
923,Face/Off (1997),"[Action, Crime, Drama, Thriller]"


In [60]:
tripletNN_RS_lists = movie_recommendations('Sicario (2015)', \
                      movie_correlation_tripletNN, movie_cowatched_s, method = 'tripletNN', k = 20)

tripletNN
               title                   genres
4876  Sicario (2015)  [Crime, Drama, Mystery]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0


In [61]:
tripletNN_RS_lists[0].head(10)

Unnamed: 0,title,genres
3630,Lady in the Water (2006),"[Drama, Fantasy, Mystery]"
2435,"Monsters, Inc. (2001)","[Adventure, Animation, Children, Comedy, Fantasy]"
2180,Beverly Hills Cop II (1987),"[Action, Comedy, Crime, Thriller]"
2013,"Road Warrior, The (Mad Max 2) (1981)","[Action, Adventure, Sci-Fi, Thriller]"
3624,"Lake House, The (2006)","[Drama, Fantasy, Romance]"
4313,Harry Potter and the Deathly Hallows: Part 2 (...,"[Action, Adventure, Drama, Fantasy, Mystery, I..."
2732,Kangaroo Jack (2003),"[Action, Comedy]"
4180,Inception (2010),"[Action, Crime, Drama, Mystery, Sci-Fi, Thrill..."
3750,Spider-Man 3 (2007),"[Action, Adventure, Sci-Fi, Thriller, IMAX]"
355,RoboCop 3 (1993),"[Action, Crime, Drama, Sci-Fi, Thriller]"


In [63]:
titles[movie_cowatched.sum(axis = 1)> 2].shape

(215, 2)