In [2]:
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import csv

from numpy import genfromtxt
from collections import defaultdict
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate

SHUFFLE_BUFFER = 500
BATCH_SIZE = 256

In [3]:
# get datasets
mv_train = pd.read_csv('data/final_movies.csv', nrows=600000).fillna(0)
us_train = pd.read_csv('data/final_users.csv', nrows=600000).fillna(0)
y_train = pd.read_csv('data/y_train.csv', nrows=600000).fillna(0)

In [4]:
num_user_features = us_train.shape[1] - 2  # remove userid, rating count and ave rating during training
num_item_features = mv_train.shape[1] - 2  # remove movie id and release year at train time

In [5]:
# view movie dataset
print(f'shape of dataset: {mv_train.shape}')
mv_train.head()

shape of dataset: (600000, 21)


Unnamed: 0,movieId,Release Year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,2840,1999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,481,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,307,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1591,1997,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,3826,2000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [6]:
# view user dataset
print(f'shape of dataset: {us_train.shape}')
us_train.head()

shape of dataset: (600000, 21)


Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,2.7,1.5,0.0,0.0,4.0,3.0,0.0,3.5,...,0.0,2.2,0.0,0.0,0.0,3.5,2.1,2.6,0.0,0.0
1,1,0.0,2.7,1.5,0.0,0.0,4.0,3.0,0.0,3.5,...,0.0,2.2,0.0,0.0,0.0,3.5,2.1,2.6,0.0,0.0
2,1,0.0,2.7,1.5,0.0,0.0,4.0,3.0,0.0,3.5,...,0.0,2.2,0.0,0.0,0.0,3.5,2.1,2.6,0.0,0.0
3,1,0.0,2.7,1.5,0.0,0.0,4.0,3.0,0.0,3.5,...,0.0,2.2,0.0,0.0,0.0,3.5,2.1,2.6,0.0,0.0
4,1,0.0,2.7,1.5,0.0,0.0,4.0,3.0,0.0,3.5,...,0.0,2.2,0.0,0.0,0.0,3.5,2.1,2.6,0.0,0.0


In [7]:
# view y hat 
print(f'shape of dataset: {y_train.shape}')
y_train.head()

shape of dataset: (600000, 1)


Unnamed: 0,rating
0,3.0
1,3.5
2,3.5
3,1.5
4,2.0


In [8]:
# turn (y_hat) ratings dataframe to a 1D array
y_train = y_train.values.flatten()

In [9]:
print(y_train[:5])

[3.  3.5 3.5 1.5 2. ]


In [10]:
# scale training data
item_train_unscaled = mv_train
user_train_unscaled = us_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(mv_train)
mv_train = scalerItem.transform(mv_train)

scalerUser = StandardScaler()
scalerUser.fit(us_train)
us_train = scalerUser.transform(us_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(mv_train), equal_nan=True))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(us_train)))

True
True


Split data into training and test sets

In [11]:
mv_train, item_test = train_test_split(mv_train, train_size=0.80, shuffle=True, random_state=1)
us_train, user_test = train_test_split(us_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {mv_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (480000, 21)
movie/item test data shape: (120000, 21)


In [12]:
print(f'scaled movie feature: \n {mv_train[0]} \n')
print(f'scaled user feature: \n {us_train[0]}')

scaled movie feature: 
 [-0.59365555 -0.96022777 -0.66196456 -0.53724236 -0.27115992 -0.30015415
  1.32284621 -0.46460436 -0.12427067  1.09847761 -0.3509867  -0.07953891
 -0.26829779 -0.23328973 -0.16379664 -0.30680257  2.04357694 -0.43915777
 -0.6588441  -0.21700081 -0.13079002] 

scaled user feature: 
 [-1.27465419 -0.39388553 -0.3406056  -0.35707285  0.20298873 -0.02806365
 -0.26417513 -0.09738409  1.19899454 -0.02657825 -0.13498496  0.80629802
 -0.11259928  0.43425268  0.20549945 -0.07152251  0.09044831  0.31976052
 -0.2435045   0.23325987  0.63741139]


In [13]:
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([    
    tf.keras.layers.Dense(738, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
  
])

item_NN = tf.keras.models.Sequential([
  
    tf.keras.layers.Dense(738, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)

])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           113480      ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           113480      ['input_2[0][0]']                
                                                                                              

In [14]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt,
              loss=cost_fn)

In [15]:
tf.random.set_seed(1)
history = model.fit([us_train[:,2:], mv_train[:, 2:]], y_train, batch_size=BATCH_SIZE, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [17]:
model.evaluate([user_test[:,2:], item_test[:, 2:]], y_test)



0.14645935595035553

In [24]:
new_user_id = 30000
new_rating_ave = 0.0
new_action = 5.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 5.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_film_noir = 0.0
new_horror = 0.0
new_imax = 0.0
new_musical = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_war = 0.0
new_western = 0.0

user_vec = np.array([[new_user_id, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_film_noir, new_horror, new_imax, new_mystery,
                      new_musical, new_romance, new_scifi, new_thriller, new_war, 
                      new_western]])

In [25]:
item_vecs = genfromtxt('item_vecs.csv', delimiter=',')

In [26]:
# create a movie dictionary for pulling out needed movies
movie_dict = defaultdict(dict)
count = 0

with open('ml-latest/movies.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for line in reader:
        if count == 0:
            count += 1  #skip header
            #print(line) print
        else:
            count += 1
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]

In [27]:
# generate a copy of user vector with matching number of movies in the dataset
def gen_user_vecs(user_vec, num_items):
    """ given a user vector return:
        user predict maxtrix to match the size of item_vecs """
    user_vecs = np.tile(user_vec, (num_items, 1))
    return user_vecs

In [28]:
# print predicted movies in a table form for the new user
def print_pred_movies(y_p, item, movie_dict, maxcount=10):
    """ print results of prediction of a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        disp.append([np.around(y_p[i, 0], 1), item[i, 0].astype(int), np.around(item[i, 2].astype(float), 1),
                     movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
    return table


In [29]:
# call gen_user_vecs
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# make a prediction
y_p = model.predict([suser_vecs[:, 2:], sitem_vecs[:, 2:]])

# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 40)

  58/1081 [>.............................] - ETA: 1s





y_p,movie id,rating ave,title,genres
4.6,176101,1,Kingsman: The Golden Circle (2017),Action|Adventure|Comedy
4.6,159690,1,Teenage Mutant Ninja Turtles: Out of the Shadows (2016),Action|Adventure|Comedy
4.6,117440,1,Lupin the 3rd (2014),Action|Adventure|Comedy
4.6,183583,1,The King's Case Note (2017),Action|Adventure|Comedy
4.6,139797,1,Smosh: The Movie (2015),Action|Adventure|Comedy
4.6,32596,1,Sahara (2005),Action|Adventure|Comedy
4.6,138036,1,The Man from U.N.C.L.E. (2015),Action|Adventure|Comedy
4.6,171379,1,Ali Baba ve 7 Cüceler (2015),Action|Adventure|Comedy
4.6,31330,1,Strawberries in the Supermarket (Jagoda u supermarketu) (2003),Action|Adventure|Comedy
4.6,125527,1,Slap Shot 3: The Junior League (2008),Action|Adventure|Comedy


In [30]:
item_vecs = np.nan_to_num(item_vecs, nan=0)
item_vecs

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.00000e+00, 1.99500e+03, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.00000e+00, 1.99500e+03, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [1.93880e+05, 1.99400e+03, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.93882e+05, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.93886e+05, 1.99900e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [31]:
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """
    d = 0
    s = a.shape
    
    d = np.sum(np.square(a - b))
     
    return d

In [32]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 19)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                113480    
                                                                 
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 113,480
Trainable params: 113,480
Non-trainable params: 0
_________________________________________________________________


In [101]:
scaled_item_vecs = scalerItem.transform(item_vecs[:10000])
vms = model_m.predict(scaled_item_vecs[:,2:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

 58/313 [====>.........................] - ETA: 0s



size of all predicted movie feature vectors: (10000, 32)


In [102]:
dim = len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])
        
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal


In [139]:
count = 50
disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i+1,0])
    movie2_id = int(item_vecs[min_idx,0])
    disp.append( [movie_dict[movie1_id]['title'], movie_dict[movie1_id]['genres'],
                  movie_dict[movie2_id]['title'], movie_dict[movie1_id]['genres']]
               )
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")

In [140]:
table

movie1,genres,movie2,genres.1
Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Kicking and Screaming (1995),Adventure|Animation|Children|Comedy|Fantasy
Jumanji (1995),Adventure|Children|Fantasy,Antz (1998),Adventure|Children|Fantasy
Grumpier Old Men (1995),Comedy|Romance,"Indian in the Cupboard, The (1995)",Comedy|Romance
Waiting to Exhale (1995),Comedy|Drama|Romance,Sabrina (1995),Comedy|Drama|Romance
Father of the Bride Part II (1995),Comedy,"American President, The (1995)",Comedy
Heat (1995),Action|Crime|Thriller,Four Rooms (1995),Action|Crime|Thriller
Sabrina (1995),Comedy|Romance,Assassins (1995),Comedy|Romance
Tom and Huck (1995),Adventure|Children,Grumpier Old Men (1995),Adventure|Children
Sudden Death (1995),Action,"Amazing Panda Adventure, The (1995)",Action
GoldenEye (1995),Action|Adventure|Thriller,Fair Game (1995),Action|Adventure|Thriller
