# Packages

In [1]:
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow as tf

# Movie dataset

In [2]:
# Mount Google Drive (Colab only) so the CSV files below become readable.
from google.colab import drive
drive.mount('/content/drive')

# Single place to edit when the data lives somewhere else.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/RecSys/Data'

users = pd.read_csv(f'{DATA_DIR}/users.csv')
# NOTE(review): the ratings preview further down shows 'Unnamed: 0' and
# 'Unnamed: 0.1' columns, which look like index columns written out when the
# CSV was saved -- presumably safe to drop; confirm before relying on them.
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
movielens = pd.read_csv(f'{DATA_DIR}/movielens.csv')

# function to split dataframe into test and train
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
  """Randomly split a DataFrame into a training part and a held-out test part.

  Args:
    df: DataFrame to split.
    holdout_fraction: fraction of the rows sampled (without replacement) into
      the test set.
    random_state: optional seed forwarded to `DataFrame.sample`. Pass a fixed
      value to get the same split on every run; the default (None) keeps the
      original unseeded behaviour.

  Returns:
    A `(train, test)` tuple of DataFrames.
  """
  test = df.sample(frac=holdout_fraction, replace=False, random_state=random_state)
  # Rows are excluded by index label, so every row sharing a label with a
  # sampled row is removed from the training set as well.
  train = df[~df.index.isin(test.index)]
  return train, test

Mounted at /content/drive


In [42]:
# Preview the first rows of the ratings table to check which columns were loaded.
ratings.head()

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,0,195,241,3.0,881250949
1,1,185,301,3.0,891717742
2,2,21,376,1.0,878887116
3,3,243,50,2.0,880606923
4,4,165,345,1.0,886397596


# Sparse tensor

Sparse representation of the ratings matrix: only the observed (user, movie) ratings are stored.

In [4]:
def get_sparse_tensor(ratings_df):
  """Build a tf.SparseTensor holding the ratings found in `ratings_df`.

  Args:
    ratings_df: DataFrame with `user_id`, `movie_id` and `rating` columns.

  Returns:
    A tf.SparseTensor whose indices are the (user_id, movie_id) pairs, whose
    values are the ratings, and whose dense shape is taken from the row counts
    of the module-level `users` and `movies` frames.
  """
  # NOTE(review): presumably the ids are zero-based and smaller than the row
  # counts used for the dense shape -- confirm against the data.
  user_movie_pairs = ratings_df[['user_id', 'movie_id']].values
  observed_ratings = ratings_df['rating'].values
  # Shape of the full underlying matrix: one row per user, one column per movie.
  full_shape = [users.shape[0], movies.shape[0]]
  return tf.SparseTensor(user_movie_pairs, observed_ratings, full_shape)

# Split data into test, train and get their sparse tensors

In [5]:
# Randomly hold out part of the ratings (split_dataframe's default fraction).
train_ratings, test_ratings = split_dataframe(ratings)

# One sparse ratings matrix per split, built in the same order as before.
A_train, A_test = map(get_sparse_tensor, (train_ratings, test_ratings))

# Create Embeddings and initialize with normal distribution

In [6]:
embedding_dim = 50
init_stddev = 0.5

# U: user embeddings, one row per row of A_train.
# V: item/movie embeddings, one row per column of A_train.
# Both are drawn from a normal distribution with standard deviation init_stddev.
# NOTE(review): no random seed is set in the visible cells, so the initial
# values presumably differ from run to run.
U, V = (
    tf.Variable(
        tf.random.normal([A_train.dense_shape[axis], embedding_dim], stddev=init_stddev))
    for axis in (0, 1)
)

# Cost function

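Calculates the mean squared error between the observed ratings and the model's predictions for those same entries (the dot product of the corresponding user and movie embeddings).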

In [7]:
def cofi_cost_func_v(sparse_ratings, user_embeddings, movie_embeddings):
  """Mean squared error between observed ratings and their predictions.

  Args:
    sparse_ratings: object exposing `.indices` (positions of the observed
      ratings) and `.values` (the ratings themselves); the notebook passes a
      tf.SparseTensor.
    user_embeddings: user embedding matrix.
    movie_embeddings: movie embedding matrix.

  Returns:
    The scalar loss produced by tf.keras.losses.MeanSquaredError.
  """
  # Predicted rating for every (user, movie) pair: U times V transposed.
  all_scores = tf.matmul(user_embeddings, movie_embeddings, transpose_b=True)
  # Keep only the predictions at the observed positions.
  observed_scores = tf.gather_nd(all_scores, sparse_ratings.indices)

  loss_fn = tf.keras.losses.MeanSquaredError()
  return loss_fn(sparse_ratings.values, observed_scores)

# Training loss of the freshly initialised (untrained) embeddings, as a baseline.
print(cofi_cost_func_v(A_train, U, V))

tf.Tensor(16.923525, shape=(), dtype=float32)


# Train matrix factorization model

In [8]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-1)
iterations = 200
# The loop variable is named `step`, not `iter`, so the builtin `iter` is not
# shadowed for the rest of the kernel session.
for step in range(iterations):
  # Record the forward pass so the loss can be differentiated w.r.t. U and V.
  with tf.GradientTape() as tape:
    cost_value = cofi_cost_func_v(A_train, U, V)

  grads = tape.gradient(cost_value, [U, V])
  optimizer.apply_gradients(zip(grads, [U, V]))

  # Report progress every 20 iterations.
  if step % 20 == 0:
    print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 16.9
Training loss at iteration 20: 1.0
Training loss at iteration 40: 0.3
Training loss at iteration 60: 0.2
Training loss at iteration 80: 0.1
Training loss at iteration 100: 0.1
Training loss at iteration 120: 0.1
Training loss at iteration 140: 0.1
Training loss at iteration 160: 0.1
Training loss at iteration 180: 0.1


# Dot and Cosine similarity measures

In [9]:
DOT = 'dot'
COSINE = 'cosine'
def compute_similarity(query_embedding, item_embeddings, measure=DOT): # similarity
  """Score every item embedding against a single query embedding.

  Args:
    query_embedding: 1-D array-like, the embedding to compare against.
    item_embeddings: 2-D array-like with one item embedding per row.
    measure: COSINE normalises both sides to unit length before the dot
      product; any other value yields the plain dot product.

  Returns:
    A NumPy array with one score per row of `item_embeddings`.
  """
  query = np.asarray(query_embedding)
  items = np.asarray(item_embeddings)
  if measure == COSINE:
    item_norms = np.linalg.norm(items, axis=1, keepdims=True)
    query_norm = np.linalg.norm(query)
    # Divide zero-norm rows by 1 instead of 0 so an all-zero embedding scores
    # 0 rather than NaN.
    items = items / np.where(item_norms == 0, 1, item_norms)
    if query_norm != 0:
      query = query / query_norm
  return np.dot(items, query)

# Retrieve recommendations

In [39]:
def user_recommendations(userid, measure=DOT, exclude_rated=False, k=6):
  """Display the k highest-scoring movies for one user.

  Scores come from comparing the user's row of `U` with every row of `V`.

  Args:
    userid: row of `U` to use as the query; also matched against
      `ratings.user_id` when `exclude_rated` is set.
    measure: similarity measure handed to `compute_similarity`.
    exclude_rated: when True, drop movies this user has a rating for in `ratings`.
    k: number of rows to display.

  Returns:
    None; the table is rendered with `display.display`.
  """
  scores = compute_similarity(U[userid], V, measure)
  score_key = measure + ' score'
  df = pd.DataFrame({
      score_key: list(scores),
      'movie_id': movies['movie_id'],
      'titles': movies['title'],
      'genres': movies['all_genres'],
  })
  if exclude_rated:
    # Remove movies that are already rated. A vectorised membership test
    # replaces the per-row Python lookup while keeping the same rows.
    rated_movies = ratings[ratings.user_id == userid]["movie_id"].values
    df = df[~df.movie_id.isin(rated_movies)]
  display.display(df.sort_values([score_key], ascending=False).head(k))

In [40]:
# Top dot-product recommendations for user 923, skipping movies they already rated.
user_recommendations(923, measure=DOT, exclude_rated=True)

Unnamed: 0,dot score,movie_id,titles,genres
1250,10.544465,1250,A Chef in Love (1996),Comedy
1049,9.18156,1049,"Ghost and Mrs. Muir, The (1947)",Drama-Romance
706,8.915132,706,Enchanted April (1991),Drama
612,8.877105,612,My Man Godfrey (1936),Comedy
66,8.75095,66,Ace Ventura: Pet Detective (1994),Comedy
1093,8.630936,1093,"Thin Line Between Love and Hate, A (1996)",Comedy


# Retrieve movie neighbors

In [41]:
def movie_neighbors(title_substring, measure=DOT, k=6):
  """Display the k movies whose embeddings score highest against a given movie.

  The query movie is the first one whose title contains `title_substring`.

  Args:
    title_substring: literal text to look for in `movies['title']`.
    measure: similarity measure handed to `compute_similarity`.
    k: number of rows to display.

  Raises:
    ValueError: if no title contains `title_substring`.

  Returns:
    None; the table is rendered with `display.display`.
  """
  # Search for movie ids that match the given substring. The text is matched
  # literally (regex=False) so characters such as "(" or "+" in a title are not
  # interpreted as a pattern; missing titles count as non-matches (na=False).
  is_match = movies['title'].str.contains(title_substring, regex=False, na=False)
  matches = movies[is_match]
  ids = matches.index.values
  # Titles come straight from the matched rows, so index labels are never
  # reused as positions.
  titles = matches['title'].values
  if len(titles) == 0:
    raise ValueError("Found no movies with title %s" % title_substring)
  print("Nearest neighbors of : %s." % titles[0])
  if len(titles) > 1:
    print("[Found more than one matching movie. Other candidates: {}]".format(
        ", ".join(titles[1:])))
  # NOTE(review): the index label of the first match is used as the row of V;
  # presumably the index of `movies` coincides with movie_id -- confirm.
  movie_id = ids[0]
  scores = compute_similarity(
      V[movie_id], V,
      measure)
  score_key = measure + ' score'
  df = pd.DataFrame({
      score_key: list(scores),
      'titles': movies['title'],
      'genres': movies['all_genres']
  })
  display.display(df.sort_values([score_key], ascending=False).head(k))

In [16]:
# Movies closest to the first title containing "Aladdin", by cosine similarity.
movie_neighbors("Aladdin", COSINE)

Nearest neighbors of : Aladdin (1992).
[Found more than one matching movie. Other candidates: Aladdin and the King of Thieves (1996)]


Unnamed: 0,cosine score,titles,genres
94,1.0,Aladdin (1992),Animation-Children-Comedy-Musical
131,0.63866,"Wizard of Oz, The (1939)",Adventure-Children-Drama-Musical
587,0.630606,Beauty and the Beast (1991),Animation-Children-Musical
70,0.62443,"Lion King, The (1994)",Animation-Children-Musical
426,0.624003,To Kill a Mockingbird (1962),Drama
192,0.61716,"Right Stuff, The (1983)",Drama
