In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np

In [3]:
class Matrix_Factorization(tf.keras.Model):
    """Placeholder subclass of `tf.keras.Model`; it defines no behaviour of its own yet."""


In [19]:
# Column names for each table; they are passed to `read_csv` via `names=`.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols

# Read the three tables (pipe-separated users/items, tab-separated ratings).
users = pd.read_csv(
    '../data/ml-100k/ml-100k/u.user',
    sep='|',
    names=users_cols,
    encoding='latin-1',
)
ratings = pd.read_csv(
    '../data/ml-100k/ml-100k/u.data',
    sep='\t',
    names=ratings_cols,
    encoding='latin-1',
)
movies = pd.read_csv(
    '../data/ml-100k/ml-100k/u.item',
    sep='|',
    names=movies_cols,
    encoding='latin-1',
)

# Since the ids start at 1, we shift them to start at 0. The shifted ids are
# stored as strings in all three tables.
users["user_id"] = (users["user_id"] - 1).astype(str)
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
ratings["movie_id"] = (ratings["movie_id"] - 1).astype(str)
ratings["user_id"] = (ratings["user_id"] - 1).astype(str)

# Ratings become floats.
ratings["rating"] = ratings["rating"].astype(float)

# The year is whatever follows the last '-' in the release date's string form
# (a missing date therefore yields the string 'nan').
movies["year"] = movies['release_date'].map(lambda date: str(date).rsplit('-', 1)[-1])

# Since some movies can belong to more than one genre, we create different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.
def mark_genres(movies, genres, seed=None):
    """Add `genre` and `all_genres` columns to `movies`, in place.

    For every row, the "active" genres are the entries of `genres` whose
    indicator column equals 1 in that row.

    Args:
        movies: a pd.DataFrame holding one indicator column per name in
            `genres`. It is modified in place.
        genres: the genre column names, in the order they should appear in
            `all_genres`.
        seed: optional. With the default `None`, the random genre is drawn with
            `np.random.choice`, i.e. from NumPy's global random state (the
            original behaviour). Any other value is handed to
            `np.random.default_rng` and the draws come from that private
            generator, which makes the `genre` column reproducible.

    Returns:
        None. The results are written to two columns of `movies`:
        - `genre`: one genre sampled uniformly from the row's active genres,
          or 'Other' when the row has none.
        - `all_genres`: the row's active genres joined with '-', or 'Other'
          when the row has none.
    """
    genres = list(genres)
    if seed is None:
        choose = np.random.choice
    else:
        choose = np.random.default_rng(seed).choice

    sampled_genres = []
    joined_genres = []
    # Walk the indicator matrix once, row by row, so the active genres are
    # computed a single time per movie. Iterating rows (rather than zipping
    # columns) also keeps one entry per movie when `genres` is empty.
    for flags in movies[genres].to_numpy():
        active = [genre for genre, flag in zip(genres, flags) if flag == 1]
        if active:
            sampled_genres.append(choose(active))
            joined_genres.append('-'.join(active))
        else:
            sampled_genres.append('Other')
            joined_genres.append('Other')

    movies['genre'] = sampled_genres
    movies['all_genres'] = joined_genres

# Write the `genre` / `all_genres` columns into `movies` (done in place).
mark_genres(movies, genres=genre_cols)

# Create one merged DataFrame containing all the movielens data: ratings are
# joined to their movie on 'movie_id', and the result to its user on 'user_id'.
movielens = pd.merge(
    pd.merge(ratings, movies, on='movie_id'),
    users,
    on='user_id',
)

In [24]:
# Preview the first three rows of the ratings table.
ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,195,241,3.0,881250949
1,185,301,3.0,891717742
2,21,376,1.0,878887116


In [22]:
def build_rating_sparse_tensor(ratings_df, num_users=None, num_movies=None):
    """Build the sparse user-by-movie ratings matrix.

    Args:
        ratings_df: a pd.DataFrame with `user_id`, `movie_id` and `rating`
            columns. The two id columns are used directly as row and column
            positions, so they must hold integers or integer-like strings.
        num_users: optional number of rows of the matrix. Defaults to the
            number of rows of the notebook-level `users` table.
        num_movies: optional number of columns of the matrix. Defaults to the
            number of rows of the notebook-level `movies` table.

    Returns:
        a tf.SparseTensor representing the ratings matrix, with one entry per
        row of `ratings_df`.
    """
    # This notebook stores the ids as strings; convert them to int64 here
    # explicitly instead of relying on an implicit cast inside TensorFlow.
    indices = ratings_df[['user_id', 'movie_id']].to_numpy().astype(np.int64)
    values = ratings_df['rating'].to_numpy()

    # Fall back to the notebook's global tables only when no size was given,
    # so the function can also be used on data that is not tied to them.
    if num_users is None:
        num_users = users.shape[0]
    if num_movies is None:
        num_movies = movies.shape[0]

    return tf.SparseTensor(
        indices=indices,
        values=values,
        dense_shape=[num_users, num_movies])

In [30]:
# Build the sparse ratings matrix from the ratings table, then display its
# first three stored values alongside their (user, movie) index pairs.
sparse_rating = build_rating_sparse_tensor(ratings)
sparse_rating.values[:3], sparse_rating.indices[:3]

(<tf.Tensor: shape=(3,), dtype=float64, numpy=array([3., 3., 1.])>,
 <tf.Tensor: shape=(3, 2), dtype=int64, numpy=
 array([[195, 241],
        [185, 301],
        [ 21, 376]], dtype=int64)>)

In [34]:
# Number of distinct user ids and distinct movie ids present in the ratings.
len(ratings["user_id"].unique()), len(ratings["movie_id"].unique())

(943, 1682)

In [38]:
# One 5-dimensional embedding row per row / column of the ratings matrix,
# initialised from a normal distribution with standard deviation 1.
user_embedding = tf.Variable(
    initial_value=tf.random.normal(
        shape=[sparse_rating.dense_shape[0], 5],
        stddev=1.0,
    )
)
item_embedding = tf.Variable(
    initial_value=tf.random.normal(
        shape=[sparse_rating.dense_shape[1], 5],
        stddev=1.0,
    )
)

In [42]:
# For every stored rating, look up the embedding rows of its user and its item
# and take their dot product (elementwise product summed over the embedding axis).
tf.reduce_sum(
    tf.multiply(
        tf.gather(user_embedding, sparse_rating.indices[:, 0]),
        tf.gather(item_embedding, sparse_rating.indices[:, 1]),
    ),
    axis=1,
)

<tf.Tensor: shape=(100000,), dtype=float32, numpy=
array([-3.3062525 , -3.5642443 ,  4.115395  , ...,  0.17213285,
       -6.2068768 , -0.7492338 ], dtype=float32)>

In [44]:
# The rating values stored in the sparse matrix, one per (user, movie) index pair.
sparse_rating.values

<tf.Tensor: shape=(100000,), dtype=float64, numpy=array([3., 3., 1., ..., 1., 2., 3.])>

In [45]:
def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
    """Mean squared error of the embedding model on the stored ratings only.

    Args:
        sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M].
        user_embeddings: A dense Tensor U of shape [N, k] where k is the
            embedding dimension, such that U_i is the embedding of user i.
        movie_embeddings: A dense Tensor V of shape [M, k] where k is the
            embedding dimension, such that V_j is the embedding of movie j.

    Returns:
        A scalar Tensor representing the MSE between the true ratings and the
        model's predictions.
    """
    # Column 0 of the indices is the user position, column 1 the movie position.
    user_positions = sparse_ratings.indices[:, 0]
    movie_positions = sparse_ratings.indices[:, 1]

    user_rows = tf.gather(user_embeddings, user_positions)
    movie_rows = tf.gather(movie_embeddings, movie_positions)

    # The prediction for a pair is the dot product of its two embedding rows.
    predicted = tf.reduce_sum(user_rows * movie_rows, axis=1)
    return tf.losses.mean_squared_error(sparse_ratings.values, predicted)

In [55]:
# The two embedding matrices are the only variables being optimised.
trainable_weights = [user_embedding, item_embedding]
# Adam with its default settings.
optimizer = tf.optimizers.Adam()
# 1000 update steps; every step evaluates the loss on all stored ratings.
for i in range(1000):
    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as tape:
        loss = sparse_mean_square_error(sparse_rating, user_embedding, item_embedding)
    # Gradients of the loss w.r.t. both embedding matrices, applied in one step.
    grads = tape.gradient(loss, trainable_weights)
    optimizer.apply_gradients(zip(grads, trainable_weights))
# After the loop, `loss` holds the value computed in the final iteration,
# i.e. before that iteration's update was applied.

In [57]:
# Loss computed in the last executed training iteration.
loss

<tf.Tensor: shape=(), dtype=float32, numpy=1.3984052>

In [59]:
# Score every (user, movie) pair at once: user embeddings times the transpose
# of the item embeddings.
predict_ratings = tf.linalg.matmul(
    user_embedding,
    item_embedding,
    transpose_b=True,
)
predict_ratings.shape

TensorShape([943, 1682])

In [61]:
# Movie positions for the user in row 1, ordered from the lowest to the highest
# predicted score (so the strongest predictions are at the end of the result).
tf.argsort(predict_ratings[1], direction='ASCENDING')

<tf.Tensor: shape=(1682,), dtype=int32, numpy=array([1572, 1485, 1574, ...,  167, 1456, 1397])>