In [61]:
from typing import Dict, Text
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import pprint
import tempfile
#import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
# Load the ratings table from `ratings.csv` in the current working directory.
df_rating = pd.read_csv('ratings.csv')

In [3]:
# Display the ratings frame (pandas truncates long frames when rendering).
df_rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [69]:
# Look up the rating that user 42 gave to movie 4565.
# The ID columns are cast to str inside the comparison, so this cell selects
# the same row whether or not `df_rating` has already been converted with
# `astype(str)`. On a fresh top-to-bottom run the columns still hold the
# values parsed from the CSV at this point, and comparing those directly
# against the strings '42' / '4565' would silently match nothing.
df_42 = df_rating[
    (df_rating['userId'].astype(str) == '42')
    & (df_rating['movieId'].astype(str) == '4565')
]

In [70]:
# Show the matching rows (at most the first 40).
df_42.head(40)

Unnamed: 0,userId,movieId,rating,timestamp
6307,42,4565,2.0,996260446


In [5]:
# Cast every column to str; the ID columns are later fed to string lookup
# layers. NOTE(review): this also turns `rating` and `timestamp` into strings,
# so any numeric use of them needs an explicit conversion back.
df_rating = df_rating.astype(str)

In [7]:
# Check the result of the cast: string columns are reported as `object`.
df_rating.dtypes

userId       object
movieId      object
rating       object
timestamp    object
dtype: object

In [8]:
# Build a {column name -> NumPy array} mapping with one array per column of
# the ratings frame.
rating_features_dict = dict(
    (column_name, np.array(column))
    for column_name, column in df_rating.items()
)

In [9]:
# Display the per-column arrays built from the ratings frame.
rating_features_dict

{'userId': array(['1', '1', '1', ..., '610', '610', '610'], dtype=object),
 'movieId': array(['1', '3', '6', ..., '168250', '168252', '170875'], dtype=object),
 'rating': array(['4.0', '4.0', '4.0', ..., '5.0', '5.0', '3.0'], dtype=object),
 'timestamp': array(['964982703', '964981247', '964982224', ..., '1494273047',
        '1493846352', '1493846415'], dtype=object)}

In [10]:
import itertools

def slices(features):
    """Yield one example dict per row from a mapping of column sequences.

    Args:
        features: Mapping from feature name to an indexable, sized sequence
            (for example a NumPy array) holding that feature's values.

    Yields:
        A dict mapping every feature name to its value at the current row
        index, for rows 0, 1, 2, ... in order.

    Iteration ends once the shortest column is exhausted, rather than
    counting forever and failing with ``IndexError`` past the last row.
    An empty mapping yields nothing.
    """
    # Bound the row index by the shortest column so every lookup is valid.
    num_rows = min((len(values) for values in features.values()), default=0)
    for i in range(num_rows):
        # For each feature take index `i`
        yield {name: values[i] for name, values in features.items()}

In [11]:
# Peek at the first generated example only, printing one aligned
# "name: value" line per feature.
for example in itertools.islice(slices(rating_features_dict), 1):
    for name, value in example.items():
        print(f"{name:19s}: {value}")

userId             : 1
movieId            : 1
rating             : 4.0
timestamp          : 964982703


In [12]:
# Slice the column arrays into a tf.data dataset that yields one dict of
# scalar features per rating row.
features_ds = tf.data.Dataset.from_tensor_slices(rating_features_dict)

In [13]:
# Display the dataset's element spec (every feature is a scalar string).
features_ds

<TensorSliceDataset element_spec={'userId': TensorSpec(shape=(), dtype=tf.string, name=None), 'movieId': TensorSpec(shape=(), dtype=tf.string, name=None), 'rating': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [14]:
# Keep only the two ID features used for retrieval, under snake_case keys.
ratings = features_ds.map(
    lambda row: {"movie_id": row["movieId"], "user_id": row["userId"]}
)

In [26]:
# Load the movie catalogue from `movies.csv` in the current working directory.
df_movies = pd.read_csv('movies.csv')

In [27]:
# Display the movies frame (pandas truncates long frames when rendering).
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [36]:
# Cast every column to str so movie IDs have the same type as the string
# movie IDs carried by the ratings dataset.
df_movies = df_movies.astype(str)

In [37]:
# Build a {column name -> NumPy array} mapping with one array per column of
# the movies frame.
movies_features_dict = dict(
    (column_name, np.array(column))
    for column_name, column in df_movies.items()
)

In [38]:
# Slice the movie column arrays into a tf.data dataset, one dict per movie row.
movie_features_ds = tf.data.Dataset.from_tensor_slices(movies_features_dict)

In [39]:
# Candidate movie IDs for retrieval. Repeated IDs are dropped so that every
# movie is a single candidate: a duplicated candidate would be indexed more
# than once and come back as repeated entries in a top-k recommendation list.
movies = movie_features_ds.map(lambda x: x["movieId"]).unique()

In [40]:
# Lookup layer that maps raw user-ID strings to integer indices; no mask
# token is reserved.
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)

In [41]:
# Learn the user vocabulary from the user IDs that occur in the ratings.
user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"]))

In [42]:
# Lookup layer for movies; no mask token is reserved. Despite the name, the
# vocabulary is made of movie IDs (`movies` holds the `movieId` column), not
# titles.
movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)

In [43]:
# Learn the movie vocabulary from the candidate movie IDs.
movie_titles_vocabulary.adapt(movies)

In [46]:
class MovieLensModel(tfrs.Model):
    """Retrieval model that pairs a user model with a movie model.

    The class extends the ``tfrs.Model`` base class to cut down on
    boilerplate; underneath it is still an ordinary Keras model. Only the
    loss computation is defined here.
    """

    def __init__(
        self,
        user_model: tf.keras.Model,
        movie_model: tf.keras.Model,
        task: tfrs.tasks.Retrieval):
        """Keep references to both representation models and the task.

        Args:
            user_model: Model applied to the ``"user_id"`` feature.
            movie_model: Model applied to the ``"movie_id"`` feature.
            task: Retrieval task called with both embeddings to get the loss.
        """
        super().__init__()

        # Representation models for the two sides of the retrieval problem.
        self.user_model = user_model
        self.movie_model = movie_model

        # Objective that turns the two embeddings into a loss value.
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed the users and movies of a batch and return the task loss.

        Args:
            features: Batch dict; the ``"user_id"`` and ``"movie_id"``
                entries are read.
            training: Accepted as part of the signature; not used here.

        Returns:
            The value returned by the retrieval task for the two embeddings.
        """
        query_embeddings = self.user_model(features["user_id"])
        candidate_embeddings = self.movie_model(features["movie_id"])

        loss = self.task(query_embeddings, candidate_embeddings)
        return loss

In [47]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 64)
])
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    movies.batch(128).map(movie_model)
  )
)



In [48]:
# Assemble the retrieval model from the two ID models and the task, and
# compile it with Adagrad at a learning rate of 0.5.
model = MovieLensModel(user_model=user_model, movie_model=movie_model, task=task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Fit for three passes over the ratings, 4096 examples per batch.
model.fit(ratings.batch(4096), epochs=3)

# Build a brute-force index over (movie id, movie embedding) pairs, queried
# through the trained user model.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
candidate_batches = movies.batch(100).map(
    lambda movie_ids: (movie_ids, model.movie_model(movie_ids))
)
index.index_from_dataset(candidate_batches)

# Query the index for user "42" and print the first three returned IDs.
_, titles = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Top 3 recommendations for user 42: [b'3326' b'3326' b'3326']


In [71]:
# Display the brute-force index object built in the training cell.
index

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x25f089c2430>

In [66]:
# Get some recommendations.
# The label and the slice share one count so the printed text always matches
# the number of IDs shown (the label used to say "Top 3" while ten entries
# were printed).
num_recommendations = 10
_, titles = index(np.array(["42"]))
print(f"Top {num_recommendations} recommendations for user 42: "
      f"{titles[0, :num_recommendations]}")

Top 3 recommendations for user 42: [b'3326' b'3326' b'3326' b'4482' b'4482' b'4482' b'4565' b'4565' b'3120'
 b'3120']


# Ranking stage

In [63]:
# Preview the first rows of the ratings frame before building ranking data.
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [64]:
# Features for the ranking stage: both IDs plus the rating as the label.
# Every feature in `features_ds` is a string tensor, so the rating is parsed
# back to float32 here; a string label cannot be used by a numeric loss.
ratings = features_ds.map(lambda x: {
    "movie_id": x["movieId"],
    "user_id": x["userId"],
    "user_rating": tf.strings.to_number(x["rating"], out_type=tf.float32),
})

In [65]:
# Reproducible 80/20 train/test split over all ratings.
tf.random.set_seed(42)

# Take the sizes from the dataset itself instead of hard-coded counts: the
# shuffle buffer then spans the whole dataset, and no trailing examples are
# left out of both splits when the row count is not exactly 100,000.
num_ratings = int(ratings.cardinality().numpy())
assert num_ratings > 0, "ratings must be a finite, non-empty dataset"
train_size = int(0.8 * num_ratings)

# Shuffle once with a fixed order so `take` and `skip` see the same sequence
# and the two splits do not overlap.
shuffled = ratings.shuffle(num_ratings, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.skip(train_size)