In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import collections

In [2]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '../data/ml-100k/ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    '../data/ml-100k/ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    '../data/ml-100k/ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: str(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: str(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

# Since some movies can belong to more than one genre, we create different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.
def mark_genres(movies, genres):
    def get_random_genre(gs):
        active = [genre for genre, g in zip(genres, gs) if g==1]
        if len(active) == 0:
            return 'Other'
        return np.random.choice(active)
    def get_all_genres(gs):
        active = [genre for genre, g in zip(genres, gs) if g==1]
        if len(active) == 0:
            return 'Other'
        return '-'.join(active)
    movies['genre'] = [
          get_random_genre(gs) for gs in zip(*[movies[genre] for genre in genres])]
    movies['all_genres'] = [
          get_all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]

mark_genres(movies, genre_cols)

# Create one merged DataFrame containing all the movielens data.
movielens = ratings.merge(movies, on='movie_id').merge(users, on='user_id')

In [3]:
# years = movies['year'].unique()
# years = np.delete(years, np.where(years=='nan'))
# years = years.astype(np.float)

# # Filling nan values
# movies["year"] = movies["year"].apply(lambda s: "1992" if s=="nan" else s)
# movies["year"] = movies["year"].astype(np.float)

rated_movies = ratings[["movie_id", "user_id"]].groupby(["user_id"], as_index=False).aggregate(lambda x: list(x))
rated_movies.head(5)

Unnamed: 0,user_id,movie_id
0,0,"[60, 188, 32, 159, 19, 201, 170, 264, 154, 116..."
1,1,"[291, 250, 49, 313, 296, 289, 311, 280, 12, 27..."
2,10,"[110, 557, 731, 226, 424, 739, 722, 37, 724, 1..."
3,100,"[828, 303, 595, 221, 470, 404, 280, 251, 281, ..."
4,101,"[767, 822, 69, 514, 523, 321, 624, 160, 447, 4..."


In [4]:
years_dict = {
    movie: year for movie, year in zip(movies["movie_id"], movies["year"])
}
genres_dict = {
    movie: genres.split('-')
    for movie, genres in zip(movies["movie_id"], movies["all_genres"])
}
def make_batch(ratings, batch_size):
    """Creates a batch of examples.
    Args:
    ratings: A DataFrame of ratings such that examples["movie_id"] is a list of
      movies rated by a user.
    batch_size: The batch size.
    """
    def pad(x, fill):
        return pd.DataFrame.from_dict(x).fillna(fill).values

    movie = []
    year = []
    genre = []
    label = []
    for movie_ids in ratings["movie_id"].values:
        movie.append(movie_ids)
        genre.append([x for movie_id in movie_ids for x in genres_dict[movie_id]])
        year.append([years_dict[movie_id] for movie_id in movie_ids])
        label.append([int(movie_id) for movie_id in movie_ids])
    features = {
      "movie_id": pad(movie, ""),
      "year": pad(year, ""),
      "genre": pad(genre, ""),
      "label": pad(label, -1)
    }
    
    batch = (
      tf.data.Dataset.from_tensor_slices(features)
      .shuffle(1000)
      .repeat()
      .batch(batch_size)
#       .make_one_shot_iterator()
#       .get_next()
    )
    return features

def select_random(x):
    """Selectes a random elements from each row of x."""
    def to_float(x):
        return tf.cast(x, tf.float32)
    def to_int(x):
        return tf.cast(x, tf.int64)
    batch_size = tf.shape(x)[0]
    rn = tf.range(batch_size)
    nnz = to_float(tf.math.count_nonzero(x >= 0, axis=1))
    rnd = tf.random.uniform([batch_size])
    ids = tf.stack([to_int(rn), to_int(nnz * rnd)], axis=1)
    return to_int(tf.gather_nd(x, ids))

In [5]:
def split_dataframe(df, holdout_fraction=0.1):
    """Splits a DataFrame into training and test sets.
    Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
    Returns:
    train: dataframe for training
    test: dataframe for testing
    """
    test = df.sample(frac=holdout_fraction, replace=False)
    train = df[~df.index.isin(test.index)]
    return train, test

In [6]:
def make_embedding_col(key, embedding_dim):
    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=key, vocabulary_list=list(set(movies[key].values)), num_oov_buckets=0)
    return tf.feature_column.embedding_column(
        categorical_column=categorical_col, dimension=embedding_dim,
        # default initializer: trancated normal with stddev=1/sqrt(dimension)
        combiner='mean')

In [7]:
def softmax_loss(user_embeddings, movie_embeddings, labels):
    """Returns the cross-entropy loss of the softmax model.
    Args:
    user_embeddings: A tensor of shape [batch_size, embedding_dim].
    movie_embeddings: A tensor of shape [num_movies, embedding_dim].
    labels: A tensor of [batch_size], such that labels[i] is the target label
      for example i.
    Returns:
    The mean cross-entropy loss.
    """
    # Verify that the embddings have compatible dimensions
    user_emb_dim = user_embeddings.shape[1]
    movie_emb_dim = movie_embeddings.shape[1]
    if user_emb_dim != movie_emb_dim:
        raise ValueError(
            "The user embedding dimension %d should match the movie embedding "
            "dimension % d" % (user_emb_dim, movie_emb_dim))

    logits = tf.matmul(user_embeddings, movie_embeddings, transpose_b=True)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=labels))
    return loss

In [8]:
tfds = make_batch(rated_movies, 300)

In [30]:
tfds

{'movie_id': array([['60', '188', '32', ..., '', '', ''],
        ['291', '250', '49', ..., '', '', ''],
        ['110', '557', '731', ..., '', '', ''],
        ...,
        ['46', '162', '516', ..., '', '', ''],
        ['3', '267', '78', ..., '', '', ''],
        ['343', '353', '267', ..., '', '', '']], dtype=object),
 'year': array([['1994', '1992', '1995', ..., '', '', ''],
        ['1997', '1997', '1977', ..., '', '', ''],
        ['1996', '1994', '1993', ..., '', '', ''],
        ...,
        ['1994', '1974', '1979', ..., '', '', ''],
        ['1995', '1997', '1993', ..., '', '', ''],
        ['1997', '1998', '1997', ..., '', '', '']], dtype=object),
 'genre': array([['Drama', 'Animation', 'Comedy', ..., '', '', ''],
        ['Drama', 'Comedy', 'Action', ..., '', '', ''],
        ['Comedy', 'Romance', 'Drama', ..., '', '', ''],
        ...,
        ['Comedy', 'Drama', 'Comedy', ..., '', '', ''],
        ['Action', 'Comedy', 'Drama', ..., '', '', ''],
        ['Drama', 'Comedy', '

In [34]:
tfds["movie_id"][0].__len__()

737

In [9]:
embedding_cols=[
          make_embedding_col("movie_id", 35),
          make_embedding_col("genre", 3),
          make_embedding_col("year", 2),
      ]

In [29]:
feature_layers = tf.keras.layers.DenseFeatures(embedding_cols, trainable=True)
initializer = tf.keras.initializers.TruncatedNormal(stddev=1./np.sqrt(35))
values = initializer(shape=(40,35))/10.
w = tf.Variable(values, trainable=None)
optimizer = tf.optimizers.Adagrad(learning_rate=8.)
trainable_weights = feature_layers.trainable_weights
# trainable_weights = feature_layers.trainable_weights
for i in range(500):
    with tf.GradientTape() as tape:
        train_rated_movies, test_rated_movies = split_dataframe(rated_movies)
        train_batch = make_batch(train_rated_movies, 200)
        test_batch = make_batch(test_rated_movies, 100)
        #train
        inputs = feature_layers(train_batch)
        outputs = tf.matmul(inputs, w)
        labels = select_random(train_batch["label"])
#         print(feature_layers.weights, feature_layers.with_name_scope)
        loss = softmax_loss(outputs, feature_layers.weights[1].numpy(), labels)
        
        #test
        inputs_test = feature_layers(test_batch)
        outputs_test = tf.matmul(inputs_test, w)
        labels_test = select_random(test_batch["label"])
        loss_test = softmax_loss(outputs_test, feature_layers.weights[1].numpy(), labels_test)
        
        if i%10==0:
            print("iter"+str(i)+"---loss: " + str(loss.numpy()) + "; test loss: " + str(loss_test.numpy()))
    grads = tape.gradient(loss, trainable_weights)
    optimizer.apply_gradients(zip(grads, trainable_weights))

iter0---loss: 7.4281826; test loss: 7.428577
iter10---loss: 7.427947; test loss: 7.4271917
iter20---loss: 7.427975; test loss: 7.4283676
iter30---loss: 7.4277005; test loss: 7.4290824
iter40---loss: 7.427827; test loss: 7.4280076


KeyboardInterrupt: 

In [14]:
iter(tfds)

<dict_keyiterator at 0x1b260456770>

In [15]:
initializer = tf.keras.initializers.TruncatedNormal(stddev=1./np.sqrt(35))
values = initializer(shape=(40,35))/10.

values

<tf.Tensor: shape=(40, 35), dtype=float32, numpy=
array([[ 0.00908667, -0.00389983,  0.00691063, ...,  0.01126931,
        -0.00636813, -0.00505206],
       [ 0.02162261, -0.00619991,  0.01322017, ...,  0.00556174,
        -0.03104937,  0.00985521],
       [ 0.00211462, -0.00412473,  0.00116795, ..., -0.03139753,
        -0.02429502,  0.03321886],
       ...,
       [-0.00427012,  0.01636573, -0.00585202, ...,  0.00477287,
        -0.00790855, -0.02028035],
       [ 0.0181758 , -0.01303392,  0.02147099, ..., -0.00164768,
        -0.01768864, -0.0032793 ],
       [ 0.00325453,  0.02362872,  0.01909943, ..., -0.00799608,
         0.02900949, -0.01466182]], dtype=float32)>

In [10]:
class CFModel(object):
    """Simple class that represents a collaborative filtering model"""
    def __init__(self, embedding_vars, loss, metrics=None):
        """Initializes a CFModel.
        Args:
          embedding_vars: A dictionary of tf.Variables.
          loss: A float Tensor. The loss to optimize.
          metrics: optional list of dictionaries of Tensors. The metrics in each
            dictionary will be plotted in a separate figure during training.
        """
        self._embedding_vars = embedding_vars
        self._loss = loss
        self._metrics = metrics
        self._embeddings = {k: None for k in embedding_vars}
        self._session = None

    @property
    def embeddings(self):
        """The embeddings dictionary."""
        return self._embeddings

    def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
            optimizer=tf.compat.v1.train.GradientDescentOptimizer):
        """Trains the model.
        Args:
          iterations: number of iterations to run.
          learning_rate: optimizer learning rate.
          plot_results: whether to plot the results at the end of training.
          optimizer: the optimizer to use. Default to GradientDescentOptimizer.
        Returns:
          The metrics dictionary evaluated at the last iteration.
        """
        with self._loss.graph.as_default():
            opt = optimizer(learning_rate)
            train_op = opt.minimize(self._loss)
            local_init_op = tf.group(
              tf.compat.v1.variables_initializer(opt.variables()),
              tf.compat.v1.local_variables_initializer())
            if self._session is None:
                self._session = tf.compat.v1.Session()
                with self._session.as_default():
                    self._session.run(tf.compat.v1.global_variables_initializer())
                    self._session.run(tf.compat.v1.tables_initializer())
                    tf.compat.v1.train.start_queue_runners()

        with self._session.as_default():
            local_init_op.run()
            iterations = []
            metrics = self._metrics or ({},)
            metrics_vals = [collections.defaultdict(list) for _ in self._metrics]
        # Train and append results.
        for i in range(num_iterations + 1):
            _, results = self._session.run((train_op, metrics))
            if (i % 10 == 0) or i == num_iterations:
                print("\r iteration %d: " % i + ", ".join(
                    ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                end='')
                iterations.append(i)
                for metric_val, result in zip(metrics_vals, results):
                    for k, v in result.items():
                        metric_val[k].append(v)

        for k, v in self._embedding_vars.items():
            self._embeddings[k] = v.eval()

#       if plot_results:
#         # Plot the metrics.
#         num_subplots = len(metrics)+1
#         fig = plt.figure()
#         fig.set_size_inches(num_subplots*10, 8)
#         for i, metric_vals in enumerate(metrics_vals):
#           ax = fig.add_subplot(1, num_subplots, i+1)
#           for k, v in metric_vals.items():
#             ax.plot(iterations, v, label=k)
#           ax.set_xlim([1, num_iterations])
#           ax.legend()
        return results

In [19]:
def build_softmax_model_2(rated_movies, embedding_cols, hidden_dims):
    def create_network(features):
        """Maps input features dictionary to user embeddings.
        Args:
          features: A dictionary of input string tensors.
        Returns:
          outputs: A tensor of shape [batch_size, embedding_dim].
        """
        # Create a bag-of-words embedding for each sparse feature.
        inputs = tf.compat.v1.feature_column.input_layer(features, embedding_cols)
        print(inputs)
        # Hidden layers.
        input_dim = inputs.shape[1]
        for i, output_dim in enumerate(hidden_dims):
#             w = tf.compat.v1.get_variable(
#               "hidden%d_w_" % i, shape=[input_dim, output_dim],
#               initializer=tf.compat.v1.truncated_normal_initializer(
#                   stddev=1./np.sqrt(output_dim))) / 10.
            initializer = tf.keras.initializers.TruncatedNormal(1./np.sqrt(output_dim))
            values = initializer(shape=[input_dim, output_dim])/10.
            w = tf.Variable(values, trainable=None)
            outputs = tf.matmul(inputs, w)
            input_dim = output_dim
            inputs = outputs
        return outputs

    train_rated_movies, test_rated_movies = split_dataframe(rated_movies)
    train_batch = make_batch(train_rated_movies, 200)
    test_batch = make_batch(test_rated_movies, 100)

    with tf.compat.v1.variable_scope("model", reuse=False):
        # Train
        train_user_embeddings = create_network(train_batch)
        train_labels = select_random(train_batch["label"])
    with tf.compat.v1.variable_scope("model", reuse=True):
        # Test
        test_user_embeddings = create_network(test_batch)
        test_labels = select_random(test_batch["label"])
        #     print("*"*80, inputs)
        movie_embeddings = tf.compat.v1.get_variable(
            "input_layer/movie_id_embedding/embedding_weights")

    test_loss = softmax_loss(
      test_user_embeddings, movie_embeddings, test_labels)
    train_loss = softmax_loss(
      train_user_embeddings, movie_embeddings, train_labels)
    _, test_precision_at_10 = tf.compat.v1.metrics.precision_at_k(
      labels=test_labels,
      predictions=tf.matmul(test_user_embeddings, movie_embeddings, transpose_b=True),
      k=10)

    metrics = (
      {"train_loss": train_loss, "test_loss": test_loss},
      {"test_precision_at_10": test_precision_at_10}
    )
    embeddings = {"movie_id": movie_embeddings}
    return CFModel(embeddings, train_loss, metrics)

In [21]:
def build_softmax_model(rated_movies, embedding_cols, hidden_dims):
    def create_network(features):
        """Maps input features dictionary to user embeddings.
        Args:
          features: A dictionary of input string tensors.
        Returns:
          outputs: A tensor of shape [batch_size, embedding_dim].
        """
        # Create a bag-of-words embedding for each sparse feature.
        inputs = tf.compat.v1.feature_column.input_layer(features, embedding_cols)
        print(inputs)
        # Hidden layers.
        input_dim = inputs.shape[1]
        for i, output_dim in enumerate(hidden_dims):
            w = tf.compat.v1.get_variable(
              "hidden%d_w_" % i, shape=[input_dim, output_dim],
              initializer=tf.compat.v1.truncated_normal_initializer(
                  stddev=1./np.sqrt(output_dim))) / 10.
            outputs = tf.matmul(inputs, w)
            input_dim = output_dim
            inputs = outputs
        return outputs

    train_rated_movies, test_rated_movies = split_dataframe(rated_movies)
    train_batch = make_batch(train_rated_movies, 200)
    test_batch = make_batch(test_rated_movies, 100)

    with tf.compat.v1.variable_scope("model", reuse=False):
        # Train
        train_user_embeddings = create_network(train_batch)
        train_labels = select_random(train_batch["label"])
    with tf.compat.v1.variable_scope("model", reuse=True):
        # Test
        test_user_embeddings = create_network(test_batch)
        test_labels = select_random(test_batch["label"])
        #     print("*"*80, inputs)
        movie_embeddings = tf.compat.v1.get_variable(
            "input_layer/movie_id_embedding/embedding_weights")

    test_loss = softmax_loss(test_user_embeddings, movie_embeddings, test_labels)
    train_loss = softmax_loss(train_user_embeddings, movie_embeddings, train_labels)
    _, test_precision_at_10 = tf.compat.v1.metrics.precision_at_k(
          labels=test_labels,
          predictions=tf.matmul(test_user_embeddings, movie_embeddings, transpose_b=True),
          k=10)

    metrics = (
      {"train_loss": train_loss, "test_loss": test_loss},
      {"test_precision_at_10": test_precision_at_10}
    )
    embeddings = {"movie_id": movie_embeddings}
    return CFModel(embeddings, train_loss, metrics)

In [22]:
with tf.Graph().as_default():
  softmax_model = build_softmax_model(
      rated_movies,
      embedding_cols=[
          make_embedding_col("movie_id", 35),
          make_embedding_col("genre", 3),
          make_embedding_col("year", 2),
      ],
      hidden_dims=[35])

softmax_model.train(
    learning_rate=8., num_iterations=1, optimizer=tf.compat.v1.train.AdagradOptimizer)

Tensor("model/input_layer/concat:0", shape=(849, 40), dtype=float32)
Tensor("model_1/input_layer/concat:0", shape=(94, 40), dtype=float32)
 iteration 1: train_loss=7.428047, test_loss=7.429975, test_precision_at_10=0.000000

RuntimeError: Trying to eval in EAGER mode

In [26]:
tf.compat.v1.train.AdagradOptimizer?