In [3]:
# downloaded from https://grouplens.org/datasets/movielens/25m/

In [5]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import time
from tqdm import tqdm
dbg = True
# from google.colab import drive
# drive.mount('/content/gdrive')
nrows = {True:100000, False:None}[dbg]
rating_df = pd.read_csv('ml-25m/ratings.csv', nrows=nrows)
tag_df = pd.read_csv('ml-25m/tags.csv', nrows=nrows)
movie_df = pd.read_csv('ml-25m/movies.csv')
rating_df['userId'] = rating_df['userId'] - 1
rating_df['movieId'] = rating_df['movieId'] - 1
tag_df['userId'] = tag_df['userId'] - 1
tag_df['movieId'] = tag_df['movieId'] - 1
movie_df['movieId'] = movie_df['movieId'] - 1

rating_df['movieId'] = rating_df['movieId'].map({v: i for i, v in enumerate(rating_df['movieId'].unique())})
mismatch = set(rating_df["movieId"]).difference(movie_df["movieId"])
rating_df = rating_df.loc[~rating_df['movieId'].isin(mismatch)]

movie_counts = rating_df['movieId'].value_counts()
popular_movies = movie_counts[(movie_counts > 30)].index
rating_df = rating_df.loc[rating_df['movieId'].isin(popular_movies)]
user_counts = rating_df['userId'].value_counts()
popular_users = user_counts[(user_counts > 30)].index
rating_df = rating_df.loc[rating_df['userId'].isin(popular_users)]

movie_df = movie_df.loc[movie_df['movieId'].isin(rating_df['movieId'].unique())]
movie_id_map = {v:k for k, v in enumerate(rating_df['movieId'].unique())}
movie_df['movieId'] = movie_df['movieId'].map(movie_id_map).astype(int)
rating_df['movieId'] = rating_df['movieId'].map(movie_id_map).astype(int)
user_id_map = {v:k for k, v in enumerate(rating_df['userId'].unique())}
rating_df['userId'] = rating_df['userId'].map(user_id_map).astype(int)
rating_df = rating_df.reset_index(drop=True)
movie_df = movie_df.reset_index(drop=True)
mismatch = set(movie_df["movieId"]).difference(rating_df["movieId"])
movie_df = movie_df.drop(mismatch)
num_users = rating_df['userId'].nunique()
num_movies = rating_df['movieId'].nunique()

In [6]:
train_rows=np.random.choice(rating_df.shape[0], int(0.8*rating_df.shape[0]), replace=False)
ratings_train = rating_df.loc[train_rows]
test_rows = set(range(rating_df.shape[0])).difference(set(train_rows))
ratings_test = rating_df.loc[test_rows]

In [7]:
minrating, maxrating = rating_df['rating'].describe().loc[['min', 'max']]

In [32]:
class RecEngine:
    def __init__(self, ratings_train, ratings_test, num_users, num_movies, minrating, maxrating, emb_size=8,
                 early_stopping = False, chk_freq = 10, *args, **kwargs):
        tf.compat.v1.reset_default_graph()
        self.user_vec = tfvar("uservec", shape=(num_users, emb_size), dtype=tf.float32)
        self.movie_vec = tfvar("movievec", shape=(num_movies, emb_size), dtype=tf.float32)
        self.user_bias_vec = tfvar("userbiasvec", shape=(num_users), dtype=tf.float32)
        self.movie_bias_vec = tfvar("moviebiasvec", shape=(num_movies), dtype=tf.float32)
        self.ratings_train = ratings_train
        self.ratings_test = ratings_test

        self.user_ph = tfph(tf.int32, shape=(None))
        self.movie_ph = tfph(tf.int32, shape=(None))
        self.rating_ph = tfph(tf.float32, shape=(None))
        
        self.losses = []
        self.chk_freq = chk_freq
        self.val_losses = []
        self.early_stopping = early_stopping
        if early_stopping and chk_frq == 0:
            raise ValueError('need a check freq if using early stopping')
        self.n_trn = ratings_train.shape[0]
        
    def sample_at(self, rows):

        samples = self.ratings_train.iloc[rows]
        fd = {self.movie_ph: samples['movieId'], self.user_ph: samples['userId'], self.rating_ph: samples['rating']}
        return fd
    def get_fd_train(self):
        fd = {self.movie_ph: self.ratings_train['movieId'], 
             self.user_ph: self.ratings_train['userId'], 
             self.rating_ph: self.ratings_train['rating']}
        return fd
    def get_fd_test(self):
        fd =  {self.movie_ph: self.ratings_test['movieId'], 
                self.user_ph: self.ratings_test['userId'], 
                self.rating_ph: self.ratings_test['rating']}
        return fd
    def train(self, epochs = 100, minibatch = True, batch_size = 64):
        for ep in range(epochs):
            if minibatch:
                data_order = np.arange(self.n_trn)
                np.random.shuffle(data_order)
                num_batches = n_trn // batch_size
                loss_avg = 0
                for batch_idx in range(num_batches):
                    if batch_idx != num_batches - 1:
                        rows = data_order[batch_idx*batch_size: (batch_idx+1) * batch_size]
                    else:
                        rows = data_order[batch_idx*batch_size:]
                    fd = self.sample_at(rows)
                    current_loss, _ = self.sess.run([self.loss, self.opt], fd)
                    loss_avg += current_loss * len(rows) / n_trn
                self.losses.append(loss_avg)

            else:
                fd = self.get_fd_train()
                current_loss, _ = self.sess.run([self.loss, self.opt], fd)
                self.losses.append(current_loss)

            if early_stopping:
                if ep % chk_freq == 0:
                    last_chkpnt = f'rec_tst_{ep}'
                    self.saver.save(self.sess, last_chkpnt)
            fd_test = self.get_fd_test()
            val_loss = self.sess.run(self.loss,fd_test)

            self.val_losses.append(val_loss)
            if early_stopping:
                if all([los > val_losses[ep - (ep % 10)] for los in self.val_losses[-3:]]):
                    self.saver.restore(self.sess, last_chkpnt)
                    fd = self.get_fd_test()
                    restored_loss = self.sess.run(self.loss, fd)
                    self.val_losses.append(restored_loss)
                    break

In [33]:
class MF(RecEngine):
    def __init__(self, minrating, maxrating, *args, **kwargs):
        super().__init__(minrating=minrating, maxrating=maxrating, *args, **kwargs)
#         tf.compat.v1.reset_default_graph()
#         self.user_vec = tfvar("uservec", shape=(num_users, emb_size), dtype=tf.float32)
#         self.movie_vec = tfvar("movievec", shape=(num_movies, emb_size), dtype=tf.float32)
#         self.user_bias_vec = tfvar("userbiasvec", shape=(num_users), dtype=tf.float32)
#         self.movie_bias_vec = tfvar("moviebiasvec", shape=(num_movies), dtype=tf.float32)
#         self.ratings_train = ratings_train

#         self.user_ph = tfph(tf.int32, shape=(None))
#         self.movie_ph = tfph(tf.int32, shape=(None))
#         self.rating_ph = tfph(tf.float32, shape=(None))

        # batch size, emb_size
        self.user_emb = tf.gather(self.user_vec, self.user_ph)
        self.movie_emb = tf.gather(self.movie_vec, self.movie_ph)

        # batch size
        self.user_bias_emb = tf.gather(self.user_bias_vec, self.user_ph)
        self.movie_bias_emb = tf.gather(self.movie_bias_vec, self.movie_ph)

        self.score_raw = (
            tf.reduce_sum(self.user_emb * self.movie_emb, axis=1) + self.user_bias_emb + self.movie_bias_emb
        )
        self.score = self.score_raw * (maxrating - minrating) + minrating

        self.reg = tf.reduce_mean(tf.square(self.user_emb)) + tf.reduce_mean(tf.square(self.movie_emb))

        self.mse = tf.reduce_mean(tf.square(self.score - self.rating_ph))
        self.loss = self.mse + self.reg
        self.opt_fcn = tf.compat.v1.train.AdamOptimizer()
        self.opt = self.opt_fcn.minimize(self.loss)
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.saver = tf.compat.v1.train.Saver()

In [34]:
tfph = tf.compat.v1.placeholder
tfvar = tf.compat.v1.get_variable
tf.compat.v1.disable_eager_execution()
mf_mdl = MF(ratings_train=ratings_train, num_users=num_users, num_movies=num_movies,
            minrating=minrating, maxrating=maxrating, emb_size=8)

In [35]:
mf_mdl.train()

AttributeError: 'MF' object has no attribute 'ratings_test'

In [19]:
epochs = 200
losses = []
chk_freq = 10
val_losses = []
early_stopping = False
batch_size = 64
minibatch = 1
n_trn = ratings_train.shape[0]
for ep in range(epochs):
    if minibatch:
        data_order = np.arange(n_trn)
        np.random.shuffle(data_order)
        num_batches = n_trn // batch_size
        loss_avg = 0
        for batch_idx in range(num_batches):
            if batch_idx != num_batches - 1:
                rows = data_order[batch_idx*batch_size: (batch_idx+1) * batch_size]
            else:
                rows = data_order[batch_idx*batch_size:]
            samples = ratings_train.iloc[rows]
            
            current_loss, _ = sess.run([loss, opt], {movie_ph: samples['movieId'], 
                                             user_ph: samples['userId'], 
                                             rating_ph: samples['rating']})
            loss_avg += current_loss * len(rows) / n_trn
        losses.append(loss_avg)
        
    else:
        current_loss, _ = sess.run([loss, opt], {movie_ph: ratings_train['movieId'], 
                                             user_ph: ratings_train['userId'], 
                                             rating_ph: ratings_train['rating']})
        losses.append(current_loss)
    
    if early_stopping:
        if ep % chk_freq == 0:
            last_chkpnt = f'rec_tst_{ep}'
            saver.save(sess, last_chkpnt)
    
    val_loss = sess.run(loss, {
        movie_ph: ratings_test['movieId'], 
        user_ph: ratings_test['userId'], 
        rating_ph: ratings_test['rating']})
    
    val_losses.append(val_loss)
    if early_stopping:
        if all([los > val_losses[ep - (ep % 10)] for los in val_losses[-3:]]):
            saver.restore(sess, last_chkpnt)
            restored_loss = sess.run(loss, {
                        movie_ph: ratings_test['movieId'], 
                        user_ph: ratings_test['userId'], 
                        rating_ph: ratings_test['rating']})
            val_losses.append(restored_loss)
            break

NameError: name 'sess' is not defined

In [None]:
plt.plot(losses, label='train')
plt.plot(val_losses, label='validation')
plt.legend()

In [None]:
sess.run(user_emb, {movie_ph: ratings_train['movieId'], 
                                             user_ph: ratings_train['userId'], 
                                             rating_ph: ratings_train['rating']})