In [174]:
import time

import pandas as pd
import numpy as np
import torch
import pyro

from numpy.linalg import inv

In [57]:
ratings = pd.read_csv('../data/the-movies-dataset/ratings_small.csv')

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [60]:
unique_userId = ratings.userId.unique()
unique_movieId = ratings.movieId.unique()

ratings['new_user_index'], ratings['new_movie_index'] = 0, 0

for old_id, new_id in zip(unique_userId, range(len(unique_userId))):
    ratings['new_user_index'].iloc[ratings[ratings['userId']==old_id].index.tolist()] = new_id

for old_id, new_id in zip(unique_movieId, range(len(unique_movieId))):
    ratings['new_movie_index'].iloc[ratings[ratings['movieId']==old_id].index.tolist()] = new_id

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,userId,movieId,rating,timestamp,new_user_index,new_movie_index
0,1,31,2.5,1260759144,0,0
1,1,1029,3.0,1260759179,0,1
2,1,1061,3.0,1260759182,0,2
3,1,1129,2.0,1260759185,0,3
4,1,1172,4.0,1260759205,0,4
5,1,1263,2.0,1260759151,0,5
6,1,1287,2.0,1260759187,0,6
7,1,1293,2.0,1260759148,0,7
8,1,1339,3.5,1260759125,0,8
9,1,1343,2.0,1260759131,0,9


In [None]:
print(ratings.head())

In [33]:
# Defining Parameters/Hyperparameters
I = ratings.userId.nunique()
J = ratings.movieId.nunique()
data_var = ratings['rating'].var(axis=0)
k = 3
mean = 0
std = 1

In [101]:
u = pyro.sample("user_factor", pyro.distributions.Normal(mean, std), sample_shape=torch.Size([I,k]))
v = pyro.sample("movie_factor", pyro.distributions.Normal(mean, std), sample_shape=torch.Size([J,k]))

In [118]:
user_to_movie_dict = {}
movie_to_user_dict = {}

for i in range(I):
    user_to_movie_dict[i] = ratings[ratings['new_user_index']==i]['new_movie_index'].tolist()
for j in range(J):
    movie_to_user_dict[j] = ratings[ratings['new_movie_index']==j]['new_user_index'].tolist()

In [177]:
# Update of all u_i: Takes 110.70313000679016 seconds
for i in range(len(u)):
    v_sum = 0
    mv_sum = 0
    for j in user_to_movie_dict[i]:
        v_sum += np.outer(v[j],v[j])
        mv_sum += float(ratings[(ratings['new_user_index']==i) & (ratings['new_movie_index']==j)].rating) * v[j]
    u[i] = torch.from_numpy(np.dot(inv(std * data_var * np.identity(k) + v_sum), mv_sum))

# Update of all v_j: Takes 106.21870613098145 seconds
for j in range(len(v)):
    u_sum = 0
    mu_sum = 0
    for i in movie_to_user_dict[j]:
        u_sum += np.outer(u[i],u[i])
        mu_sum += float(ratings[(ratings['new_user_index']==i) & (ratings['new_movie_index']==j)].rating) * u[i]
    v[j] = torch.from_numpy(np.dot(inv(std * data_var * np.identity(k) + u_sum), mu_sum))

M_prime = torch.mm(u,v.T)
error = 0
for i in range(len(u)):
    for j in user_to_movie_dict[i]:
        error += (float(ratings[(ratings['new_user_index']==i) & (ratings['new_movie_index']==j)].rating) - M_prime[i][j])**2
print('MSE is of Epoch ' + str(epoch_num) + ': ' + str(float(error/len(ratings))))

110.70313000679016
106.21870613098145


In [None]:
class PMF:
    def __init__(self, data, k, prior_mean=0, prior_std=1, epochs=10):
        self.ratings = data
        self.k = k
        self.prior_mean = prior_mean
        self.prior_std = prior_std
        self.u = pyro.sample("user_factor", pyro.distributions.Normal(prior_mean, prior_std), sample_shape=torch.Size([I,k]))
        self.v = pyro.sample("movie_factor", pyro.distributions.Normal(prior_mean, prior_std), sample_shape=torch.Size([J,k]))
        self.epochs = epochs
        self.I = ratings.userId.nunique()
        self.J = ratings.movieId.nunique()
        self.data_var = ratings['rating'].var(axis=0)
        
    def fit(self):
        user_to_movie_dict = {}
        movie_to_user_dict = {}

        for i in range(I):
            user_to_movie_dict[i] = self.ratings[self.ratings['new_user_index']==i]['new_movie_index'].tolist()
        for j in range(J):
            movie_to_user_dict[j] = self.ratings[self.ratings['new_movie_index']==j]['new_user_index'].tolist()
            
        for epoch_num in range(self.epochs):
            # Update of all u_i: Takes 110.70313000679016 seconds
            for i in range(len(self.u)):
                v_sum = 0
                mv_sum = 0
                for j in user_to_movie_dict[i]:
                    v_sum += np.outer(self.v[j],self.v[j])
                    mv_sum += float(self.ratings[(self.ratings['new_user_index']==i) & (self.ratings['new_movie_index']==j)].rating) * self.v[j]
                self.u[i] = torch.from_numpy(np.dot(inv(self.std * self.data_var * np.identity(self.k) + v_sum), mv_sum))

            # Update of all v_j: Takes 106.21870613098145 seconds
            for j in range(len(self.v)):
                u_sum = 0
                mu_sum = 0
                for i in movie_to_user_dict[j]:
                    u_sum += np.outer(self.u[i],self.u[i])
                    mu_sum += float(self.ratings[(self.ratings['new_user_index']==i) & (self.ratings['new_movie_index']==j)].rating) * self.u[i]
                v[j] = torch.from_numpy(np.dot(inv(self.std * self.data_var * np.identity(self.k) + u_sum), mu_sum))

            M_prime = torch.mm(self.u,self.v.T)
            error = 0
            for i in range(len(self.u)):
                for j in user_to_movie_dict[i]:
                    error += (float(self.ratings[(self.ratings['new_user_index']==i) & (self.ratings['new_movie_index']==j)].rating) - M_prime[i][j])**2
            print('MSE is of Epoch ' + str(epoch_num) + ': ' + str(float(error/len(self.ratings))))

In [None]:
model = PMF(ratings, 3)

In [None]:
model.fit()