In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
path = '/content/gdrive/MyDrive/colab notebook/data/movielens/'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

# for simplicity, choose 1000 sample
train_df = train_df[:1000]

In [19]:
# make sparse matrix
sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
sparse_matrix.index.name = 'movieId'
sp = sparse_matrix.copy()
sp = sp.fillna(0, inplace = False)
# fill sparse matrix with average of movie ratings
sparse_matrix_withmovie = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=1)
sparse_matrix_withmovie

userId,1,2,4,5,6,7,10,15,16,17,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.25,4.25,4.25,4.25,4.25,4.25,4.25,4.25,4.25,4.25,...,4.25,4.25,4.25,4.25,3.0,4.25,4.25,4.25,4.25,4.25
3,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,...,3.00,3.00,3.00,3.00,3.0,3.00,3.00,3.00,3.00,3.00
5,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,...,3.00,3.00,3.00,3.00,3.0,3.00,3.00,3.00,3.00,3.00
6,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,...,4.00,4.00,4.00,4.00,4.0,4.00,4.00,4.00,4.00,4.00
9,1.50,1.50,1.50,1.50,1.50,1.50,1.50,1.50,1.50,1.50,...,1.50,1.50,1.50,1.50,1.5,1.50,1.50,1.50,1.50,1.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165969,4.50,4.50,4.50,4.50,4.50,4.50,4.50,4.50,4.50,4.50,...,4.50,4.50,4.50,4.50,4.5,4.50,4.50,4.50,4.50,4.50
173291,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.00,2.00,2.00,2.00,2.0,2.00,2.00,2.00,2.00,2.00
174055,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,...,4.00,4.00,4.00,4.00,4.0,4.00,4.00,4.00,4.00,4.00
176371,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,...,4.00,4.00,4.00,4.00,4.0,4.00,4.00,4.00,4.00,4.00


In [20]:
# to numpy
sp = sp.to_numpy()
sparse_matrix_withmovie = sparse_matrix_withmovie.to_numpy()
sparse_matrix_withmovie

array([[4.25, 4.25, 4.25, ..., 4.25, 4.25, 4.25],
       [3.  , 3.  , 3.  , ..., 3.  , 3.  , 3.  ],
       [3.  , 3.  , 3.  , ..., 3.  , 3.  , 3.  ],
       ...,
       [4.  , 4.  , 4.  , ..., 4.  , 4.  , 4.  ],
       [4.  , 4.  , 4.  , ..., 4.  , 4.  , 4.  ],
       [2.  , 2.  , 2.  , ..., 2.  , 2.  , 2.  ]])

In [23]:
A = np.array([[2,3,4], [1,4,5]])

np.sum(A, where = A>3)

13

In [28]:
# MF class

class MF():
  def __init__(self, R, D, lr, lambd, iterations):

      self.R = R
      self.num_users, self.num_items = R.shape
      self.D = D
      self.lr = lr
      self.lambd = lambd
      self.iterations = iterations
      # user, item matrix
      self.U = np.random.normal(scale=1./self.D, size=(self.num_users, self.D))
      self.I = np.random.normal(scale=1./self.D, size=(self.num_items, self.D))
      # user bias and item bias (to be learned)
      self.b_u = np.zeros(self.num_users)
      self.b_i = np.zeros(self.num_items)
      # overall mean
      self.b = np.mean(self.R[np.where(self.R != 0)])
      self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0]
  def get_rating(self, i, j):
      return self.b + self.b_u[i] + self.b_i[j] + self.U[i, :].dot(self.I[j, :].T)

  def full_matrix(self):
      return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.U.dot(self.I.T)

  def mse(self):
        """
        A function to compute the total mean square error
        """
        predicted = self.full_matrix()
        return np.sum((self.R-predicted)*(self.R-predicted), where = self.R > 0 )

  def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            self.b_u[i] += self.lr * (e - self.lambd * self.b_u[i])
            self.b_i[j] += self.lr * (e - self.lambd * self.b_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            U_i = self.U[i, :][:]
            
            # Update user and item latent feature matrices
            self.U[i, :] += self.lr * (e * self.I[j, :] - self.lambd * self.U[i,:])
            self.I[j, :] += self.lr * (e * U_i - self.lambd * self.I[j,:])  


  def train(self):

        
      # Perform stochastic gradient descent for number of iterations
      training_process = []
      for i in range(self.iterations):
          np.random.shuffle(self.samples)
          self.sgd()
          mse = self.mse()
          training_process.append((i, mse))
          # if (i+1) % 10 == 0:
          print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
      return training_process

In [29]:
mf = MF(sp, D=50, lr=0.1, lambd=0.01, iterations=20)
training_process = mf.train()

Iteration: 1 ; error = 634.1206
Iteration: 2 ; error = 459.8092
Iteration: 3 ; error = 351.6443
Iteration: 4 ; error = 275.5865
Iteration: 5 ; error = 218.1775
Iteration: 6 ; error = 172.7943
Iteration: 7 ; error = 135.8451
Iteration: 8 ; error = 105.2319
Iteration: 9 ; error = 80.9861
Iteration: 10 ; error = 62.0284
Iteration: 11 ; error = 47.3033
Iteration: 12 ; error = 36.3019
Iteration: 13 ; error = 27.9422
Iteration: 14 ; error = 21.6736
Iteration: 15 ; error = 16.8893
Iteration: 16 ; error = 13.2647
Iteration: 17 ; error = 10.4806
Iteration: 18 ; error = 8.3626
Iteration: 19 ; error = 6.7197
Iteration: 20 ; error = 5.4643
