In [1]:
import pandas as pd
import numpy as np
import sys
from timeit import default_timer as timer

In [2]:
import data

In [3]:
data.train

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [4]:
data.test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
train = pd.read_csv('./netflix_train.csv', index_col=False)
test = pd.read_csv('./netflix_test.csv', index_col=False)

In [6]:
train.drop(['Unnamed: 0'], axis=1, inplace = True)
test.drop(['Unnamed: 0'], axis=1, inplace = True)

In [7]:
train = train.pivot_table(train, index = 'User_ID', columns = 'Movie_ID')

In [8]:
test = test.pivot_table(test, index = 'User_ID', columns = 'Movie_ID')

In [9]:
train.fillna(0.0, inplace = True)
test.fillna(0.0, inplace = True)

In [10]:
test.head()

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie_ID,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
User_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
test = test.to_numpy()

In [15]:
test.shape

(143458, 1350)

In [16]:
train = train.to_numpy()

In [17]:
class MatrixFactorizaion():
    def __init__(self, train, test, k, learning_rate, reg_param, epochs, verbose = False):
        self.R = train
        self.test = test
        self.num_users, self.num_items = train.shape
        self.k = k
        self.learning_rate = learning_rate
        self.reg_param = reg_param
        self.epochs = epochs
        self.verbose = verbose
        
    def fit(self):
        #initialize latent mat
        self._P = np.random.normal(scale = 1./self.k, size = (self.num_users, self.k))
        self._Q = np.random.normal(scale = 1./self.k, size = (self.num_items, self.k))
        
        #initialize biases
        self._b_P = np.zeros(self.num_users)
        self._b_Q = np.zeros(self.num_items)
        self._b = np.mean(self.R[np.where(self.R != 0)])
        
        self.training_epochs = []
        start = timer()
        
        for epoch in range(self.epochs):
            
            for u in range(self.num_users):
                for i in range(self.num_items):
                    if self.R[u,i] > 0:
                        self.gradient_descent(u,i, self.R[u,i])
                        
                
            train_cost, test_cost = self.cost()
            self.training_epochs.append((epoch, train_cost, test_cost))
            
            if self.verbose == True and ((epoch + 1) % 10 == 0):
                print('iter: %d, train_cost = %.4f, test_cost = %.4f' % (epoch+1, train_cost, test_cost))
                
        print('time: %.4f seconds' % (timer()-start))
    
    #RMSE computation
    def cost(self):
        xi, yi = self.R.nonzero()
        test_x, test_y = self.test.nonzero()
        predicted = self.get_complete_matrix()
        cost_train = 0; cost_test = 0
        
        for x, y in zip(xi,yi):
            cost_train += pow(self.R[x,y] - predicted[x,y], 2)
        for i,j in zip(test_x, test_y):
            cost_test += pow(self.test[i,j] - predicted[i,j], 2)
            
        return np.sqrt(cost_train/len(xi)), np.sqrt(cost_test/len(test_x))
    
    def gradient(self, error, u, i):
        dp = (error * self._Q[i,:]) - (self.reg_param * self._P[u,:])
        dq = (error * self._P[u,:]) - (self.reg_param * self._Q[i,:])
        return dp, dq
    
    def gradient_descent(self, u, i, rating):
        prediction = self.get_prediction(u,i)
        error = rating - prediction
        
        self._b_P[u] += self.learning_rate * (error - self.reg_param * self._b_P[u])
        self._b_Q[i] += self.learning_rate * (error - self.reg_param * self._b_Q[i])
        
        dp, dq = self.gradient(error, u, i)
        self._P[u,:] += self.learning_rate * dp
        self._Q[i,:] += self.learning_rate * dq
        
    def get_prediction(self, u, i):
        return self._b + self._b_P[u] + self._b_Q[i] + self._P[u,:].dot(self._Q[i,:].T)
    
    def get_complete_matrix(self):
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis, :] + self._P.dot(self._Q.T)
    
    def print_result(self):
        print('final R predicted: \n')
        print(self.get_complete_matrix())
        print('final RMSE:')
        print(self.training_epochs[self.epochs-1][2])

In [18]:
np.random.seed(7)

factorize = MatrixFactorizaion(train, test, k=40, learning_rate=0.01, reg_param=0.01, epochs=10, verbose=True)
factorize.fit()
factorize.print_result()

KeyboardInterrupt: 