# Matrix Factorization Collaborative Filtering

### import dataset

In [18]:
#this notebook follows the tutorial written here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0
#author's original notebook can be found here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0

In [19]:
#import required libraries
import pandas as pd
import numpy as np
data = pd.read_csv('datasets/MF dataset.csv')
data.head()


Unnamed: 0,3704,1924,4837,867,2631,5410,1733,3536,780,3312,...,207,5817,4097,4755,3977,1520,4622,2160,2542,1535
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,4,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,4,0,0,0,0,4,4,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,5,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#reduce the dataset a bit
data = data[data.columns[:1000]].loc[range(1,1001)]
data.head()

Unnamed: 0,3704,1924,4837,867,2631,5410,1733,3536,780,3312,...,1173,3940,1599,1140,1851,4556,4363,864,4026,5841
1,0,3,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,4,0
2,0,0,4,0,0,0,5,0,0,0,...,0,0,4,0,0,5,5,4,0,0
3,0,0,0,4,0,0,0,0,4,4,...,0,4,0,5,5,3,0,0,3,5
4,0,0,0,0,5,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,5
5,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### implementation of matrix factorization

In [4]:
#import required libraries
import numpy as np
import pandas as pd
from progressbar import progressbar

#### define matrix factorization function

In [15]:
#R is matrix holding true values, P and Q are the two matrix factors of R, P represents # of columns of P and # of rows of Q
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T 
    #Q will be same dimension as P when transposed
    
    #set up iterations, progressbar is handy for tracking for loops that take a while
    for step in progressbar(range(steps)):
        #runs script through number of 'steps', which is # times program runs through process of improvement
        for i in range(len(R)):
            for j in range(len(R[i])):
                #i and j track row num and colum num, respectively. can be used to access any cell in matrix
                if R[i][j]>0: #ensures value is valid
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    #var eij is the difference between true value and prediction - which is achieved by multiplying 
                    #ith row of P and jth row of Q
                    
                    #now perform updating with a simplified gradient descent formula
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
                        
        eR = np.dot(P,Q)
        #need to calculate error to see how well predictions are doing
        e = 0 #error variable 
        
        #calculates the mean squared error for each value and saves it to error variable e
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:], Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j], 2))
        if e < 0.001: #set threshold for training to stop
            break
    return P, Q.T



#### let's try this out on a small scale

In [11]:
#set matrix R
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]
R = np.array(R)
#define N, M, K
N = len(R)
M = len(R[0])
K = 2

In [12]:
#set randomized values for P and Q
P = np.random.rand(N,K)
Q = np.random.rand(M,K)

In [13]:
#call function
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

100% (10000 of 10000) |##################| Elapsed Time: 0:00:04 Time:  0:00:04


In [14]:
nR

array([[4.97693304, 2.98300361, 2.73081787, 1.00155351],
       [3.98114748, 2.40185278, 2.41268242, 0.99971702],
       [1.00847698, 0.9810141 , 6.03172882, 4.96864773],
       [0.9958494 , 0.89566452, 4.89322522, 3.98173764],
       [1.20524295, 1.01913872, 4.97859805, 3.99819575]])

### now let's try this on a real dataset

In [16]:
R = np.array(data)
N = len(R)
M = len(R[0])
K = 2

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)


 40% (406 of 1000) |########             | Elapsed Time: 0:43:02 ETA:   1:00:19

KeyboardInterrupt: 

In [17]:
nR

array([[4.97693304, 2.98300361, 2.73081787, 1.00155351],
       [3.98114748, 2.40185278, 2.41268242, 0.99971702],
       [1.00847698, 0.9810141 , 6.03172882, 4.96864773],
       [0.9958494 , 0.89566452, 4.89322522, 3.98173764],
       [1.20524295, 1.01913872, 4.97859805, 3.99819575]])