# Matrix Factorization Collaborative Filtering

### import dataset

In [17]:
#this notebook follows the tutorial written here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0
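#author's original notebook can be found here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0 (TODO: this is the same link as the tutorial on the previous line - replace it with the notebook's own URL if it differs)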

In [3]:
#import required libraries
import pandas as pd
import numpy as np
# Read the dataset CSV into a DataFrame and preview its first rows.
dataset_csv = '../datasets/MF dataset.csv'
data = pd.read_csv(dataset_csv)
data.head()


Unnamed: 0,3704,1924,4837,867,2631,5410,1733,3536,780,3312,...,207,5817,4097,4755,3977,1520,4622,2160,2542,1535
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,4,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,4,0,0,0,0,4,4,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,5,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Shrink the frame: keep only the first 1000 columns, then the rows whose
# index labels are 1..1000 (label-based selection via .loc).
kept_columns = data.columns[:1000]
column_subset = data[kept_columns]
kept_row_labels = range(1, 1001)
data = column_subset.loc[kept_row_labels]
data.head()

Unnamed: 0,3704,1924,4837,867,2631,5410,1733,3536,780,3312,...,1173,3940,1599,1140,1851,4556,4363,864,4026,5841
1,0,3,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,4,0
2,0,0,4,0,0,0,5,0,0,0,...,0,0,4,0,0,5,5,4,0,0
3,0,0,0,4,0,0,0,0,4,4,...,0,4,0,5,5,3,0,0,3,5
4,0,0,0,0,5,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,5
5,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### implementation of matrix factorization

In [18]:
#import required libraries
import numpy as np
import pandas as pd
from progressbar import progressbar

#### define matrix factorization function

In [61]:
#R is the matrix holding the true values; P and Q are the two factor matrices of R; K is the number of latent features (# of columns of P and, once Q is transposed inside the function, # of rows of Q)
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02, tol=1.15):
    """Factorise ``R`` into ``P @ Q.T`` with stochastic gradient descent.

    Only entries of ``R`` that are strictly greater than zero are treated as
    observed; all other cells are ignored during training and when the error
    is computed.

    Parameters
    ----------
    R : array-like, shape (N, M)
        Matrix of true values.
    P : array-like, shape (N, K)
        Initial row factors. Not modified; a float copy is trained instead.
    Q : array-like, shape (M, K)
        Initial column factors. Not modified; a float copy is trained instead.
    K : int
        Number of latent features that are updated and regularised.
    steps : int, default 1000
        Maximum number of passes over the observed entries.
    alpha : float, default 0.0002
        Learning rate.
    beta : float, default 0.02
        Regularisation strength.
    tol : float, default 1.15
        Training stops as soon as the error drops below this value.

    Returns
    -------
    (P, Q, e) : tuple
        Trained factors with shapes (N, K) and (M, K), and the final error:
        the sum of squared residuals over the observed entries plus
        ``beta / 2`` times the squared factor values of every observed pair
        (a regularised sum, not a mean). With ``steps=0`` the error of the
        initial factors is returned.
    """
    # The progress bar is cosmetic: fall back to plain iteration when the
    # progressbar package cannot be imported.
    try:
        from progressbar import progressbar as track
    except ImportError:
        track = iter

    R = np.atleast_2d(np.asarray(R))
    # Train on float copies so the caller's initial factors stay untouched
    # (re-running the call then always starts from the same point).
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T  # transposed to shape (K, M)

    # Positions of the observed entries, found once instead of rescanning every
    # cell of R on each step. argwhere yields them in row-major order, i.e. the
    # same visiting order as a nested row/column loop.
    observed = np.argwhere(R > 0)
    rows, cols = observed[:, 0], observed[:, 1]
    targets = R[rows, cols]

    def regularized_error():
        # Squared residuals over the observed cells plus the L2 penalty on the
        # first K factor values of every observed (row, column) pair.
        predictions = np.sum(P[rows, :] * Q[:, cols].T, axis=1)
        squared = np.sum((targets - predictions) ** 2)
        penalty = (beta / 2) * (np.sum(P[rows, :K] ** 2) + np.sum(Q[:K, cols] ** 2))
        return float(squared + penalty)

    e = regularized_error()  # defined even when steps == 0
    for _ in track(range(steps)):
        for i, j in observed:
            # eij is the gap between the true value and the current prediction
            # (ith row of P dotted with jth column of the transposed Q).
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            # Gradient step on all K factors at once. Q's update uses the row
            # of P that was just updated, exactly as a per-factor loop that
            # updates P[i][k] and then Q[k][j] would.
            P[i, :K] += alpha * (2 * eij * Q[:K, j] - beta * P[i, :K])
            Q[:K, j] += alpha * (2 * eij * P[i, :K] - beta * Q[:K, j])

        e = regularized_error()
        if e < tol:  # good enough, stop early
            break
    return P, Q.T, e



#### let's try this out on a small scale

In [62]:
# Toy matrix R used to try the factorisation on a small scale.
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])
# N = number of rows, M = number of columns, K = number of latent features
N, M = R.shape
K = 2

In [63]:
# Draw the initial factor matrices P (N x K) and Q (M x K) uniformly from
# [0, 1). A seeded generator is used so that restarting the kernel and running
# all cells reproduces the same starting point (and therefore the same result).
rng = np.random.default_rng(42)
P = rng.random((N, K))
Q = rng.random((M, K))

In [64]:
# Train on the toy matrix, then rebuild the full prediction matrix as the
# product of the learned factors.
nP, nQ, err = matrix_factorization(R, P, Q, K)
nR = nP @ nQ.T

100% (1000 of 1000) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


In [65]:
# Display the reconstructed matrix (product of the learned factors nP and nQ.T).
nR

array([[3.31256729, 1.62583729, 3.29008474, 3.18153712],
       [2.86161973, 0.97245633, 2.64025074, 2.02420611],
       [2.65557764, 1.58141069, 2.76750905, 3.0165784 ],
       [2.21662145, 1.50125712, 2.3947685 , 2.82176329],
       [4.4449897 , 1.93717508, 4.30055497, 3.85938482]])

In [66]:
# Display the final error value returned by matrix_factorization.
err

23.92989420025926

### now let's try this on a real dataset

In [67]:
# Factorise the reduced real dataset with the same procedure as the toy example.
# NOTE(review): if `data` still carries the CSV's saved index as an
# "Unnamed: 0" column, that column is treated as ratings here - confirm, and
# drop it before converting if so.
R = np.array(data)
N = len(R)
M = len(R[0])
K = 2

# Seeded initial factors so a fresh run reproduces the same result.
rng = np.random.default_rng(42)
P = rng.random((N, K))
Q = rng.random((M, K))

# matrix_factorization returns three values (P, Q, error); unpacking into only
# two names raises "ValueError: too many values to unpack".
nP, nQ, err = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)


  0% (3 of 1000) |                       | Elapsed Time: 0:00:42 ETA:   3:58:12

KeyboardInterrupt: 

In [16]:
# Display the reconstructed matrix for the real dataset.
# NOTE(review): if the training cell above did not run to completion, nR still
# holds the result of an earlier run - re-run training before trusting this output.
nR

array([[3.24874064, 3.25695921, 3.61321061, ..., 3.39596391, 3.1324354 ,
        3.58250416],
       [4.69952414, 3.36152106, 3.97813295, ..., 4.34261368, 3.88219232,
        4.57352225],
       [4.84465515, 3.55654022, 4.18535182, ..., 4.5152277 , 4.04593954,
        4.75589791],
       ...,
       [4.09858166, 1.65658958, 2.29001127, ..., 3.24901304, 2.77264732,
        3.41361772],
       [5.4155791 , 3.98025159, 4.68282211, ..., 5.04926629, 4.52494347,
        5.31843085],
       [3.93340544, 3.2824416 , 3.76335454, ..., 3.83263983, 3.47479189,
        4.03943043]])