# Matrix Factorization Collaborative Filtering

### import dataset

In [1]:
#this notebook follows the tutorial written here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0
#author's original notebook can be found here: https://medium.com/analytics-vidhya/matrix-factorization-as-a-recommender-system-727ee64683f0

In [2]:
#import required libraries
import pandas as pd
import numpy as np
#read the ratings table from the local datasets folder, then preview its first rows
#NOTE(review): the preview printed further down lists an 'Unnamed: 0' column - confirm whether
#that is a saved index that should be read with index_col=0 rather than kept as data
data = pd.read_csv(filepath_or_buffer='../datasets/MF dataset.csv')
data.head()


FileNotFoundError: [Errno 2] File ../datasets/MF dataset.csv does not exist: '../datasets/MF dataset.csv'

In [4]:
#shrink the table: keep the first 1000 columns and the rows labelled 1 through 1000
data = data.loc[list(range(1, 1001)), data.columns[:1000]]
data.head()

Unnamed: 0,3704,1924,4837,867,2631,5410,1733,3536,780,3312,...,1173,3940,1599,1140,1851,4556,4363,864,4026,5841
1,0,3,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,4,0
2,0,0,4,0,0,0,5,0,0,0,...,0,0,4,0,0,5,5,4,0,0
3,0,0,0,4,0,0,0,0,4,4,...,0,4,0,5,5,3,0,0,3,5
4,0,0,0,0,5,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,5
5,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### implementation of matrix factorization

In [1]:
#import required libraries
import numpy as np
import pandas as pd
from progressbar import progressbar

#### define matrix factorization function

In [13]:
#R is the matrix holding the true values, P and Q are its two factors,
#K is the number of latent features (# of columns of P and # of columns of Q as passed in)
def _regularized_squared_error(R, P, Q, K, beta, observed):
    """Sum of squared errors over the observed cells plus the L2 penalty on the first K factors.

    Q is expected in its transposed (K x M) layout, as used inside matrix_factorization.
    """
    total = 0
    for i, j in observed:
        total = total + pow(R[i, j] - np.dot(P[i, :], Q[:, j]), 2)
        total = total + (beta / 2) * (np.sum(P[i, :K] ** 2) + np.sum(Q[:K, j] ** 2))
    return total


def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.0002, beta=0.02, threshold=1.15):
    """Factorize R into P (N x K) and Q (M x K) with stochastic gradient descent.

    Only cells of R that are greater than 0 are treated as observed; every other
    cell is skipped during training and when measuring the error.

    Parameters:
        R: 2-D array-like of true values (N rows, M columns).
        P: initial N x K factor matrix.
        Q: initial M x K factor matrix.
        K: number of latent features to update.
        steps: maximum number of passes over the observed cells.
        alpha: learning rate.
        beta: L2 regularization weight.
        threshold: training stops early once the regularized squared error
            drops below this value (defaults to the previously hard-coded 1.15).

    Returns:
        (P, Q, e): the trained factors (Q in M x K layout) and the regularized
        sum of squared errors from the last completed pass. With steps <= 0 the
        error of the untouched initial factors is returned.

    Raises:
        ValueError: if R is not 2-dimensional.

    The caller's P and Q are copied, not modified in place, so re-running a
    call with the same inputs starts from the same point.
    """
    R = np.asarray(R)
    if R.ndim != 2:
        raise ValueError("R must be a 2-dimensional matrix")

    #work on float copies so the caller's arrays are left untouched and integer inputs are not truncated
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T
    #Q is now K x M, so column j of Q pairs with row i of P

    #the set of observed cells never changes, so find it once (row-major order, same as a nested i/j loop)
    observed = list(zip(*np.nonzero(R > 0)))

    e = None
    #set up iterations, progressbar is handy for tracking for loops that take a while
    for step in progressbar(range(steps)):
        for i, j in observed:
            #eij is the difference between the true value and the prediction,
            #which is the dot product of the ith row of P and the jth column of Q
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])

            #simultaneous gradient step: Q must be updated from the value P had
            #before this step, so keep a copy of the old row
            p_old = P[i, :K].copy()
            P[i, :K] += alpha * (2 * eij * Q[:K, j] - beta * p_old)
            Q[:K, j] += alpha * (2 * eij * p_old - beta * Q[:K, j])

        #regularized sum of squared errors after this pass, used as the stopping criterion
        e = _regularized_squared_error(R, P, Q, K, beta, observed)
        if e < threshold:
            break

    if e is None:
        #no pass ran (steps <= 0): report the error of the initial factors instead of failing
        e = _regularized_squared_error(R, P, Q, K, beta, observed)
    return P, Q.T, e



#### let's try this out on a small scale

In [14]:
#toy matrix to factorize: 5 rows by 5 columns (cells equal to 0 are skipped by matrix_factorization)
R = np.array([
    [1, 4, 0, 2, 3],
    [0, 0, 4, 0, 3],
    [3, 0, 5, 2, 1],
    [5, 3, 0, 1, 0],
    [1, 0, 3, 2, 0],
])
#N = number of rows, M = number of columns, K = number of latent features
N, M = R.shape
K = 2

In [15]:
#seed the generator so the random starting factors (and the results that follow) are reproducible on re-run
rng = np.random.default_rng(42)
#P gets one K-long row per row of R, Q one K-long row per column of R; values are uniform in [0, 1)
P = rng.random((N, K))
Q = rng.random((M, K))

In [16]:
#factorize R, then multiply the learned factors back together to rebuild the full matrix
nP, nQ, err = matrix_factorization(R, P, Q, K)
nR = nP @ nQ.T

100% (10000 of 10000) |##################| Elapsed Time: 0:00:07 Time:  0:00:07


In [17]:
#show the reconstructed matrix (product of the learned factors nP and nQ.T)
nR

array([[ 0.95521421,  3.96954603,  3.32671121,  2.1799951 ,  2.88603378],
       [ 1.38865223,  4.44215501,  3.9846542 ,  2.39881048,  2.98996681],
       [ 3.09214682,  3.78029467,  4.95572652,  1.79800985,  1.1123303 ],
       [ 4.93420852,  2.97968055,  5.95136088,  1.09910079, -0.99509471],
       [ 0.97387733,  3.49179783,  3.03578095,  1.90060105,  2.43851579]])

In [18]:
#show the error value returned by matrix_factorization for this run
err

1.378877051345683

In [3]:
#wider, sparser matrix: 5 rows by 50 columns (cells equal to 0 are skipped by matrix_factorization)
R = np.array([
    [1,0,0,0,0,0,0,2,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,3,2,0,1,0,0,3,1,0,0,3,1,0],
    [0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,2,0,3,0,0,0,0,1,0,0,0,1,0,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0],
    [3,0,2,1,2,0,0,3,2,0,0,0,0,0,2,0,0,3,2,0,0,1,0,2,0,1,0,0,0,0,0,0,1,0,1,0,3,0,0,0,0,0,0,0,2,0,1,0,0,0],
    [2,0,0,0,0,0,0,2,1,0,0,0,0,2,0,1,0,3,1,0,0,0,0,0,0,0,0,2,1,0,2,0,0,2,0,2,0,3,0,0,1,0,0,3,1,0,0,2,0,0],
])
#N = number of rows, M = number of columns, K = number of latent features
N, M = R.shape
K = 2

In [4]:
#seed the generator so the random starting factors (and the results that follow) are reproducible on re-run
rng = np.random.default_rng(42)
#P gets one K-long row per row of R, Q one K-long row per column of R; values are uniform in [0, 1)
P = rng.random((N, K))
Q = rng.random((M, K))

In [5]:
#factorize the wider R, then multiply the learned factors back together to rebuild the full matrix
nP, nQ, err = matrix_factorization(R, P, Q, K)
nR = nP @ nQ.T

100% (10000 of 10000) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


In [6]:
#show the reconstructed 5 x 50 matrix (product of the learned factors nP and nQ.T)
nR

array([[ 1.29239879,  0.90756619,  1.45082164, -0.28720382,  1.7776599 ,
         1.18597352,  1.14506156,  2.00363095,  0.90772929,  1.745552  ,
         0.99672516,  1.35128675,  0.49433876,  1.92625094,  1.22292386,
         0.90135472,  0.78939149,  3.03266683,  1.13300506,  0.89461016,
         1.77661859,  0.94397456,  1.51860595,  1.44646307,  0.97473966,
         0.89856386,  0.87243238,  1.90613195,  0.43104421,  0.3753689 ,
         2.07989794,  1.70116952,  0.4394679 ,  1.7664204 ,  0.84653328,
         1.73794557,  1.7842928 ,  3.09200734,  1.96595951,  0.65745728,
         0.94387152,  1.10802696,  1.91021835,  3.05686678,  0.90965714,
         1.54227846,  0.50719124,  2.60896988,  0.99930017,  1.17911236],
       [ 2.54609203,  1.57270646,  1.3297147 ,  1.349642  ,  1.07770567,
         2.43621719,  1.31573962,  2.03071424,  1.57322256,  1.31746806,
         0.91835799,  1.22382759,  1.11554916,  2.34633583,  1.12169795,
         1.22566763,  1.59572968,  1.81822601,  1.

In [67]:
#show the error value returned by matrix_factorization for this run
err

5.082339460798906

### now let's try this on a real dataset

In [67]:
#use the (reduced) ratings table as the matrix to factorize
#NOTE(review): every column of data is treated as ratings here - confirm that no id/index
#column (e.g. 'Unnamed: 0') is being carried along
R = np.array(data)
N = len(R)
M = len(R[0])
K = 2

#seed the generator so the random starting factors are reproducible on re-run
rng = np.random.default_rng(42)
P = rng.random((N, K))
Q = rng.random((M, K))

#matrix_factorization returns three values (P, Q, error); unpacking into only two names raises ValueError
nP, nQ, err = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)


  0% (3 of 1000) |                       | Elapsed Time: 0:00:42 ETA:   3:58:12

KeyboardInterrupt: 

In [16]:
#show the reconstructed matrix for the real dataset
#NOTE(review): the training cell above shows a KeyboardInterrupt and a later execution count,
#so this output may come from an earlier run - confirm by re-running in order
nR

array([[3.24874064, 3.25695921, 3.61321061, ..., 3.39596391, 3.1324354 ,
        3.58250416],
       [4.69952414, 3.36152106, 3.97813295, ..., 4.34261368, 3.88219232,
        4.57352225],
       [4.84465515, 3.55654022, 4.18535182, ..., 4.5152277 , 4.04593954,
        4.75589791],
       ...,
       [4.09858166, 1.65658958, 2.29001127, ..., 3.24901304, 2.77264732,
        3.41361772],
       [5.4155791 , 3.98025159, 4.68282211, ..., 5.04926629, 4.52494347,
        5.31843085],
       [3.93340544, 3.2824416 , 3.76335454, ..., 3.83263983, 3.47479189,
        4.03943043]])