In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as la

In [2]:
np.set_printoptions(legacy='1.25')

In [3]:
def loadData():
    M =    [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
           [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
           [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
           [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
           [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
           [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
           [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
           [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
           [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
           [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
           [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
           [2, 1, 0, 2, 0, 5, 3, 0, 1, 0, 1]]
    # return(np.asmatrix(M))
    return(np.array(M))

In [4]:
# Note: These functions assume that the input vectors are Numpy arrays. 

def euclidSim(inA,inB):
    return 1.0/(1.0 + la.norm(inA - inB))

def pearsonSim(inA,inB):
    if len(inA) < 3 : return 1.0
    return 0.5+0.5*np.corrcoef(inA, inB, rowvar = 0)[0][1]

def cosineSim(inA,inB):
    num = np.dot(A, B)
    num = num.astype(float)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5 + 0.5 * (num / denom)

In [5]:
# First let's create two fake user profiles to test our similarity functions

A = np.array([2,3,0,1,0,4,-5])
B = np.array([0,1,2,-4,2,0,3])

# Coverting these Numpy arrays into the Numpy Matrix type

# A = np.asmatrix(A)  # Using np.asmatrix instead of np.mat (depricated)
# B = np.asmatrix(B) 

In [6]:
print(euclidSim(A,B))

0.08333333333333333


In [7]:
print(cosineSim(A,B))

0.3150010839748479


In [8]:
print(pearsonSim(A,B))

0.2665380020120951


In [9]:
def standEst(dataMat, user, simMeas, item):
    # dataMat is assumed to be 2d Numpy array, e.g., representing a user-item rating matrix
    # user is the index of a single user (a row) in the dataMat
    # item is the index of a single item (a colums) in the dataMat
    
    n = np.shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0: 
            continue
        overLap = np.nonzero(np.logical_and(dataMat[:,item]>0, dataMat[:,j]>0))[0]
        if len(overLap) == 0: 
            similarity = 0
        else: 
            similarity = simMeas(dataMat[overLap,item], dataMat[overLap,j])
        #print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal

In [10]:
def svdEst(dataMat, user, simMeas, item):
    # dataMat is assumed to be 2d Numpy array, e.g., representing a user-item rating matrix
    # user is the index of a single user (a row) in the dataMat
    # item is the index of a single item (a colums) in the dataMat

    n = np.shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    # The SVD computation below requires the data to be of Numpy Matrix type
    data=np.asmatrix(dataMat)
    U,Sigma,VT = la.svd(data)
    Sig4 = np.asmatrix(np.eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix
    xformedItems = data.T * U[:,:4] * Sig4.I  #create transformed items (* here is matrix multiplication)
    for j in range(n):
        userRating = data[user,j]
        if userRating == 0 or j==item: continue
        similarity = simMeas(xformedItems[item,:].T, xformedItems[j,:].T)
        #print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal

In [11]:
def recommend(dataMat, user, N=3, simMeas=pearsonSim, estMethod=standEst):
    unratedItems = np.nonzero(dataMat[user,:]==0)[0] #find unrated items 
    if len(unratedItems) == 0: return 'you rated everything'
    itemScores = []
    for item in unratedItems:
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]

In [13]:
data = loadData()
print(data)

[[0 0 0 0 0 4 0 0 0 0 5]
 [0 0 0 3 0 4 0 0 0 0 3]
 [0 0 0 0 4 0 0 1 0 4 0]
 [3 3 4 0 0 0 0 2 2 0 0]
 [5 4 5 0 0 0 0 5 5 0 0]
 [0 0 0 0 5 0 1 0 0 5 0]
 [4 3 4 0 0 0 0 5 5 0 1]
 [0 0 0 4 0 4 0 0 0 0 4]
 [0 0 0 2 0 2 5 0 0 1 2]
 [0 0 0 0 5 0 0 0 0 4 0]
 [1 0 0 0 0 0 0 1 2 0 0]
 [2 1 0 2 0 5 3 0 1 0 1]]


In [14]:
dataMat=np.asmatrix(data)
U,Sigma,VT = la.svd(dataMat)
Sig4 = np.asmatrix(np.eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix
xItems = data.T * U[:,:4] * Sig4.I  #create transformed items
print(xItems)

[[-0.45889187  0.03170418 -0.01809311  0.11036907]
 [-0.3622062   0.04692163 -0.01141864  0.04254964]
 [-0.45537578  0.10423397 -0.00800224 -0.05403528]
 [-0.051868   -0.39701598 -0.05950012  0.06753374]
 [-0.01726089 -0.08392364  0.71965471 -0.13098077]
 [-0.09964753 -0.67126432 -0.11207725 -0.04038616]
 [-0.04619366 -0.25745027  0.05860349  0.87744841]
 [-0.45397947  0.09523267  0.03757744 -0.09430203]
 [-0.46909953  0.0672883  -0.0131357   0.00911101]
 [-0.01955354 -0.10798751  0.67233514  0.01344801]
 [-0.09629148 -0.52832652 -0.09176174 -0.42505074]]


In [15]:
print(VT[:4,:].T)

[[-0.45889187  0.03170418 -0.01809311  0.11036907]
 [-0.3622062   0.04692163 -0.01141864  0.04254964]
 [-0.45537578  0.10423397 -0.00800224 -0.05403528]
 [-0.051868   -0.39701598 -0.05950012  0.06753374]
 [-0.01726089 -0.08392364  0.71965471 -0.13098077]
 [-0.09964753 -0.67126432 -0.11207725 -0.04038616]
 [-0.04619366 -0.25745027  0.05860349  0.87744841]
 [-0.45397947  0.09523267  0.03757744 -0.09430203]
 [-0.46909953  0.0672883  -0.0131357   0.00911101]
 [-0.01955354 -0.10798751  0.67233514  0.01344801]
 [-0.09629148 -0.52832652 -0.09176174 -0.42505074]]


In [16]:
print(Sig4)

[[15.92756836  0.          0.          0.        ]
 [ 0.         12.46154916  0.          0.        ]
 [ 0.          0.         11.11778237  0.        ]
 [ 0.          0.          0.          5.39644238]]


In [17]:
print(Sig4.I)

[[0.06278422 0.         0.         0.        ]
 [0.         0.08024684 0.         0.        ]
 [0.         0.         0.089946   0.        ]
 [0.         0.         0.         0.18530727]]


In [18]:
user = 4
recommendations = recommend(data, user, N=4, simMeas=cosineSim, estMethod=standEst)
print(recommendations)

[(4, 5.0), (9, 5.0), (10, 4.7551180370248725), (3, 4.588235294117647)]


In [19]:
print("Recommended Items for User", user, ":\n")
for i, p in recommendations:
    print("Item ", i, "with predicted rating: ", p, "\n")


Recommended Items for User 4 :

Item  4 with predicted rating:  5.0 

Item  9 with predicted rating:  5.0 

Item  10 with predicted rating:  4.7551180370248725 

Item  3 with predicted rating:  4.588235294117647 



In [20]:
user = 4
recommendations = recommend(data, user, N=4, simMeas=pearsonSim, estMethod=standEst)
print(recommendations)

[(4, 5.0), (9, 5.0), (10, 4.8), (3, 4.666666666666667)]


In [21]:
print("Recommended Items for User", user, ":\n")
for i, p in recommendations:
    print("Item ", i, "with predicted rating: ", p, "\n")

Recommended Items for User 4 :

Item  4 with predicted rating:  5.0 

Item  9 with predicted rating:  5.0 

Item  10 with predicted rating:  4.8 

Item  3 with predicted rating:  4.666666666666667 



In [22]:
user = 4
recommendations = recommend(data, user, N=4, simMeas=cosineSim, estMethod=svdEst)
print(recommendations)

[(3, 4.75618061775972), (9, 4.755842011596403), (5, 4.755832399185177), (10, 4.755829869022552)]


In [23]:
print("Recommended Items for User", user, ":\n")
for i, p in recommendations:
    print("Item ", i, "with predicted rating: ", p, "\n")

Recommended Items for User 4 :

Item  3 with predicted rating:  4.75618061775972 

Item  9 with predicted rating:  4.755842011596403 

Item  5 with predicted rating:  4.755832399185177 

Item  10 with predicted rating:  4.755829869022552 



In [24]:
user = 4
item = 4
r = standEst(data, user, pearsonSim, item)
print(r)

5.0


In [25]:
user = 3
item = 5
r = standEst(data, user, cosineSim, item)
print(r)

2.56


In [26]:
def cross_validate_user(dataMat, user, test_ratio, estMethod=standEst, simMeas=pearsonSim):
	dataMat = np.array(dataMat)
	number_of_items = np.shape(dataMat)[1]
	rated_items_by_user = np.array([i for i in range(number_of_items) if dataMat[user,i]>0])
	test_size = int(test_ratio * len(rated_items_by_user))
	test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
	withheld_items = rated_items_by_user[test_indices]
	original_user_profile = np.copy(dataMat[user])
	dataMat[user, withheld_items] = 0 # So that the withheld test items is not used in the rating estimation below
	error_u = 0.0
	count_u = len(withheld_items)

	# Compute absolute error for user u over all test items
	for item in withheld_items:
		# Estimate rating on the withheld item
		estimatedScore = estMethod(dataMat, user, simMeas, item)
		error_u = error_u + abs(estimatedScore - original_user_profile[item])	
	
	# Now restore ratings of the withheld items to the user profile
	for item in withheld_items:
		dataMat[user, item] = original_user_profile[item]
		
	# Return sum of absolute errors and the count of test cases for this user
	# Note that these will have to be accumulated for each user to compute MAE
	return error_u, count_u

In [31]:
user = 6
error, count = cross_validate_user(data[:100,:], user, 0.3, estMethod=standEst, simMeas=cosineSim)
print(error, count)

3.0 1


In [30]:
user = 6
error, count = cross_validate_user(data[:100,:], user, 0.3, estMethod=standEst, simMeas=cosineSim)
print(error, count)

4.429704871318257 1


In [36]:
user = 6
error, count = cross_validate_user(data[:100,:], user, 0.3, estMethod=standEst)
print(error, count)

0.7852300520543158 1
