In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [69]:
class CF:
    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1):
        self.uuCF = uuCF # uuCF=1: user-user, else item-item
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]] # rating matrix (not utility matrix)
                                                               # shape = (total no. of ratings, 3)
        self.k = k # no. of neighbor points, e.g: k = 2
        self.dist_func = dist_func # similarity function
        self.Ybar_data = None # Normalized utility matrix
        self.n_users = int(np.amax(self.Y_data[:, 0])) + 1 # e.g: n_users = 7
        self.n_items = int(np.amax(self.Y_data[:, 1])) + 1 # e.g: n_items = 5
    

    def add(self, new_data):
        """
        Update Y_data when new ratings come
        (suppose there is no new user or item)
        """

        self.Y_data = np.concatenate((self.Y_data, new_data), axis=0)
        

    def normalize_Y(self):
        users =self.Y_data[:, 0] # get users_id of all users
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,)) # means of ratings for every users

        for n in range(self.n_users):
            user_ids = np.where(users == n)[0]
            # item_ids = self.Y_data[user_ids, 1]
            ratings = self.Y_data[user_ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Ybar_data[user_ids, 2] = ratings - self.mu[n]

        self.Ybar = sparse.coo_matrix((self.Ybar_data[:,2], (self.Ybar_data[:, 1], self.Ybar_data[:, 0])),
                                      (self.n_items,self.n_users))
        self.Ybar = self.Ybar.tocsr()

        """
        E.g: Ybar_data = 
                    [[ 0.          0.          1.75      ]
                     [ 0.          1.          0.75      ]
                     [ 0.          3.         -1.25      ]
                     [ 0.          4.         -1.25      ]
                     [ 1.          0.          2.25      ]
                     [ 1.          2.          1.25      ]
                     [ 1.          3.         -0.75      ]
                     [ 1.          4.         -2.75      ]
                     [ 2.          0.         -0.5       ]
                     [ 2.          2.         -1.5       ]
                     [ 2.          3.          0.5       ]
                     [ 2.          4.          1.5       ]
                     [ 3.          0.         -1.33333333]
                     [ 3.          1.         -1.33333333]
                     [ 3.          3.          2.66666667]
                     [ 4.          0.         -1.5       ]
                     [ 4.          3.          1.5       ]
                     [ 5.          1.          0.5       ]
                     [ 5.          2.         -0.5       ]
                     [ 6.          2.         -2.33333333]
                     [ 6.          3.          0.66666667]
                     [ 6.          4.          1.66666667]]
                
             Ybar = [[ 1.75        2.25       -0.5        -1.33333333 -1.5         0.         0.        ]
                     [ 0.75        0.          0.         -1.33333333  0.          0.5        0.        ]
                     [ 0.          1.25       -1.5         0.          0.         -0.5       -2.33333333]
                     [-1.25       -0.75        0.5         2.66666667  1.5         0.         0.66666667]
                     [-1.25       -2.75        1.5         0.          0.          0.         1.66666667]]
        """
    
    
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T,self.Ybar.T) # similarity matrix

    
    def refresh(self):
        self.normalize_Y()
        self.similarity()
    

    def fit(self):
        self.refresh()

        
    def __pred(self, u, i,normalized=1): # e.g: u = 0, i = 2
        """
        Predict rating of user y for item i (normalized)
        """
        # Step 1: find all users who rated item i
        ids = np.where(self.Y_data[:, 1] == i)[0] # ids = [5 9 18 19]
        users_rated_i = (self.Y_data[ids, 0]).astype('int') # users_rated_i = [1 2 5 6]

        # Step 2: Find similarity between user u and other users
        sim = self.S[u, users_rated_i] # sim = [0.83307435 -0.5809475 0.20412415 -0.38133693]

        # Step 3: Find k most similar users to user u and get corresponding similarity values
        a = np.argsort(sim)[-self.k:] # a = [2 0]
        nearest_s = sim[a] # nearest_s = [0.20412415 0.83307435]

        # Get ratings from those users to item i
        r = self.Ybar[i, users_rated_i[a]] # r = [-0.5, 1.25]

        # Step 4: Calculate predicted rating
        if normalized:
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
        

    def pred(self, u, i, normalized=1):
        if self.uuCF:
            return self.__pred(u,i, normalized)
        return self.__pred(i, u, normalized)
    

    def recommend(self, u, normalized = 1): # e.g: u = 3
        """
        Determine all items should be recommended for user u (uuCF =1)
        or all users who might have interest on item u (uuCF = 0).
        The decision is made based on all i such that self.pred(u, i) > 0.
        Suppose we are considering items which have not been rated by u yet. 
        """

        # Get all items rated by user u
        ids = np.where(self.Y_data[:, 0] == u)[0] # ids = [12 13 14]
        items_rated_by_u = self.Y_data[ids, 1].tolist() # items_rated_by_u = [0 1 3]

        recommended_items = []

        for i in range(self.n_items):
            if i not in items_rated_by_u: # i = 2, 4
                ratings = self.__pred(u, i) # ratings = -1.8358682492385117, 1.5671736236903118
                if ratings > 0:
                    recommended_items.append(i)
                    
        return recommended_items # recommended_items = [4]

    
    def print_recommendation(self):
        print('Recommendation:')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('Recommend item(s):', recommended_items, 'to user', u)
            else:
                print('Recommend item', u, 'to user(s):', recommended_items)

# User-user CF

In [71]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep=' ', names=r_cols,
                     encoding='latin-1')
Y_data = ratings.values

rs = CF(Y_data, k=2,uuCF=1)
rs.fit()
rs.print_recommendation()

Recommendation:
Recommend item(s): [2] to user 0
Recommend item(s): [1] to user 1
Recommend item(s): [] to user 2
Recommend item(s): [4] to user 3
Recommend item(s): [4] to user 4
Recommend item(s): [0, 3, 4] to user 5
Recommend item(s): [1] to user 6


# Item-item CF

In [72]:
rs = CF(Y_data, k=2, uuCF=0)
rs.fit()
rs.print_recommendation()

Recommendation:
Recommend item 0 to user(s): []
Recommend item 1 to user(s): [1]
Recommend item 2 to user(s): [0]
Recommend item 3 to user(s): [5]
Recommend item 4 to user(s): [3, 4, 5]


# MovieLens 100k

In [74]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values # shape = (90570, 4)
rate_test = ratings_test.values # shape = (9430, 4)

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# User-user
rs = CF(rate_train, k=30, uuCF=1)
rs.fit()

n_test = rate_test.shape[0]
SE = 0
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized=0)
    SE += (pred - rate_test[n,2 ])**2
RMSE = np.sqrt(SE/n_test)

print("User-user CF: RMSE =", RMSE)

# Item-item
rs = CF(rate_train, k = 30, uuCF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print ("Item-item CF: RMSE =", RMSE)

User-user CF: RMSE = 0.9951981100882598
Item-item CF: RMSE = 0.9867912132705384
