In [14]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class CF(object):
    """Neighbourhood-based collaborative filtering on rating triples.

    The model is built from an array whose rows are ``(user_id, item_id,
    rating)``.  Ids are expected to be 0-based because the number of users and
    items is derived as ``max(id) + 1``.

    With ``uuCF`` truthy the neighbours are users (user-user CF).  With
    ``uuCF`` falsy the first two columns are swapped at construction time, so
    internally column 0 always holds the entity whose neighbours are searched
    and the same code performs item-item CF.

    Attributes set by ``fit()`` / ``refresh()``:
        mu:        per-"user" mean rating, shape ``(n_users,)``.
        Ybar_data: copy of ``Y_data`` with the mean subtracted from column 2.
        Ybar:      CSR matrix of the centred ratings, shape
                   ``(n_items, n_users)``.
        S:         similarity matrix returned by ``dist_func``.
    """
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        """
        Y_data:    2-D array with columns (user_id, item_id, rating).
        k:         number of nearest neighbours used for one prediction.
        dist_func: callable ``f(A, B)`` returning pairwise similarities between
                   the rows of A and the rows of B.
        uuCF:      truthy for user-user CF, falsy for item-item CF.
        """
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # For item-item CF swap the id columns so that the rest of the class
        # can always treat column 0 as "user" and column 1 as "item".
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k # number of neighbor points
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item
        (n_users / n_items are not recomputed here).

        NOTE(review): rows are appended as-is.  In item-item mode the stored
        matrix has its first two columns swapped, so new_data presumably has
        to be supplied in that same internal order -- confirm with callers.
        Call refresh() afterwards to rebuild the normalized data and S.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        """
        Mean-centre the ratings of every user and build the sparse matrix.

        Sets self.mu, self.Ybar_data and self.Ybar.  Users without any rating
        get a mean of 0.
        """
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        # The centred ratings are fractional, so the working copy must be a
        # float array; an integer Y_data would otherwise truncate them when
        # they are written back into column 2.
        if np.issubdtype(self.Y_data.dtype, np.floating):
            self.Ybar_data = self.Y_data.copy()
        else:
            self.Ybar_data = self.Y_data.astype(np.float64)
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of ratings done by user n
            ids = np.where(users == n)[0]
            if ids.size == 0:
                # no ratings: keep mu[n] = 0 and avoid np.mean on an empty
                # slice (which warns and yields nan)
                continue
            # the corresponding ratings
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # nan ratings in the input: fall back to 0
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        # Ids live in a float array, so cast them to integers before using
        # them as row / column indices.
        item_idx = self.Ybar_data[:, 1].astype(np.int64)
        user_idx = self.Ybar_data[:, 0].astype(np.int64)
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (item_idx, user_idx)),
            (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        """
        Compute the user-user similarity matrix self.S by applying dist_func
        to the transposed rating matrix (one row per user).
        """
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Build the model: normalize the ratings and compute similarities."""
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i, both given in the internal
        (possibly swapped) orientation.

        If normalized is truthy the mean-centred prediction is returned;
        otherwise the mean rating of u (self.mu[u]) is added back.
        """
        # Step 1: find all rows that rate item i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        # Step 2: the users who produced those ratings
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        if users_rated_i.size == 0:
            # Nobody rated i: there are no neighbours, so the centred
            # prediction is 0 (same value the weighted sum would give).
            score = 0.0
        else:
            # Step 3: find similarity btw the current user and others
            # who already rated i
            sim = self.S[u, users_rated_i]
            # Step 4: find the k most similar users
            a = np.argsort(sim)[-self.k:]
            # and the corresponding similarity levels
            nearest_s = sim[a]
            # How did each of the 'near' users rate item i (1 x k sparse row)
            r = self.Ybar[i, users_rated_i[a]]
            # similarity-weighted average of the neighbours' centred ratings;
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            score = r.dot(nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        if normalized:
            return score

        return score + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        u and i are always given as (user, item), whatever the CF mode is.
        If normalized is truthy the mean-centred prediction is returned; pass
        normalized = 0 to get the prediction on the original rating scale.
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        # item-item mode: the internal orientation is swapped
        return self.__pred(i, u, normalized)

    def recommend(self, u, normalized = 1):
        """
        Determine all items should be recommended for user u. (uuCF =1)
        or all users who might have interest on item u (uuCF = 0)
        The decision is made based on all i such that:
        the normalized prediction for (u, i) is > 0. Suppose we are
        considering items which have not been rated by u yet.

        NOTE(review): the normalized argument is accepted but not used; the
        decision is always taken on the mean-centred prediction.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # a set gives constant-time membership tests inside the loop
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        recommended_items = []
        for i in range(self.n_items):
            if i in items_rated_by_u:
                continue
            rating = self.__pred(u, i)
            if rating > 0:
                recommended_items.append(i)

        return recommended_items

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('    Recommend item(s):', recommended_items, 'to user', u)
            else:
                print ('    Recommend item', u, 'to user(s) : ', recommended_items)

In [15]:
# Load the toy ratings file: space-separated, one (user, item, rating) per line.
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv(
    'ex.dat',
    sep = ' ',
    names = r_cols,
    encoding='latin-1',
)
# The model consumes a plain 2-D array with the three columns above.
Y_data = ratings.to_numpy()

# User-user CF using the 2 most similar neighbours for each prediction.
rs = CF(Y_data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

Recommendation: 
    Recommend item(s): [2] to user 0
    Recommend item(s): [1] to user 1
    Recommend item(s): [] to user 2
    Recommend item(s): [4] to user 3
    Recommend item(s): [4] to user 4
    Recommend item(s): [0, 3, 4] to user 5
    Recommend item(s): [1] to user 6


In [16]:
# Display the ratings table read from 'ex.dat' (columns: user_id, item_id, rating).
ratings

Unnamed: 0,user_id,item_id,rating
0,0,0,5.0
1,0,1,4.0
2,0,3,2.0
3,0,4,2.0
4,1,0,5.0
5,1,2,4.0
6,1,3,2.0
7,1,4,0.0
8,2,0,2.0
9,2,2,1.0


In [17]:
# Rating triples after normalize_Y(): column 2 holds rating minus that user's mean.
rs.Ybar_data

array([[ 0.        ,  0.        ,  1.75      ],
       [ 0.        ,  1.        ,  0.75      ],
       [ 0.        ,  3.        , -1.25      ],
       [ 0.        ,  4.        , -1.25      ],
       [ 1.        ,  0.        ,  2.25      ],
       [ 1.        ,  2.        ,  1.25      ],
       [ 1.        ,  3.        , -0.75      ],
       [ 1.        ,  4.        , -2.75      ],
       [ 2.        ,  0.        , -0.5       ],
       [ 2.        ,  2.        , -1.5       ],
       [ 2.        ,  3.        ,  0.5       ],
       [ 2.        ,  4.        ,  1.5       ],
       [ 3.        ,  0.        , -1.33333333],
       [ 3.        ,  1.        , -1.33333333],
       [ 3.        ,  3.        ,  2.66666667],
       [ 4.        ,  0.        , -1.5       ],
       [ 4.        ,  3.        ,  1.5       ],
       [ 5.        ,  1.        ,  0.5       ],
       [ 5.        ,  2.        , -0.5       ],
       [ 6.        ,  2.        , -2.33333333],
       [ 6.        ,  3.        ,  0.666

In [18]:
# The un-normalized rating triples stored on the model, for comparison with Ybar_data.
rs.Y_data

array([[0., 0., 5.],
       [0., 1., 4.],
       [0., 3., 2.],
       [0., 4., 2.],
       [1., 0., 5.],
       [1., 2., 4.],
       [1., 3., 2.],
       [1., 4., 0.],
       [2., 0., 2.],
       [2., 2., 1.],
       [2., 3., 3.],
       [2., 4., 4.],
       [3., 0., 0.],
       [3., 1., 0.],
       [3., 3., 4.],
       [4., 0., 1.],
       [4., 3., 4.],
       [5., 1., 2.],
       [5., 2., 1.],
       [6., 2., 1.],
       [6., 3., 4.],
       [6., 4., 5.]])

In [4]:
# The sparse rating matrix is built with shape (n_items, n_users).
rs.Ybar.shape

(5, 7)

In [5]:
# Check the storage format: normalize_Y() converts the COO matrix to CSR.
type ( rs.Ybar)

scipy.sparse._csr.csr_matrix

In [6]:
# Densify the sparse rating matrix for inspection (rows = items, columns = users).
# NOTE(review): the saved output looks like raw, un-centred ratings and this cell's
# execution count is lower than the class cell's -- presumably stale; re-run to confirm.
rs.Ybar.toarray()

array([[5., 5., 2., 0., 1., 0., 0.],
       [4., 0., 0., 0., 0., 2., 0.],
       [0., 4., 1., 0., 0., 1., 1.],
       [2., 2., 3., 4., 4., 0., 4.],
       [2., 0., 4., 0., 0., 0., 5.]])

In [7]:
# Similarity matrix computed by similarity(): dist_func applied to the columns of Ybar.
# NOTE(review): this cell's execution count is lower than the class cell's, so the
# saved output may come from an earlier kernel state -- re-run to confirm.
rs.S

array([[1.        , 0.61758068, 0.62596864, 0.28571429, 0.4504233 ,
        0.51110125, 0.39678004],
       [0.61758068, 1.        , 0.54433105, 0.2981424 , 0.47001599,
        0.26666667, 0.27602622],
       [0.62596864, 0.54433105, 1.        , 0.54772256, 0.61993042,
        0.08164966, 0.92966968],
       [0.28571429, 0.2981424 , 0.54772256, 1.        , 0.9701425 ,
        0.        , 0.6172134 ],
       [0.4504233 , 0.47001599, 0.61993042, 0.9701425 , 1.        ,
        0.        , 0.59878495],
       [0.51110125, 0.26666667, 0.08164966, 0.        , 0.        ,
        1.        , 0.06900656],
       [0.39678004, 0.27602622, 0.92966968, 0.6172134 , 0.59878495,
        0.06900656, 1.        ]])

In [8]:
# Copy the dense rating matrix to the system clipboard (side effect only, no output).
# NOTE(review): presumably needs an OS clipboard backend, so it may fail on headless runs.
pd.DataFrame(rs.Ybar.toarray()).to_clipboard()

In [9]:
# Copy the similarity matrix to the system clipboard (side effect only, no output).
# NOTE(review): presumably needs an OS clipboard backend, so it may fail on headless runs.
pd.DataFrame(rs.S).to_clipboard()