In [1]:
from google.colab import drive
drive.mount('/Thesis', force_remount= True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /Thesis


In [2]:
cd "/Thesis/My Drive/Oh_Our_Thesis/recommender-system"

/Thesis/My Drive/Oh_Our_Thesis/recommender-system


In [0]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

from sklearn.decomposition import NMF


class MF(object):
    """docstring for CF"""
    def __init__(self, n_users, n_items, Y_data, K, lam = 0.1, Xinit = None, Winit = None, 
            learning_rate = 0.5, max_iter = 100, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        # regularization parameter
        self.lam = lam
        # learning rate for gradient descent
        self.learning_rate = learning_rate
        # maximum number of iterations
        self.max_iter = max_iter
        # print results after print_every iterations
        self.print_every = print_every
        # user-based or item-based
        self.user_based = user_based
        # number of users, items, and ratings. Remember to add 1 since id starts from 0

        # self.n_users = int(np.max(Y_data[:, 0])) + 1 
        # self.n_items = int(np.max(Y_data[:, 1])) + 1

        # (+1) is added before pass the param
        self.n_users = int(n_users) # +1
        self.n_items = int(n_items) # +1

        self.n_ratings = Y_data.shape[0]
        
        if Xinit is None: # new
            self.X = np.random.randn(self.n_items, K)
        else: # or from saved data
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: # from saved data
            self.W = Winit
            
        # normalized data, update later in normalized_Y function
        self.Y_data_n = self.Y_raw_data.copy()

    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users

        # if we want to normalize based on item, just switch first two columns of data
        else: # item base
            user_col = 1
            item_col = 0 
            n_objects = self.n_items

        users = (self.Y_raw_data[:, user_col] ).astype(np.int32)
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = (self.Y_data_n[ids, item_col]).astype(np.int32)
            # and the corresponding ratings 
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]

    def loss(self):
        L = 0 
        for i in range(self.n_ratings):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2

    # take average
        L /= self.n_ratings
        # regularization, don't ever forget this 
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 


    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user user_id, and the corresponding ratings
        """
        ids = np.where((self.Y_data_n[:,0]).astype(np.int32) == user_id)[0] 
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) # indices need to be integers
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)


    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding ratings
        """
        ids = np.where((self.Y_data_n[:,1]).astype(np.int32) == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)


    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            # gradient
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))

    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            # gradient
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
            
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print ('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)


        # model = NMF(n_components=10, init='random', random_state=0)
        # self.W = model.fit_transform(X)
        # self.H = model.components_

    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias 
        # truncate if results are out of range [0, 10]
        if pred < 1:
            return 1 
        if pred > 10: 
            return 10 
        return pred 


    def pred_for_user(self, user_id):
        """
        predict ratings one user give all unrated items
        """
        ids = np.where((self.Y_data_n[:, 0]).astype(np.int32) == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()              

        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []

        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))

        return predicted_ratings


    def pred_for_all_user(self):
        all_pre_rating = []

        for user_id in range (self.n_users):
            # predicted_ratings= []
            # ids = np.where((self.Y_data_n[:, 0]).astype(np.int32) == user_id)[0]
            # items_rated_by_u = self.Y_data_n[ids, 1].tolist()           
            y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id] 
            all_pre_rating.append(y_pred.tolist())

        return all_pre_rating 

    def pred_all_usePred(self):
        pred = []
        for i in range(self.n_users):
            tmp = []
            for j in range(self.n_items):
                tmp += [self.pred(i, j)]
            pred += [tmp]
        return pred

    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [0]:
tmp = pd.read_json('../Crawl_Data/user_rates_place-ver2.json')

# ratings_x = tmp[['User_Id','Place_Id','Rating','TimeStamp']]
ratings_x = tmp[['User_Id','Place_Id', 'Rating', 'Rating_Space',	'Rating_Location',	'Rating_Quality',	'Rating_Service',	'Rating_Price']]

userId = ratings_x.User_Id.unique()
userId.sort()
mapUserId  = {}
for i, value in enumerate(userId):
  mapUserId[value] = i

placeId = ratings_x.Place_Id.unique()
placeId.sort()
mapPlaceId  = {}
for i, value in enumerate(placeId):
  mapPlaceId[value] = i

tmp_based = ratings_x.values

for i in range(len(tmp_based[:,0])):
  tmp_based[i][0] = mapUserId[tmp_based [i][0]]
for i in range(len(tmp_based[:,1])):
  tmp_based[i][1] = mapPlaceId[tmp_based [i][1]]


# indices start from 0
# tmp_based[:, :2] -= 1
# rate_test[:, :2] -= 1

# tmp_based = tmp_based[:,[0,1]].astype(int)
# tmp_based4
tmp_based_copy = tmp_based

In [0]:
from sklearn.model_selection import train_test_split

tmp_based_copy = tmp_based_copy[tmp_based_copy[:,0].argsort()] # sort theo  user_id
based_train = []
based_test = []
_usersId = (tmp_based_copy[:, 0]).astype(np.int32)

for i in range((max(tmp_based_copy[:,0])).astype(int)):  
  _ids = np.where(_usersId == i)[0].astype(np.int32)
  if (len(_ids) > 4):
    X_train, X_test= train_test_split(tmp_based_copy[_ids], test_size=.2, random_state=42)
    based_train += X_train.tolist()
    based_test += X_test.tolist()
    
based_train = np.array(based_train)
based_test = np.array(based_test)
# X_test = np.array(X_test)

In [25]:
rs = MF(int(max(tmp_based_copy[:,0])), int(max(tmp_based_copy[:,1])), based_train, K = 10, lam = .5, print_every = 10, learning_rate = 0.75, max_iter = 50, user_based = 0)
rs.fit()

# evaluate on test data
RMSE = rs.evaluate_RMSE(based_test)
print ('\nUser-based MF, RMSE =', RMSE)
print ('\n\n')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


iter = 10 , loss = 1.440643391094046 , RMSE train = 0.936062860297831
iter = 20 , loss = 0.44722870584701246 , RMSE train = 0.9360668675068299
iter = 30 , loss = 0.43819352038188675 , RMSE train = 0.936066867889704
iter = 40 , loss = 0.4381113448330287 , RMSE train = 0.9360668678897408
iter = 50 , loss = 0.438110597440447 , RMSE train = 0.9360668678897408

User-based MF, RMSE = 1.831055750803039


In [97]:
M = int(max(tmp_based_copy[:,0])) + 1
N = int(max(tmp_based_copy[:,1])) + 1
pred_allUser = np.matrix(np.zeros((M,N)))
evaluate = np.matrix(np.zeros((M,N)))
for i in range(5):
    rs = MF(M, N, based_train[:,[0,1,i+3]], K = 50, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 350, user_based = 0)
    rs.fit()
    evaluate += np.matrix(rs.pred_all_usePred())
    pred_allUser += np.matrix(rs.pred_for_all_user())
    RMSE = rs.evaluate_RMSE(based_test[:,[0,1,i+3]])
    print ('\nUser-based MF, RMSE =', RMSE)
    print('\n')

evaluate /= 5
pred_allUser /= 5

pred_allUser = np.array(pred_allUser)
evaluate = np.array(evaluate)
# evaluate on test data
# RMSE = rs.evaluate_RMSE(based_test)
# print ('\nUser-based MF, RMSE =', RMSE)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


iter = 10 , loss = 24.096722517505555 , RMSE train = 1.6130697544813795
iter = 20 , loss = 11.003053654121855 , RMSE train = 1.0838444782373085
iter = 30 , loss = 5.33123505808014 , RMSE train = 1.0536885908819187
iter = 40 , loss = 2.7442525434766476 , RMSE train = 1.0526462219973591
iter = 50 , loss = 1.5584875721461993 , RMSE train = 1.052700268966099
iter = 60 , loss = 1.014723607387887 , RMSE train = 1.052731584755556
iter = 70 , loss = 0.7653566550482747 , RMSE train = 1.0527407123156154
iter = 80 , loss = 0.650998874691344 , RMSE train = 1.0527430937905302
iter = 90 , loss = 0.5985555479506109 , RMSE train = 1.0527436886880126
iter = 100 , loss = 0.574505641296033 , RMSE train = 1.052743833477713
iter = 110 , loss = 0.5634766451923835 , RMSE train = 1.0527438680705767
iter = 120 , loss = 0.5584188819909333 , RMSE train = 1.0527438762191095
iter = 130 , loss = 0.5560994523143941 , RMSE train = 1.0527438781170846
iter = 140 , loss = 0.5550357887201358 , RMSE train = 1.052743878555

In [98]:
# evalute 
SE = 0
for i in range(len(based_test)):
    SE += (pred_allUser[int(based_test[i][0])][int(based_test[i][1])] - based_test[i, 2])**2 

RMSE = np.sqrt(SE/len(based_test))
print ('\nUser-based MF-5models, RMSE =', RMSE)


User-based MF-5models, RMSE = 2.0254322911240505
