In [43]:
import numpy as np
import pandas as pd 

rating = pd.read_csv('rating.csv')
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [44]:
rating.loc[rating.rating == -1, "rating"] = 0
rating.isnull().mean()

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64

In [45]:
len(rating)

7813737

In [46]:
rating.shape[:]

(7813737, 3)

In [47]:
rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.332991
std,20997.95,8883.95,3.368955
min,1.0,1.0,0.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [51]:
rating = rating[rating.user_id <= 1000]

In [52]:
from sklearn.model_selection import train_test_split
df_train, df_valid = train_test_split(rating, test_size=0.3)

df_train.head()

Unnamed: 0,user_id,anime_id,rating
13245,166,30544,8
41177,446,1003,8
35762,392,28677,7
23197,274,14719,0
42438,455,982,4


In [53]:
rating.fillna(0, inplace = True)

In [55]:
len(df_train), len(df_valid)

(33166, 14215)

In [56]:
len(df_train)

33166

In [57]:
df_train = df_train.sort_values(by=['user_id', 'anime_id'])
df_train.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,0
2,1,79,0
3,1,226,0
4,1,241,0
5,1,355,0


In [58]:
df_train.iloc[:, :2] -= 1
df_valid.iloc[:, :2] -= 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,user_id,anime_id,rating
0,0,19,0
2,0,78,0
3,0,225,0
4,0,240,0
5,0,354,0


In [59]:
df_train.describe()

Unnamed: 0,user_id,anime_id,rating
count,33166.0,33166.0,33166.0
mean,267.001568,10906.374359,6.10354
std,144.611242,9104.86984,3.54651
min,0.0,0.0,0.0
25%,151.0,2592.0,5.0
50%,280.0,9723.0,7.0
75%,391.0,16781.0,9.0
max,499.0,34239.0,10.0


In [60]:
df_train = df_train.to_numpy()
df_valid = df_valid.to_numpy()
df_train


array([[    0,    19,     0],
       [    0,    78,     0],
       [    0,   225,     0],
       ...,
       [  499, 20506,     9],
       [  499, 20846,     9],
       [  499, 21602,     9]], dtype=int64)

In [88]:
class MF(object):
    def __init__(self, Y, K, X=None, W=None, lambda_=0.1, alpha=0.2, epochs=100):
        """
    
        :param Y: utility matrix
        :param K: X columns latent feature and W rows
        :param X: the latent feature (iterm) matrix
        :param W: the users matrix
        :param lambda_: regularization param avoid situation overfit (default 0.1)
        :param alpha: learning rate
        :param epochs: number of training loop (default 100)
        """
        self.__Y = Y
        # normalized data, update later in normalized_Y function
        self.__Y = self.__Y.copy()
        self.__K = K

        self.__lambda_ = lambda_
        self.__alpha = alpha
        self.__epochs = epochs

        # number of users, items, and ratings
        self.__users_count = int(np.max(Y[:, 0])) + 1
        self.__items_count = int(np.max(Y[:, 1])) + 1
        self.__ratings_count = Y.shape[0]
        self.__mu = np.zeros(self.__users_count)

        # random value iterm feature and weight matrix
        self.__X = np.random.randn(self.__items_count, K)
        self.__W = np.random.randn(K, self.__users_count)


    def normalized(self):
        """
        this method is used to normalized ratings
        :return:
        """
        mu = np.zeros(self.__users_count)

        for i in range(self.__users_count):
            indices_user_i = np.where(self.__Y[:, 0] == i)[0].astype(np.int32)
            ratings = []
            for j in indices_user_i:
                if float(self.__Y[j, 2]) != float(0):
                    ratings.append(self.__Y[j, 2])
            if len(ratings):
                _mean = np.mean(ratings)
            else:
                _mean = 0
            mu[i] = _mean
            # normalized
            if _mean != 0:
                for j in indices_user_i:
                    if float(self.__Y[j, 2]) != float(0):
                        self.__Y[j, 2] -= mu[i]
        self.__mu = mu

    def cost_function(self):
        """
        this method is used to calculate the cost function
        :return: cost function J
        """
        J = 0
        for i in range(self.__ratings_count):
            user = int(self.__Y[i, 0])
            item = int(self.__Y[i, 1])
            rate = self.__Y[i, 2]
            J += (1 / (2 * self.__ratings_count)) * np.square(rate - self.__X[item, :].dot(self.__W[:, user]))
        # regularized
        J += (self.__lambda_ / 2) * (
                np.linalg.norm(self.__X, ord="fro") + np.linalg.norm(self.__W, ord="fro"))  # Frobenius Norm
        return J

    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user user_id and get the corresponding rates
        :param user_id: id of target user
        :return: array of item ids and ratings
        """
        indices_user = np.where(self.__Y[:, 0] == user_id)[0].astype(np.int32)
        item_ids = self.__Y[indices_user, 1].astype(np.int32)
        ratings = self.__Y[indices_user, 2].astype(np.float32)
        return item_ids, ratings

    def get_users_rating_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding rates
        :param item_id: id of item that need to find users who rated it
        :return: array of user ids and ratings
        """
        indices_item = np.where(self.__Y[:, 1] == item_id)[0].astype(np.int32)
        user_ids = self.__Y[indices_item, 0].astype(np.int32)
        ratings = self.__Y[indices_item, 2].astype(np.float32)
        return user_ids, ratings

    def update_x(self):
        """
        update rows of X matrix
        :return:
        """
        for i in range(self.__items_count):
            user_ids, ratings = self.get_users_rating_item(i)
            Wi = self.__W[:, user_ids]
            self.__X[i, :] = self.__X[i, :] - self.__alpha * (
                        -(1 / self.__ratings_count) * ((ratings - np.dot(self.__X[i, :], Wi)).dot(Wi.T))
                        + (self.__lambda_ * self.__X[i, :])).reshape((-1, self.__K))

    def update_w(self):
        """
        update columns of W matrix
        :return:
        """
        for i in range(self.__users_count):
            item_ids, ratings = self.get_items_rated_by_user(i)
            Xi = self.__X[item_ids, :]
            self.__W[:, i] = self.__W[:, i] - self.__alpha * (
                        -(1 / self.__ratings_count) * Xi.T.dot(ratings - Xi.dot(self.__W[:, i]))
                        + self.__lambda_ * self.__W[:, i]).reshape((self.__K, ))

    def matrix_factorization(self):
        """
        implementation of matrix factorization algo
        :return:
        """
        self.normalized()
        for i in range(self.__epochs):
            self.update_x()
            self.update_w()
            mse_train = self.mse_evaluate(self.__Y)
            print("epoch:", i + 1, "cost:", self.cost_function(), "mse:", mse_train)

    def predict(self, user_id, item_id):
        """
        this method is used to make prediction about rating for item item_id of user user_id
        :param user_id: id of user target
        :param item_id: id of item target
        :return: prediction
        """
        pred_result = self.__X[item_id, :].dot(self.__W[:, user_id]) + self.__mu[user_id]
        # truncate if results are out of range [0, 10]
        if pred_result < 0:
            return 0
        elif pred_result > 10:
            return 10
        return pred_result

    def mse_evaluate(self, testing_set):
        """
        this method is used to evaluate the accuracy of our model using MSE
        :param testing_set: our dataset for testing
        :return: MSE
        """
        number_of_test = testing_set.shape[0]
        square_error = 0
        for i in range(number_of_test):
            prediction = self.predict(testing_set[i, 0], testing_set[i, 1])
            square_error += np.square(prediction - testing_set[i, 2])
        mean_square_error = square_error / number_of_test
        return mean_square_error

    def recommend(self, user_id):
        """
        Determine all items should be recommended for user u
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        indices_of_user = np.where(self.__Y[:, 0] == user_id)[0]
        items_rated_by_user = self.__Y[indices_of_user, 1].tolist()
        recommended_items = []
        for i in range(self.__items_count):
            if i not in items_rated_by_user:
                rating = self.predict(user_id, i)
                if rating > 9:
                    recommended_items.append(i)
        return recommended_items

    def print_recommendation(self, user_id):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        recommended_items = self.recommend(user_id)
        print('Recommend item(s):', recommended_items, 'to user', user_id)

In [77]:
result_train = MF(df_train, K = 10, epochs=20)
result_train.matrix_factorization()

test_evaluate = result_train.mse_evaluate(df_valid)
print("Test MSE:", test_evaluate)

epoch: 1 cost: 36.948600805895616 mse: 58.20612169258698
epoch: 2 cost: 35.946908804537394 mse: 58.358908673272865
epoch: 3 cost: 34.98741596295601 mse: 58.50708657870324
epoch: 4 cost: 34.0674186800968 mse: 58.64973408289202
epoch: 5 cost: 33.18442890311204 mse: 58.78645080755899
epoch: 6 cost: 32.33615514120839 mse: 58.91766639033367
epoch: 7 cost: 31.520485271955668 mse: 59.04161086841274
epoch: 8 cost: 30.735470958579473 mse: 59.15873583909506
epoch: 9 cost: 29.979313516585215 mse: 59.2685423937862
epoch: 10 cost: 29.250351085538767 mse: 59.36963305240378
epoch: 11 cost: 28.54704697726602 mse: 59.460949443905314
epoch: 12 cost: 27.86797908537954 mse: 59.54269906340671
epoch: 13 cost: 27.211830253129097 mse: 59.61538244565062
epoch: 14 cost: 26.577379507287727 mse: 59.68079029870133
epoch: 15 cost: 25.96349407530134 mse: 59.73854018686919
epoch: 16 cost: 25.369122111388002 mse: 59.7894623052833
epoch: 17 cost: 24.79328606480236 mse: 59.832757561894354
epoch: 18 cost: 24.235076630190

In [92]:
RMSE = result_train.mse_evaluate(df_valid)
print(RMSE)

12.945038873284396


In [90]:
test_user = result_train.recommend(232)
len(test_user)

33302