In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [137]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import ast
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func # đánh giá độ tương quan giữ 2 user
        self.Ybar_data = None # bản sao của ydata dùng để lưu ma trận nornalize
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 # số lượng User
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1 # số lượng Item
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,)) # lư

        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1]
            # and the corresponding ratings
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n] + 0.01
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))

        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
        print('Similarity matrix: ',self.S)
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        print('Y_data: ',self.Y_data)
        self.normalize_Y()
        self.similarity()


    def fit(self):
        self.refresh()


    def __pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32) # vi tri user danh gia item
        # Step 2:
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others
        # who already rated i

        if len(users_rated_i) != 0:
          sim = self.S[u, users_rated_i] # matran similarity
          sim = sim + 0.01
          # Step 4: find the k most similarity users
          a = np.argsort(sim)[-self.k:]
          c = users_rated_i[a]
          # and the corresponding similarity levels
          nearest_s = sim[a]
          #print('users_rated_i[a]',users_rated_i[a])
          # How did each of 'near' users rated item i
          r = self.Ybar[i, users_rated_i[a]]
          if normalized:
              # add a small number, for instance, 1e-8, to avoid dividing by 0
              Z = (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
              return Z,c
          return Z,c
        else:
          return 0,0

    def pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)


    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []
        print('Items rated by user {}:'.format(u), items_rated_by_u)

        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating,c = self.__pred(u, i)
                print('Predicted rating for user {} on item {}: {}'.format(u, i, rating))
                if rating > 0:
                    recommended_items.append([rating,i,c])
        sorted_list = sorted(recommended_items, reverse=True)
        four_lines = sorted_list[:20]
        if not four_lines:
            print('No recommendations available for user {}'.format(u))
        arr1 = []
        arr2 = []
        for i in four_lines:
          arr1.append(i[1])
          arr2.append(i[2])
        #four_lines_last_column = [row[-1] for row in four_lines]
        return arr1,arr2



In [138]:
import pandas as pd
ratings_base = pd.read_csv('D:\DATN\AI\comments.csv')
df_book = pd.read_csv('D:\DATN\AI\prepared_data_book.csv')

  ratings_base = pd.read_csv('D:\DATN\AI\comments.csv')
  df_book = pd.read_csv('D:\DATN\AI\prepared_data_book.csv')


In [139]:
ratings_base = ratings_base[['user_id', 'book_id', 'rating']]

In [140]:
df_sorted = ratings_base.sort_values(by='user_id')

In [141]:
df_train = df_sorted[0:2226]

In [142]:
df_sorted.shape

(2226, 3)

In [143]:
books_id = df_train['book_id'].unique()

In [144]:
df_train['user_id'].unique().shape

(173,)

In [145]:
df_book["product_id"] = books_id

In [146]:
output_file = 'prepared_data_book.csv'
df_book.to_csv(output_file, index=False)

In [147]:
output_file_rating = 'comments.csv'
df_train.to_csv(output_file_rating, index=False)

In [148]:
rate_train = df_train.values

In [149]:
rs = CF(rate_train, k =3, uuCF = 1)
rs.fit()

Y_data:  [[   1 1180    4]
 [   1 4893    3]
 [   1 6285    4]
 ...
 [ 173  253    4]
 [ 173  338    3]
 [ 173  268    3]]
Similarity matrix:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [151]:
u=3
book_id_rcm,user_id_same = rs.recommend(u)

Items rated by user 3: [9014, 9049]
Predicted rating for user 3 on item 0: 0
Predicted rating for user 3 on item 1: 0
Predicted rating for user 3 on item 2: 0
Predicted rating for user 3 on item 3: 0
Predicted rating for user 3 on item 4: 0
Predicted rating for user 3 on item 5: 0
Predicted rating for user 3 on item 6: 0
Predicted rating for user 3 on item 7: 0
Predicted rating for user 3 on item 8: 0
Predicted rating for user 3 on item 9: 0
Predicted rating for user 3 on item 10: 0
Predicted rating for user 3 on item 11: 0
Predicted rating for user 3 on item 12: 0
Predicted rating for user 3 on item 13: 0
Predicted rating for user 3 on item 14: 0
Predicted rating for user 3 on item 15: 0
Predicted rating for user 3 on item 16: 0
Predicted rating for user 3 on item 17: 0
Predicted rating for user 3 on item 18: 0
Predicted rating for user 3 on item 19: 0
Predicted rating for user 3 on item 20: 0
Predicted rating for user 3 on item 21: 0
Predicted rating for user 3 on item 22: 0
Predicte

In [153]:
book_id_rcm

[2009,
 1709,
 1332,
 8627,
 6944,
 8946,
 8615,
 7771,
 3921,
 9997,
 9966,
 9946,
 9909,
 9908,
 9886,
 9882,
 9826,
 9783,
 9773,
 9671]

In [154]:
import pickle
pickle.dump(rs,open('model.pkl','wb'))

In [155]:
model = pickle.load(open('model.pkl','rb'))
print(model.recommend(2))

Items rated by user 2: [8034, 8855, 9762]
Predicted rating for user 2 on item 0: 0
Predicted rating for user 2 on item 1: 0
Predicted rating for user 2 on item 2: 0
Predicted rating for user 2 on item 3: 0
Predicted rating for user 2 on item 4: 0
Predicted rating for user 2 on item 5: 0
Predicted rating for user 2 on item 6: 0
Predicted rating for user 2 on item 7: 0
Predicted rating for user 2 on item 8: 0
Predicted rating for user 2 on item 9: 0
Predicted rating for user 2 on item 10: 0
Predicted rating for user 2 on item 11: 0
Predicted rating for user 2 on item 12: 0
Predicted rating for user 2 on item 13: 0
Predicted rating for user 2 on item 14: 0
Predicted rating for user 2 on item 15: 0
Predicted rating for user 2 on item 16: 0
Predicted rating for user 2 on item 17: 0
Predicted rating for user 2 on item 18: 0
Predicted rating for user 2 on item 19: 0
Predicted rating for user 2 on item 20: 0
Predicted rating for user 2 on item 21: 0
Predicted rating for user 2 on item 22: 0
Pr