In [1]:
from __future__ import (absolute_import, division, print_function, unicode_literals)

import os

import numpy as np
import pandas as pd

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

## Data Load - Movie lens 1M data

In [2]:
# 데이터가 없으면 Y해서 데이터 로드
data = Dataset.load_builtin('ml-1m')
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])

In [3]:
file_path = os.path.expanduser('~/.surprise_data/ml-1m/ml-1m/ratings.dat')
reader = Reader(line_format = 'user item rating timestamp', sep = '::')
data = Dataset.load_from_file(file_path, reader = reader)

In [4]:
algo = SVD()
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8718  0.8728  0.8723  0.8747  0.8762  0.8736  0.0017  
MAE (testset)     0.6839  0.6849  0.6853  0.6867  0.6883  0.6858  0.0015  
Fit time          57.85   59.03   60.90   70.65   56.04   60.89   5.13    
Test time         2.45    2.74    4.08    2.70    3.47    3.09    0.60    


{'test_rmse': array([0.87180998, 0.87277481, 0.87232613, 0.87472084, 0.87623166]),
 'test_mae': array([0.68391942, 0.68492126, 0.68533639, 0.68671832, 0.68826271]),
 'fit_time': (57.8542320728302,
  59.0256450176239,
  60.89537787437439,
  70.65435290336609,
  56.03543186187744),
 'test_time': (2.4530200958251953,
  2.7428760528564453,
  4.076212167739868,
  2.6992640495300293,
  3.4739699363708496)}

In [5]:
df = pd.DataFrame(data.raw_ratings, columns = ['uid', 'iid', 'rate', 'timestamp'])

In [6]:
df

Unnamed: 0,uid,iid,rate,timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968
3,1,3408,4.0,978300275
4,1,2355,5.0,978824291
5,1,1197,3.0,978302268
6,1,1287,5.0,978302039
7,1,2804,5.0,978300719
8,1,594,4.0,978302268
9,1,919,4.0,978301368


In [7]:
user = list(set(df.uid))
item = list(set(df.iid))
user_i = list(map(int, user))
item_i = list(map(int, item))

In [8]:
max(item_i)

3952

## Basic CF algorithm

In [9]:
from surprise import KNNBasic
trainset = data.build_full_trainset()
sim_options = {'name' : 'cosine', 'user_based' : True}

In [10]:
algo = KNNBasic(k = 40, min_k = 1, simoptions = sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1129ead68>

In [11]:
pred_rating_basic = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_basic[int(u)][int(iid)] = pred.est

In [12]:
pred_rating_basic

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.85606058, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.62587977, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## CF algorithm with mean

In [13]:
from surprise import KNNWithMeans

In [14]:
algo = KNNWithMeans(k = 40, min_k = 1, simoptions = sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1129eae10>

In [15]:
pred_rating_mean = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_mean[int(u)][int(iid)] = pred.est

In [16]:
pred_rating_mean

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 5.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 3.4776764, 0.       , ..., 0.       , 0.       ,
        0.       ]])

## CF algorithm with z-score

In [17]:
from surprise import KNNWithZScore

In [18]:
algo = KNNWithZScore(k = 40, min_k = 1, simoptions = sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x124b634e0>

In [19]:
pred_rating_z = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_z[int(u)][int(iid)] = pred.est

In [20]:
pred_rating_z

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.81213531, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.45778811, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## SVD

In [21]:
from surprise import SVD

In [22]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x124b59048>

In [23]:
pred_rating_svd = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_svd[int(u)][int(iid)] = pred.est

In [24]:
pred_rating_svd

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.72075285, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.05541747, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## PMF

In [25]:
from surprise import SVD

In [26]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0.02)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x124ae3128>

In [27]:
pred_rating_pmf = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_pmf[int(u)][int(iid)] = pred.est

In [28]:
pred_rating_pmf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.72957881, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.42034845, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## PMF with biased

In [29]:
from surprise import SVD

In [30]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = True, lr_all = 0.005, reg_all = 0.02)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x124ae38d0>

In [31]:
pred_rating_pmf_bi = np.zeros((max(user_i) + 1, max(item_i) + 1))
for u in user:
    iids = df[df.uid == u]
    for i in range(1, len(iids) + 1):
        iid = iids[i - 1:i].iid.values[0]
        r_ui = iids[i - 1:i].rate.values[0]
        pred = algo.predict(u, iid, r_ui, verbose = False)
        pred_rating_pmf_bi[int(u)][int(iid)] = pred.est

In [32]:
pred_rating_pmf_bi

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.49035194, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.64984629, 0.        , ..., 0.        , 0.        ,
        0.        ]])