## 3. Collaborative Filtering (협업 필터링 : 사용자 리뷰 기반)

In [1]:
# Import the package itself so that its installed version string can be displayed.
import surprise
surprise.__version__

'1.1.1'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [4]:
# Read the ratings table from the CSV file, then preview its first five rows.
ratings = pd.read_csv('ratings_small.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Smallest value found in the 'rating' column.
ratings['rating'].min()

0.5

In [6]:
# Largest value found in the 'rating' column.
ratings['rating'].max()

5.0

In [7]:
# Match the reader's rating scale to the range of the ratings (0.5 up to 5).
reader = Reader(rating_scale=(0.5, 5))

In [8]:
# Hand Surprise only the user, item and rating columns (in that order),
# together with the reader that carries the rating scale.
data = Dataset.load_from_df(
    df=ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)
data

<surprise.dataset.DatasetAutoFolds at 0x19187daa610>

In [9]:
# SVD model created with a fixed random_state (0).
svd = SVD(random_state=0)

In [10]:
# Evaluate the SVD model with 5-fold cross-validation, reporting RMSE and MAE.
# A bare `cv=5` makes Surprise build a KFold that shuffles the ratings with an
# unseeded RNG, so the fold assignment (and every score) would change between
# runs even though the model itself is seeded. An explicitly seeded KFold keeps
# the splits, and therefore the reported metrics, repeatable.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8991  0.8897  0.9008  0.8966  0.9015  0.8976  0.0043  
MAE (testset)     0.6923  0.6842  0.6927  0.6901  0.6953  0.6909  0.0037  
Fit time          9.90    9.16    7.29    7.30    8.02    8.33    1.04    
Test time         0.21    0.27    0.17    0.29    0.19    0.23    0.04    


{'test_rmse': array([0.89909857, 0.88973499, 0.90084798, 0.896631  , 0.90148935]),
 'test_mae': array([0.69225358, 0.6841512 , 0.69265961, 0.69014311, 0.69525324]),
 'fit_time': (9.899417877197266,
  9.156165838241577,
  7.290391445159912,
  7.303405523300171,
  8.017727851867676),
 'test_time': (0.21367502212524414,
  0.26520252227783203,
  0.17473292350769043,
  0.29048800468444824,
  0.19388628005981445)}

In [11]:
# Build a training set from the whole dataset and fit the model on it.
trainset = data.build_full_trainset()
svd.fit(trainset=trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19187da53a0>

In [12]:
# Rating history of the user whose userId is 1.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [13]:
# predict(uid, iid) estimates the score the given user would give the given movie.
# r_ui in the returned Prediction is the actual rating; none is supplied here.
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [14]:
# User 1 actually rated movie 1029 with a 3: pass that true score as r_ui
# and see what score the model estimates.
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.8814455446761933, details={'was_impossible': False})

In [15]:
# Rating history of the user whose userId is 100.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [16]:
# Estimated score of user 100 for movie 1029.
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.7705476478414846, details={'was_impossible': False})