# 3. Collaborative Filtering (협업 필터링: 사용자 리뷰 기반)

In [1]:
# Display the installed scikit-surprise version so readers can see which
# release the results below were produced with.
import surprise
surprise.__version__

'1.1.3'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [3]:
# Load the user ratings table; the path is relative to the notebook's
# working directory.
ratings = pd.read_csv('ratings_small.csv')
# Preview the first rows to check the columns that were read.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Check the observed rating bounds; they determine the scale given to surprise's Reader.
rating_values = ratings['rating']
(rating_values.min(), rating_values.max())

(0.5, 5.0)

In [6]:
# surprise needs a Reader object that declares the rating scale as (min, max).
scale_bounds = (0.5, 5.0)
reader = Reader(rating_scale=scale_bounds)

# The DataFrame handed to load_from_df must contain exactly three columns,
# in this order: user, item, rating.
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader=reader)

In [7]:
# Matrix-factorization (SVD) recommender; random_state is fixed so that
# training is repeatable from run to run.
svd = SVD(random_state=0)

```
교차 검증 (K-Fold cross-validation)

예시: 100개 데이터를 5개 폴드(A~E)로 나누는 경우

A: 1-20
B: 21-40
C: 41-60
D: 61-80
E: 81-100

ABCD (train set) E (test set)
ABCE (train set) D (test set)
ABDE (train set) C (test set)
ACDE (train set) B (test set)
BCDE (train set) A (test set)
```

In [8]:
# Evaluate SVD with 5-fold cross-validation, reporting RMSE and MAE per fold
# plus fit/test times. The returned dict of per-fold scores is the cell output.
#
# An explicit KFold with a fixed random_state is passed instead of a bare
# cv=5: with an integer, surprise builds a KFold that shuffles using the
# unseeded global NumPy RNG, so the fold membership (and therefore every
# reported score) would change on each run even though the model is seeded.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8977  0.8940  0.9028  0.9006  0.8891  0.8968  0.0049  
MAE (testset)     0.6899  0.6899  0.6961  0.6942  0.6867  0.6914  0.0033  
Fit time          1.15    1.05    1.06    0.78    0.76    0.96    0.16    
Test time         0.11    0.22    0.10    0.08    0.08    0.11    0.05    


{'test_rmse': array([0.89773986, 0.8940433 , 0.90276657, 0.9006248 , 0.8890606 ]),
 'test_mae': array([0.68994686, 0.68991767, 0.69606438, 0.69423745, 0.6867167 ]),
 'fit_time': (1.1466021537780762,
  1.0493242740631104,
  1.0608766078948975,
  0.7802996635437012,
  0.7640466690063477),
 'test_time': (0.10663938522338867,
  0.21619272232055664,
  0.0950467586517334,
  0.07706999778747559,
  0.07937479019165039)}

In [9]:
# Build a trainset from the whole dataset (no held-out fold) and refit the
# model on it, so the predictions below use every available rating.
trainset = data.build_full_trainset()
# fit() returns the fitted algorithm object, which is what the cell displays.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a9d02b8f40>

In [10]:
# List the ratings submitted by user 1, for comparison with the model's predictions.
user1_mask = ratings['userId'].eq(1)
ratings.loc[user1_mask]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [11]:
# predict() takes the user id, the id of the movie to score, and optionally
# the true rating. In the returned Prediction, `est` is the predicted rating
# and `r_ui` is the true rating (None here because none is supplied).
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [12]:
# Score a movie that user 1 actually rated 3.0; passing the true rating as
# r_ui makes the Prediction show it next to the estimate.
svd.predict(uid=1, iid=1029, r_ui=3.0)

Prediction(uid=1, iid=1029, r_ui=3.0, est=2.8814455446761933, details={'was_impossible': False})