In [4]:
import mariadb
import sys
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate 
from surprise.model_selection import GridSearchCV

In [5]:
# 데이터 불러오기 (데이터 프레임)
ratings = pd.read_csv('./ml-25m/ratings.csv')
links = pd.read_csv('./ml-25m/links.csv')
movies = pd.read_csv('./ml-25m/movies.csv')

In [6]:
ratings_combined = pd.merge(movies, ratings, on='movieId')
ratings_combined = pd.merge(links, ratings_combined, on='movieId')

In [7]:
ratings_combined

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,timestamp
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517
...,...,...,...,...,...,...,...,...
25000090,209157,6671244,499546.0,We (2018),Drama,119571,1.5,1574280748
25000091,209159,297986,63407.0,Window of the Soul (2001),Documentary,115835,3.0,1574280985
25000092,209163,6755366,553036.0,Bad Poems (2018),Comedy|Drama,6964,4.5,1574284913
25000093,209169,249603,162892.0,A Girl Thing (2001),(no genres listed),119571,3.0,1574291826


In [8]:
ratings_combined.drop(['movieId', 'imdbId', 'genres', 'timestamp'], axis=1, inplace=True)

In [9]:
ratings_combined

Unnamed: 0,tmdbId,title,userId,rating
0,862.0,Toy Story (1995),2,3.5
1,862.0,Toy Story (1995),3,4.0
2,862.0,Toy Story (1995),4,3.0
3,862.0,Toy Story (1995),5,4.0
4,862.0,Toy Story (1995),8,4.0
...,...,...,...,...
25000090,499546.0,We (2018),119571,1.5
25000091,63407.0,Window of the Soul (2001),115835,3.0
25000092,553036.0,Bad Poems (2018),6964,4.5
25000093,162892.0,A Girl Thing (2001),119571,3.0


In [11]:
# Reader 객체 생성
reader = Reader(rating_scale=(0.5, 5.0))

surprise_data = Dataset.load_from_df(ratings_combined[['userId', 'tmdbId', 'rating']], reader)

# surprise의 train_test_split() 사용. trainset : testset = 3 : 1
trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=0)

In [12]:
algo = SVD() # GridSearchCV를 이용한 최적 하이퍼 파라미터 적용 필요
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0xf41a60>

In [13]:
# 사용자 아이디(uid), 아이템 아이디(iid)는 문자열로 입력
uid = str(196)
iid = str(302)

# 추천 예측 평점 (.predict)
pred = algo.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=3.533832591887252, details={'was_impossible': False})

In [14]:
# 추천 예측 평점 (.test)
predictions = algo.test( testset )

print('prediction type :',type(predictions), ' size:',len(predictions))
print('prediction 결과의 최초 5개 추출')

predictions[:5]

prediction type : <class 'list'>  size: 6250024
prediction 결과의 최초 5개 추출


[Prediction(uid=145355, iid=91679.0, r_ui=4.5, est=3.4494802560974445, details={'was_impossible': False}),
 Prediction(uid=86805, iid=141.0, r_ui=3.0, est=3.7153861903789687, details={'was_impossible': False}),
 Prediction(uid=9739, iid=424.0, r_ui=3.0, est=4.093207445696001, details={'was_impossible': False}),
 Prediction(uid=144322, iid=27303.0, r_ui=2.5, est=2.880131131506351, details={'was_impossible': False}),
 Prediction(uid=53896, iid=117.0, r_ui=4.0, est=4.4155083007657625, details={'was_impossible': False})]

In [15]:
# 속성 확인
[ (pred.uid, pred.iid, pred.est, pred.details) for pred in predictions[:3] ]

[(145355, 91679.0, 3.4494802560974445, {'was_impossible': False}),
 (86805, 141.0, 3.7153861903789687, {'was_impossible': False}),
 (9739, 424.0, 4.093207445696001, {'was_impossible': False})]

In [16]:
# 성능 평가
accuracy.rmse(predictions)

RMSE: 0.7813


0.7812640101833067

In [None]:
# n_epochs: SGD 수행 시 반복 횟수, n_factors: 잠재 요인 크기
param_grid = {
    'n_epochs': [20, 40, 60], 
    'n_factors': [50, 100, 200]
}

# GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) # algo가 아닌 SVD 입력하였다.
gs.fit(surprise_data)

# 최적 하이퍼 파라미터 및 그 때의 최고 성능
print(gs.best_params['rmse'])
print(gs.best_score['rmse'])