In [2]:
!pip install scikit-surprise



In [1]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25,random_state = 0)

In [3]:
data.raw_ratings[:10]

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2492e5eefa0>

In [5]:
predictions = algo.test(testset)
print(type(predictions), len(predictions))
predictions[:5]

<class 'list'> 25000


[Prediction(uid='120', iid='282', r_ui=4.0, est=3.6049062275230015, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.6501951173755085, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=3.9317957780623307, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.769307033790527, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.2549098488133796, details={'was_impossible': False})]

In [6]:
[(pred.uid, pred.iid, pred.est) for pred in predictions[:3]]

[('120', '282', 3.6049062275230015),
 ('882', '291', 3.6501951173755085),
 ('535', '507', 3.9317957780623307)]

In [7]:
uid = str(196)
iid = str(302)
pred = algo.predict(uid,iid)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.20   {'was_impossible': False}


In [8]:
accuracy.rmse(predictions)

RMSE: 0.9459


0.9458933819943701

In [9]:
import pandas as pd

ratings = pd.read_csv('./dataset/ml-latest-small/ratings.csv')
ratings.to_csv('./dataset/ml-latest-small/ratings_noh.csv',index=False,header=False)

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
ratings = pd.read_csv('./dataset/ml-latest-small/ratings_noh.csv')

In [11]:
ratings

Unnamed: 0,1,1.1,4.0,964982703
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400
...,...,...,...,...
100830,610,166534,4.0,1493848402
100831,610,168248,5.0,1493850091
100832,610,168250,5.0,1494273047
100833,610,168252,5.0,1493846352


In [12]:
from surprise import Reader

reader = Reader(line_format='user item rating timestamp',sep = ',', rating_scale=(0.5,5))
data = Dataset.load_from_file('./dataset/ml-latest-small/ratings_noh.csv',reader=reader)

In [13]:
trainset, testset = train_test_split(data, test_size=.25,random_state = 0)

algo = SVD(n_factors=50,random_state=0)

algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

[과제] ratings.csv 데이터로 SVD 클래스를 이용하여 아래와 같이 추천알고리즘을 빌딩하고 평가하세요.
- 교차 검증 및 하이퍼파라미터 튜닝 : measures(RMSE, MAE), cv = 5, param = n_epoch(20,40,60), n_factors(50,100,200)

In [15]:
from surprise.model_selection import cross_validate

ratings = pd.read_csv('./dataset/ml-latest-small/ratings.csv')
reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

algo = SVD(random_state=0)
cross_validate(algo,data,measures=['RMSE','MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8716  0.8816  0.8611  0.8683  0.8741  0.8713  0.0067  
MAE (testset)     0.6667  0.6783  0.6629  0.6675  0.6727  0.6696  0.0054  
Fit time          3.57    3.56    3.62    3.62    3.63    3.60    0.03    
Test time         0.12    0.20    0.11    0.12    0.11    0.13    0.03    


{'test_rmse': array([0.87158727, 0.88161847, 0.86111541, 0.86825532, 0.87412007]),
 'test_mae': array([0.66671747, 0.67828012, 0.66285431, 0.66748086, 0.67274269]),
 'fit_time': (3.5728042125701904,
  3.5598011016845703,
  3.6228158473968506,
  3.622281074523926,
  3.6265382766723633),
 'test_time': (0.11502623558044434,
  0.1960444450378418,
  0.11302542686462402,
  0.11602640151977539,
  0.11202573776245117)}

In [17]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs':[20,40,60],'n_factors':[50,100,200]}

gs = GridSearchCV(SVD,param_grid,measures = ['RMSE','MAE'],cv=5)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8712104636705511
{'n_epochs': 20, 'n_factors': 50}
