# 모듈 불러오기

In [1]:
import mariadb
import sys
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate 
from surprise.model_selection import GridSearchCV

# Movies, Ratings 데이터 불러오기 (MovieLens)

In [149]:
# 데이터 불러오기 (데이터 프레임)
ratings = pd.read_csv('./ml-25m/ratings.csv')
links = pd.read_csv('./ml-25m/links.csv')
movies = pd.read_csv('./ml-25m/movies.csv')

In [150]:
links = links.fillna(-1)

In [151]:
links = links.astype({'tmdbId':'int'})

In [152]:
ratings_combined = pd.merge(links, ratings, on='movieId')

In [153]:
ratings_combined

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,timestamp
0,1,114709,862,2,3.5,1141415820
1,1,114709,862,3,4.0,1439472215
2,1,114709,862,4,3.0,1573944252
3,1,114709,862,5,4.0,858625949
4,1,114709,862,8,4.0,890492517
...,...,...,...,...,...,...
25000090,209157,6671244,499546,119571,1.5,1574280748
25000091,209159,297986,63407,115835,3.0,1574280985
25000092,209163,6755366,553036,6964,4.5,1574284913
25000093,209169,249603,162892,119571,3.0,1574291826


In [154]:
ratings_combined.drop(['movieId', 'imdbId', 'timestamp'], axis=1, inplace=True)

In [155]:
ratings_combined

Unnamed: 0,tmdbId,userId,rating
0,862,2,3.5
1,862,3,4.0
2,862,4,3.0
3,862,5,4.0
4,862,8,4.0
...,...,...,...
25000090,499546,119571,1.5
25000091,63407,115835,3.0
25000092,553036,6964,4.5
25000093,162892,119571,3.0


In [156]:
ratings_combined.dtypes

tmdbId      int32
userId      int64
rating    float64
dtype: object

In [157]:
movies_combined = pd.merge(links, movies, on='movieId')

In [158]:
movies_combined

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...
62418,209157,6671244,499546,We (2018),Drama
62419,209159,297986,63407,Window of the Soul (2001),Documentary
62420,209163,6755366,553036,Bad Poems (2018),Comedy|Drama
62421,209169,249603,162892,A Girl Thing (2001),(no genres listed)


In [159]:
movies_combined.drop(['movieId', 'imdbId', 'genres'], axis=1, inplace=True)

In [160]:
movies_combined

Unnamed: 0,tmdbId,title
0,862,Toy Story (1995)
1,8844,Jumanji (1995)
2,15602,Grumpier Old Men (1995)
3,31357,Waiting to Exhale (1995)
4,11862,Father of the Bride Part II (1995)
...,...,...
62418,499546,We (2018)
62419,63407,Window of the Soul (2001)
62420,553036,Bad Poems (2018)
62421,162892,A Girl Thing (2001)


In [161]:
movies_combined.dtypes

tmdbId     int32
title     object
dtype: object

# Surprise SVD

In [162]:
# Reader 객체 생성
reader = Reader(rating_scale=(0.5, 5.0))

surprise_data = Dataset.load_from_df(ratings_combined[['userId', 'tmdbId', 'rating']], reader)

# surprise의 train_test_split() 사용. trainset : testset = 3 : 1
trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=0)

In [163]:
algo = SVD() # GridSearchCV를 이용한 최적 하이퍼 파라미터 적용 필요
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x94be520>

In [164]:
# 사용자 아이디(uid), 아이템 아이디(iid)는 문자열로 입력
uid = str(196)
iid = str(302)

# 추천 예측 평점 (.predict)
pred = algo.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=3.533832591887252, details={'was_impossible': False})

In [165]:
# 추천 예측 평점 (.test)
predictions = algo.test( testset )

print('prediction type :',type(predictions), ' size:',len(predictions))
print('prediction 결과의 최초 5개 추출')

predictions[:5]

prediction type : <class 'list'>  size: 6250024
prediction 결과의 최초 5개 추출


[Prediction(uid=145355, iid=91679, r_ui=4.5, est=3.3946251030941275, details={'was_impossible': False}),
 Prediction(uid=86805, iid=141, r_ui=3.0, est=3.61579035866443, details={'was_impossible': False}),
 Prediction(uid=9739, iid=424, r_ui=3.0, est=4.0202023160040525, details={'was_impossible': False}),
 Prediction(uid=144322, iid=27303, r_ui=2.5, est=2.8130119348396625, details={'was_impossible': False}),
 Prediction(uid=53896, iid=117, r_ui=4.0, est=4.374212993285064, details={'was_impossible': False})]

In [166]:
# 속성 확인
[ (pred.uid, pred.iid, pred.est, pred.details) for pred in predictions[:3] ]

[(145355, 91679, 3.3946251030941275, {'was_impossible': False}),
 (86805, 141, 3.61579035866443, {'was_impossible': False}),
 (9739, 424, 4.0202023160040525, {'was_impossible': False})]

# 성능 평가

In [167]:
# 성능 평가
accuracy.rmse(predictions)

RMSE: 0.7809


0.7808989569302095

# 최적 파라미터 탐색

In [13]:
# n_epochs: SGD 수행 시 반복 횟수, n_factors: 잠재 요인 크기
param_grid = {
    'n_epochs': [20, 40, 60], 
    'n_factors': [50, 100, 200]
}

# GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) # algo가 아닌 SVD 입력하였다.
gs.fit(surprise_data)

# 최적 하이퍼 파라미터 및 그 때의 최고 성능
print(gs.best_params['rmse'])
print(gs.best_score['rmse'])

{'n_epochs': 20, 'n_factors': 50}
0.876436307682205


# 예상 평점 기반 영화 추천

In [168]:
# 아직 보지 않은 영화 리스트 함수
def get_unseen_surprise(ratings, movies, userId):
     # 특정 userId가 평점을 매긴 모든 영화 리스트
    seen_movies = ratings[ratings['userId']== userId]['tmdbId'].tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    total_movies = movies['tmdbId'].tolist()
      
    # 한줄 for + if문으로 안 본 영화 리스트 생성
    unseen_movies = [ movie for movie in total_movies if movie not in seen_movies]
    
    # 일부 정보 출력
    total_movie_cnt = len(total_movies)
    seen_cnt = len(seen_movies)
    unseen_cnt = len(unseen_movies)
    
    print(f"전체 영화 수: {total_movie_cnt}, 평점 매긴 영화 수: {seen_cnt}, 추천 대상 영화 수: {unseen_cnt}")
    
    return unseen_movies

In [169]:
# 예상 평점 기반 영화 리스트 추천 함수
def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n=10):
    
    # 아직 보지 않은 영화의 예측 평점: prediction 객체 생성
    predictions = []    
    for tmdbId in unseen_movies:
        predictions.append(algo.predict(str(userId), tmdbId))
    
    # 리스트 내의 prediction 객체의 est를 기준으로 내림차순 정렬
    def sortkey_est(pred):
        return pred.est

    predictions.sort(key=sortkey_est, reverse=True) # key에 리스트 내 객체의 정렬 기준을 입력
    
    # 상위 top_n개의 prediction 객체
    top_predictions = predictions[:top_n]
    
    # 영화 아이디, 제목, 예측 평점 출력
    print(f"Top-{top_n} 추천 영화 리스트")
    
    for pred in top_predictions:
        
        movie_id = pred.iid
        movie_title = movies_combined[movies_combined["tmdbId"] == movie_id]["title"].tolist()
        movie_rating = pred.est
        
        print(f"{movie_id} - {movie_title} : {movie_rating:.2f}")

In [176]:
unseen_movies = get_unseen_surprise(ratings_combined, movies_combined, 523)

전체 영화 수: 62423, 평점 매긴 영화 수: 205, 추천 대상 영화 수: 62218


In [177]:
recomm_movie_by_surprise(algo, 523, unseen_movies, top_n=20)

Top-20 추천 영화 리스트
420714 - ['Planet Earth II (2016)'] : 4.49
192040 - ['Planet Earth (2006)'] : 4.42
220903 - ['Life (2009)'] : 4.40
331214 - ['Band of Brothers (2001)'] : 4.37
409926 - ['Cosmos'] : 4.36
206514 - ['I, Claudius (1976)'] : 4.34
200813 - ['The Blue Planet (2001)'] : 4.34
488705 - ['Cosmos: A Spacetime Odissey'] : 4.33
72095 - ['Sherlock Holmes and Dr. Watson: Acquaintance (1979)'] : 4.29
71317 - ['Hamlet (Gamlet) (1964)'] : 4.29
436305 - ['The Farthest (2017)'] : 4.29
30 - ['Magnetic Rose (1995)'] : 4.28
367647 - ['Connections (1978)'] : 4.28
244049 - ['Drishyam (2013)'] : 4.28
269981 - ['Twelve Angry Men (1954)'] : 4.28
419787 - ['Death Note: Desu nôto (2006–2007)'] : 4.27
409696 - ['Over the Garden Wall (2013)'] : 4.27
129 - ['Spirited Away (Sen to Chihiro no kamikakushi) (2001)'] : 4.27
65992 - ['How to Die in Oregon (2011)'] : 4.27
463612 - ['Blue Planet II (2017)'] : 4.26
