# 모듈 불러오기

In [102]:
import mariadb
import sys
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate 
from surprise.model_selection import GridSearchCV
from surprise.dataset import DatasetAutoFolds

# Movies, Ratings 데이터 불러오기 (MovieLens)

In [134]:
# Load the MovieLens "ml-latest-small" CSV files into DataFrames
# Download link: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# The paths are relative, so the archive must be extracted next to this notebook.
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
links = pd.read_csv('./ml-latest-small/links.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')

In [135]:
# Replace every NaN in `links` with 0 (fillna is applied to the whole frame,
# not to a single column), so 0 acts as the "missing id" marker afterwards.
links = links.fillna(0)

In [136]:
# Cast the tmdbId column to int64; the other columns keep their dtypes.
# NOTE(review): this cast relies on NaN values having been replaced beforehand
# (the fillna(0) cell), since NaN cannot be represented in an int64 column.
links = links.astype({'tmdbId':'int64'})

In [149]:
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
...,...,...,...
9737,193581,5476944,432131
9738,193583,5914996,445030
9739,193585,6397426,479308
9740,193587,8391976,483455


In [137]:
links[links['tmdbId'] == 0]

Unnamed: 0,movieId,imdbId,tmdbId
624,791,113610,0
843,1107,102336,0
2141,2851,81454,0
3027,4051,56600,0
5532,26587,92337,0
5854,32600,377059,0
6059,40697,105946,0
7382,79299,874957,0


In [138]:
links.dtypes

movieId    int64
imdbId     int64
tmdbId     int64
dtype: object

In [139]:
# Attach the external ids from `links` to every rating row, joining on movieId
# (default inner join; `links` is the left frame so its columns come first).
ratings_combined = links.merge(ratings, on='movieId')

In [140]:
ratings_combined

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,timestamp
0,1,114709,862,1,4.0,964982703
1,1,114709,862,5,4.0,847434962
2,1,114709,862,7,4.5,1106635946
3,1,114709,862,15,2.5,1510577970
4,1,114709,862,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,5476944,432131,184,4.0,1537109082
100832,193583,5914996,445030,184,3.5,1537109545
100833,193585,6397426,479308,184,3.5,1537109805
100834,193587,8391976,483455,184,3.5,1537110021


In [141]:
# Drop the imdbId and timestamp columns in place; the remaining columns are kept as-is.
ratings_combined.drop(columns=['imdbId', 'timestamp'], inplace=True)

In [142]:
ratings_combined

Unnamed: 0,movieId,tmdbId,userId,rating
0,1,862,1,4.0
1,1,862,5,4.0
2,1,862,7,4.5
3,1,862,15,2.5
4,1,862,17,4.5
...,...,...,...,...
100831,193581,432131,184,4.0
100832,193583,445030,184,3.5
100833,193585,479308,184,3.5
100834,193587,483455,184,3.5


In [143]:
ratings_combined.dtypes

movieId      int64
tmdbId       int64
userId       int64
rating     float64
dtype: object

In [151]:
ratings_combined[ratings_combined['tmdbId'] == 0]

Unnamed: 0,movieId,tmdbId,userId,rating
19337,791,0,90,4.0
23779,1107,0,160,3.5
23780,1107,0,298,0.5
48623,2851,0,3,5.0
48624,2851,0,217,3.0
48625,2851,0,288,2.0
48626,2851,0,469,3.0
60017,4051,0,448,0.5
78156,26587,0,105,5.0
79567,32600,0,606,3.5


In [144]:
# Attach the external ids from `links` to the movie metadata, joining on movieId
# (default inner join; `links` is the left frame so its columns come first).
movies_combined = links.merge(movies, on='movieId')

In [145]:
movies_combined

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...
9737,193581,5476944,432131,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,5914996,445030,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,6397426,479308,Flint (2017),Drama
9740,193587,8391976,483455,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [146]:
# Drop the imdbId and genres columns in place; the remaining columns are kept as-is.
movies_combined.drop(columns=['imdbId', 'genres'], inplace=True)

In [147]:
movies_combined

Unnamed: 0,movieId,tmdbId,title
0,1,862,Toy Story (1995)
1,2,8844,Jumanji (1995)
2,3,15602,Grumpier Old Men (1995)
3,4,31357,Waiting to Exhale (1995)
4,5,11862,Father of the Bride Part II (1995)
...,...,...,...
9737,193581,432131,Black Butler: Book of the Atlantic (2017)
9738,193583,445030,No Game No Life: Zero (2017)
9739,193585,479308,Flint (2017)
9740,193587,483455,Bungo Stray Dogs: Dead Apple (2018)


In [148]:
movies_combined.dtypes

movieId     int64
tmdbId      int64
title      object
dtype: object

In [150]:
movies_combined[movies_combined['tmdbId'] == 0]

Unnamed: 0,movieId,tmdbId,title
624,791,0,"Last Klezmer: Leopold Kozlowski, His Life and ..."
843,1107,0,Loser (1991)
2141,2851,0,Saturn 3 (1980)
3027,4051,0,Horrors of Spider Island (Ein Toter Hing im Ne...
5532,26587,0,"Decalogue, The (Dekalog) (1989)"
5854,32600,0,Eros (2004)
6059,40697,0,Babylon 5
7382,79299,0,"No. 1 Ladies' Detective Agency, The (2008)"


# Surprise SVD

In [109]:
# Earlier approach, kept commented out for reference: load from the DataFrame
# and hold out a test split.
# Create the Reader object
# reader = Reader(rating_scale=(0.5, 5))

# surprise_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use surprise's train_test_split(). trainset : testset = 3 : 1
# trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=0)

#-----------------------------------------------------------------------------------------------#

# Current approach: read the ratings straight from a CSV file and train on all of it.
# Create the Reader object (comma-separated "user item rating timestamp" lines, ratings in 0.5..5)
reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))

# Create the DatasetAutoFolds object (uses the ratings_noh file, which has the index removed)
# NOTE(review): "index removed" presumably means the CSV header row was stripped
# from ratings.csv to produce ratings_noh.csv — confirm how that file is generated.
data_folds = DatasetAutoFolds(ratings_file='./ml-latest-small/ratings_noh.csv', reader=reader)

# Use the entire dataset as the trainset (no held-out test set is created here)
trainset = data_folds.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x86a2dc0>

In [110]:
# Fit an SVD model, constructed with no arguments, on the full trainset.
algo = SVD() # TODO: apply the best hyperparameters found with GridSearchCV
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x86a27c0>

In [111]:
# The user id (uid) and item id (iid) are supplied as strings
uid, iid = "196", "302"

# Predicted rating for this (user, item) pair (.predict)
pred = algo.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=3.399905796578076, details={'was_impossible': False})

In [113]:
# Inspect the uid / iid / est / details attributes of the first three predictions
# NOTE(review): `predictions` is not assigned in any cell visible in this notebook.
# It presumably came from algo.test(testset) on the train/test split that is now
# commented out — confirm and restore that definition before re-running this cell.
[ (pred.uid, pred.iid, pred.est, pred.details) for pred in predictions[:3] ]

[(63, 2000, 3.501556983616962, {'was_impossible': False}),
 (31, 788, 3.501556983616962, {'was_impossible': False}),
 (159, 6373, 3.501556983616962, {'was_impossible': False})]

# 성능 평가

In [114]:
# Performance evaluation: compute RMSE over `predictions`
# NOTE(review): `predictions` is not assigned in any cell visible in this notebook,
# so this cell depends on state from an earlier session — confirm where it is built
# (presumably algo.test on a held-out testset) before trusting the reported score.
accuracy.rmse(predictions)

RMSE: 1.0388


1.0387802469854106

# 예상 평점 기반 영화 추천

In [115]:
# Build the list of movies a user has not rated yet
def get_unseen_surprise(ratings, movies, userId):
    """Return the movieIds from ``movies`` that ``userId`` has not rated.

    Args:
        ratings: DataFrame with at least ``userId`` and ``movieId`` columns.
        movies: DataFrame with at least a ``movieId`` column.
        userId: Value compared with ``==`` against ``ratings['userId']``.

    Returns:
        list: movieIds taken from ``movies`` in their original order,
        excluding every movieId that appears in the user's rating rows.

    Side effects:
        Prints the total, rated and candidate movie counts.
    """
    # Every movieId this user has rated (one entry per rating row)
    seen_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()

    # Every movieId in the catalogue
    total_movies = movies['movieId'].tolist()

    # Membership is tested once per catalogue entry, so look ids up in a set
    # instead of rescanning the rated list for every movie.
    seen_lookup = set(seen_movies)
    unseen_movies = [movie for movie in total_movies if movie not in seen_lookup]

    # Summary counts (the rated count is the number of rating rows, as before)
    total_movie_cnt = len(total_movies)
    seen_cnt = len(seen_movies)
    unseen_cnt = len(unseen_movies)

    print(f"전체 영화 수: {total_movie_cnt}, 평점 매긴 영화 수: {seen_cnt}, 추천 대상 영화 수: {unseen_cnt}")

    return unseen_movies

In [155]:
# Recommend movies ranked by predicted rating
def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n=10):
    """Print the ``top_n`` candidate movies with the highest predicted rating.

    Args:
        algo: Object exposing ``predict(uid, iid)``; each result must carry
            ``iid`` and ``est`` attributes.
        userId: User to predict for; passed to ``predict`` as ``str(userId)``.
        unseen_movies: Iterable of candidate movie ids; each is passed to
            ``predict`` as ``str(movieId)``.
        top_n: Number of highest-scoring entries to print (default 10).

    Titles are looked up in the module-level ``movies`` DataFrame and printed
    as the list produced by ``.tolist()``. Nothing is returned.
    """
    # One prediction per candidate movie, ids converted to strings
    scored = [algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]

    # Order by estimated rating, highest first, and keep only the head
    ranked = sorted(scored, key=lambda item: item.est, reverse=True)
    best = ranked[:top_n]

    # Header, then one "id - title: rating" line per recommendation
    print(f"Top-{top_n} 추천 영화 리스트")

    for entry in best:
        movie_id = int(entry.iid)
        title_column = movies.loc[movies["movieId"] == movie_id, "title"]
        movie_title = title_column.tolist()
        movie_rating = entry.est

        print(f"{movie_id} - {movie_title}: {movie_rating:.2f}")

In [156]:
# Candidate movies for user 234: everything in `movies` that this user has not rated
unseen_movies = get_unseen_surprise(ratings=ratings, movies=movies, userId=234)

전체 영화 수: 9742, 평점 매긴 영화 수: 202, 추천 대상 영화 수: 9540


In [157]:
# Print the 20 highest-scoring candidates for user 234
recomm_movie_by_surprise(algo=algo, userId=234, unseen_movies=unseen_movies, top_n=20)

Top-20 추천 영화 리스트
296 - ['Pulp Fiction (1994)']: 4.83
527 - ["Schindler's List (1993)"]: 4.83
1204 - ['Lawrence of Arabia (1962)']: 4.80
750 - ['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)']: 4.74
1217 - ['Ran (1985)']: 4.73
150 - ['Apollo 13 (1995)']: 4.67
33166 - ['Crash (2004)']: 4.65
68157 - ['Inglourious Basterds (2009)']: 4.63
318 - ['Shawshank Redemption, The (1994)']: 4.63
1178 - ['Paths of Glory (1957)']: 4.62
541 - ['Blade Runner (1982)']: 4.61
1104 - ['Streetcar Named Desire, A (1951)']: 4.61
1199 - ['Brazil (1985)']: 4.60
1197 - ['Princess Bride, The (1987)']: 4.59
1921 - ['Pi (1998)']: 4.59
1213 - ['Goodfellas (1990)']: 4.58
4973 - ["Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)"]: 4.58
858 - ['Godfather, The (1972)']: 4.57
1704 - ['Good Will Hunting (1997)']: 4.54
55442 - ['Persepolis (2007)']: 4.53
