# 모듈 불러오기

In [2]:
import mariadb
import sys
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate 
from surprise.model_selection import GridSearchCV
from surprise.dataset import DatasetAutoFolds

# Movies, Ratings 데이터 불러오기 (MovieLens)

In [3]:
# Load the MovieLens "ml-latest-small" CSV files into DataFrames.
# Download link: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
links = pd.read_csv('./ml-latest-small/links.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')

In [4]:
# Inspect the column dtypes of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [5]:
# Replace every missing value in links with 0 (fillna applies to all columns).
# 0 then serves as the "no id" marker that later cells filter on via tmdbId == 0.
links = links.fillna(0)

In [6]:
# Cast tmdbId to int64; only the tmdbId column is converted.
links = links.astype({'tmdbId':'int64'})

In [7]:
# Display the links table after filling and casting.
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
...,...,...,...
9737,193581,5476944,432131
9738,193583,5914996,445030
9739,193585,6397426,479308
9740,193587,8391976,483455


In [8]:
# Show the rows whose tmdbId is 0, the value the earlier fillna(0) wrote for missing ids.
links[links['tmdbId'] == 0]

Unnamed: 0,movieId,imdbId,tmdbId
624,791,113610,0
843,1107,102336,0
2141,2851,81454,0
3027,4051,56600,0
5532,26587,92337,0
5854,32600,377059,0
6059,40697,105946,0
7382,79299,874957,0


In [9]:
# Confirm the column dtypes of links after the cast.
links.dtypes

movieId    int64
imdbId     int64
tmdbId     int64
dtype: object

In [10]:
# Attach imdbId/tmdbId to each rating by joining links and ratings on movieId.
# No `how` is given, so pd.merge uses its default join type.
ratings_combined = pd.merge(links, ratings, on='movieId')

In [11]:
# Display the merged ratings table.
ratings_combined

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,timestamp
0,1,114709,862,1,4.0,964982703
1,1,114709,862,5,4.0,847434962
2,1,114709,862,7,4.5,1106635946
3,1,114709,862,15,2.5,1510577970
4,1,114709,862,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,5476944,432131,184,4.0,1537109082
100832,193583,5914996,445030,184,3.5,1537109545
100833,193585,6397426,479308,184,3.5,1537109805
100834,193587,8391976,483455,184,3.5,1537110021


In [12]:
# Remove the imdbId and timestamp columns in place (inplace=True mutates ratings_combined).
ratings_combined.drop(['imdbId', 'timestamp'], axis=1, inplace=True)

In [13]:
# Display ratings_combined after dropping the two columns.
ratings_combined

Unnamed: 0,movieId,tmdbId,userId,rating
0,1,862,1,4.0
1,1,862,5,4.0
2,1,862,7,4.5
3,1,862,15,2.5
4,1,862,17,4.5
...,...,...,...,...
100831,193581,432131,184,4.0
100832,193583,445030,184,3.5
100833,193585,479308,184,3.5
100834,193587,483455,184,3.5


In [14]:
# Inspect the column dtypes of ratings_combined.
ratings_combined.dtypes

movieId      int64
tmdbId       int64
userId       int64
rating     float64
dtype: object

In [15]:
# Show the ratings whose movie has tmdbId 0 (the fill value used for missing ids).
ratings_combined[ratings_combined['tmdbId'] == 0]

Unnamed: 0,movieId,tmdbId,userId,rating
19337,791,0,90,4.0
23779,1107,0,160,3.5
23780,1107,0,298,0.5
48623,2851,0,3,5.0
48624,2851,0,217,3.0
48625,2851,0,288,2.0
48626,2851,0,469,3.0
60017,4051,0,448,0.5
78156,26587,0,105,5.0
79567,32600,0,606,3.5


In [16]:
# Attach imdbId/tmdbId to each movie by joining links and movies on movieId.
movies_combined = pd.merge(links, movies, on='movieId')

In [17]:
# Display the merged movies table.
movies_combined

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...
9737,193581,5476944,432131,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,5914996,445030,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,6397426,479308,Flint (2017),Drama
9740,193587,8391976,483455,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [18]:
# Remove the imdbId and genres columns in place (inplace=True mutates movies_combined).
movies_combined.drop(['imdbId', 'genres'], axis=1, inplace=True)

In [19]:
# Display movies_combined after dropping the two columns.
movies_combined

Unnamed: 0,movieId,tmdbId,title
0,1,862,Toy Story (1995)
1,2,8844,Jumanji (1995)
2,3,15602,Grumpier Old Men (1995)
3,4,31357,Waiting to Exhale (1995)
4,5,11862,Father of the Bride Part II (1995)
...,...,...,...
9737,193581,432131,Black Butler: Book of the Atlantic (2017)
9738,193583,445030,No Game No Life: Zero (2017)
9739,193585,479308,Flint (2017)
9740,193587,483455,Bungo Stray Dogs: Dead Apple (2018)


In [20]:
# Inspect the column dtypes of movies_combined.
movies_combined.dtypes

movieId     int64
tmdbId      int64
title      object
dtype: object

In [21]:
# Show the movies whose tmdbId is 0 (the fill value used for missing ids).
movies_combined[movies_combined['tmdbId'] == 0]

Unnamed: 0,movieId,tmdbId,title
624,791,0,"Last Klezmer: Leopold Kozlowski, His Life and ..."
843,1107,0,Loser (1991)
2141,2851,0,Saturn 3 (1980)
3027,4051,0,Horrors of Spider Island (Ein Toter Hing im Ne...
5532,26587,0,"Decalogue, The (Dekalog) (1989)"
5854,32600,0,Eros (2004)
6059,40697,0,Babylon 5
7382,79299,0,"No. 1 Ladies' Detective Agency, The (2008)"


# Surprise SVD

In [22]:
# Disabled alternative, kept for reference: build the dataset from the DataFrame.
# Create the Reader object
# reader = Reader(rating_scale=(0.5, 5))

# surprise_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use surprise's train_test_split(). trainset : testset = 3 : 1
# trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=0)

#-----------------------------------------------------------------------------------------------#

# Create the Reader object: comma-separated "user item rating timestamp" lines, ratings on a 0.5-5 scale.
reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))

# Create the DatasetAutoFolds object from the ratings_noh file.
# NOTE(review): ratings_noh.csv is presumably ratings.csv with its header row removed — confirm.
data_folds = DatasetAutoFolds(ratings_file='./ml-latest-small/ratings_noh.csv', reader=reader)

# Use the entire dataset as the trainset.
trainset = data_folds.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x93bf970>

In [23]:
# Build a testset of (user, item, rating) tuples from the full trainset.
# The ids in the displayed tuples are strings because this data was read from a file.
testset = trainset.build_testset()
testset

[('1', '1', 4.0),
 ('1', '3', 4.0),
 ('1', '6', 4.0),
 ('1', '47', 5.0),
 ('1', '50', 5.0),
 ('1', '70', 3.0),
 ('1', '101', 5.0),
 ('1', '110', 4.0),
 ('1', '151', 5.0),
 ('1', '157', 5.0),
 ('1', '163', 5.0),
 ('1', '216', 5.0),
 ('1', '223', 3.0),
 ('1', '231', 5.0),
 ('1', '235', 4.0),
 ('1', '260', 5.0),
 ('1', '296', 3.0),
 ('1', '316', 3.0),
 ('1', '333', 5.0),
 ('1', '349', 4.0),
 ('1', '356', 4.0),
 ('1', '362', 5.0),
 ('1', '367', 4.0),
 ('1', '423', 3.0),
 ('1', '441', 4.0),
 ('1', '457', 5.0),
 ('1', '480', 4.0),
 ('1', '500', 3.0),
 ('1', '527', 5.0),
 ('1', '543', 4.0),
 ('1', '552', 4.0),
 ('1', '553', 5.0),
 ('1', '590', 4.0),
 ('1', '592', 4.0),
 ('1', '593', 4.0),
 ('1', '596', 5.0),
 ('1', '608', 5.0),
 ('1', '648', 3.0),
 ('1', '661', 5.0),
 ('1', '673', 3.0),
 ('1', '733', 4.0),
 ('1', '736', 3.0),
 ('1', '780', 3.0),
 ('1', '804', 4.0),
 ('1', '919', 5.0),
 ('1', '923', 5.0),
 ('1', '940', 5.0),
 ('1', '943', 4.0),
 ('1', '954', 5.0),
 ('1', '1009', 3.0),
 ('1', '

In [24]:
# Create the Reader object for ratings on a 0.5-5 scale.
reader = Reader(rating_scale=(0.5, 5))

# Build the Surprise dataset from the userId, movieId and rating columns of the DataFrame.
surprise_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use surprise's train_test_split(). test_size=0.5, so trainset : testset = 1 : 1.
# This rebinds trainset and testset, replacing the ones built from ratings_noh.csv.
trainset, testset = train_test_split(surprise_data, test_size=0.5, random_state=0)

In [25]:
# Fit SVD with its default hyperparameters on the current trainset.
algo = SVD() # TODO: apply the best hyperparameters found with GridSearchCV
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x93fd310>

In [26]:
# User id (uid) and item id (iid) to predict for.
# They are ints here: the model was fit on data from load_from_df, whose userId/movieId
# columns are int64. Ids read from a ratings file appear as strings instead
# (see the ('1', '1', 4.0) tuples in the earlier testset output).
uid = 9
iid = 42

# Predicted rating for one (user, item) pair (.predict)
pred = algo.predict(uid, iid)
pred

Prediction(uid=9, iid=42, r_ui=None, est=2.995192831344469, details={'was_impossible': False})

In [27]:
# Predicted ratings for every entry in the testset (.test)
predictions = algo.test(testset)

print('prediction type :',type(predictions), ' size:',len(predictions))
print('prediction 결과의 최초 5개 추출')

# Show the first five Prediction objects.
predictions[:5]

prediction type : <class 'list'>  size: 50418
prediction 결과의 최초 5개 추출


[Prediction(uid=599, iid=3147, r_ui=3.0, est=3.1641190778699233, details={'was_impossible': False}),
 Prediction(uid=356, iid=3852, r_ui=5.0, est=3.742184771827847, details={'was_impossible': False}),
 Prediction(uid=599, iid=2422, r_ui=2.0, est=2.044783439375324, details={'was_impossible': False}),
 Prediction(uid=274, iid=185, r_ui=3.0, est=3.0786970690049906, details={'was_impossible': False}),
 Prediction(uid=202, iid=1209, r_ui=5.0, est=3.8024042593881093, details={'was_impossible': False})]

In [28]:
# Inspect the uid, iid, est and details attributes of the first three predictions.
[ (pred.uid, pred.iid, pred.est, pred.details) for pred in predictions[:3] ]

[(599, 3147, 3.1641190778699233, {'was_impossible': False}),
 (356, 3852, 3.742184771827847, {'was_impossible': False}),
 (599, 2422, 2.044783439375324, {'was_impossible': False})]

# 성능 평가

In [29]:
# Evaluate performance: RMSE of the testset predictions.
accuracy.rmse(predictions)

RMSE: 0.8857


0.8857334257150036

# 최적 파라미터 탐색

In [30]:
# n_epochs: number of SGD iterations, n_factors: number of latent factors
param_grid = {
    'n_epochs': [20, 40, 60], 
    'n_factors': [50, 100, 200]
}

# GridSearchCV over the grid above with cv=3, recording RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) # the SVD class is passed here, not the fitted algo instance
gs.fit(surprise_data)

# Best hyperparameters and the best score obtained with them (by RMSE)
print(gs.best_params['rmse'])
print(gs.best_score['rmse'])

{'n_epochs': 20, 'n_factors': 50}
0.8764481488956347


# 예상 평점 기반 영화 추천

In [31]:
# Build the list of movies a user has not rated yet
def get_unseen_surprise(ratings, movies, userId):
    """Return the movie ids in ``movies`` that ``userId`` has not rated.

    Also prints a one-line summary with the total, rated and unrated counts.

    Args:
        ratings: DataFrame with at least ``userId`` and ``movieId`` columns.
        movies: DataFrame with at least a ``movieId`` column.
        userId: Id of the user, compared against ``ratings['userId']``.

    Returns:
        list: Movie ids from ``movies`` (in its row order) that do not appear
        among the user's rated movie ids.
    """
    # Every movie id this user has rated (one entry per rating row).
    seen_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()

    # Every movie id in the catalogue (ids, not titles).
    total_movies = movies['movieId'].tolist()

    # Look ids up in a set: testing membership against the list would rescan
    # the user's whole rating history for every catalogue entry.
    seen_lookup = set(seen_movies)
    unseen_movies = [movie for movie in total_movies if movie not in seen_lookup]

    # Summary counts; seen_cnt counts rating rows, so it is taken from the list.
    total_movie_cnt = len(total_movies)
    seen_cnt = len(seen_movies)
    unseen_cnt = len(unseen_movies)

    print(f"전체 영화 수: {total_movie_cnt}, 평점 매긴 영화 수: {seen_cnt}, 추천 대상 영화 수: {unseen_cnt}")

    return unseen_movies

In [32]:
# Recommend movies ranked by predicted rating
def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n=10, movies_df=None):
    """Print the ``top_n`` candidate movies with the highest predicted rating.

    Args:
        algo: Object exposing ``predict(uid, iid)``; each result must provide
            ``iid`` and ``est`` attributes.
        userId: User id passed to ``algo.predict``.
        unseen_movies: Iterable of movie ids to score for this user.
        top_n: Maximum number of movies to print.
        movies_df: DataFrame with ``movieId`` and ``title`` columns used to look
            up titles. When omitted, the module-level ``movies`` DataFrame is
            used, which is what this function always read before the parameter
            existed.

    Returns:
        None. The ranking is written to stdout, one movie per line.
    """
    # Predicted rating for every candidate movie.
    predictions = [algo.predict(userId, movieId) for movieId in unseen_movies]

    # Highest estimate first; list.sort is stable, so ties keep input order.
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    top_predictions = predictions[:top_n]

    # Print movie id, title and predicted rating.
    print(f"Top-{top_n} 추천 영화 리스트")

    if not top_predictions:
        return

    # Resolve the title table only when there is something to look up, so an
    # empty candidate list never touches the module-level fallback.
    title_table = movies if movies_df is None else movies_df

    for pred in top_predictions:
        movie_id = int(pred.iid)
        # Kept as a list on purpose: the printed line shows the list repr of
        # all titles matching this id, exactly as before.
        movie_title = title_table[title_table["movieId"] == movie_id]["title"].tolist()
        movie_rating = pred.est

        print(f"{movie_id} - {movie_title}: {movie_rating:.2f}")

In [33]:
# Movie ids that user 234 has not rated yet.
unseen_movies = get_unseen_surprise(ratings, movies, 234)

전체 영화 수: 9742, 평점 매긴 영화 수: 202, 추천 대상 영화 수: 9540


In [34]:
# Print the 20 unseen movies with the highest predicted rating for user 234.
recomm_movie_by_surprise(algo, 234, unseen_movies, top_n=20)

Top-20 추천 영화 리스트
441 - ['Dazed and Confused (1993)']: 4.71
3836 - ["Kelly's Heroes (1970)"]: 4.56
898 - ['Philadelphia Story, The (1940)']: 4.55
7153 - ['Lord of the Rings: The Return of the King, The (2003)']: 4.54
56782 - ['There Will Be Blood (2007)']: 4.52
2791 - ['Airplane! (1980)']: 4.51
608 - ['Fargo (1996)']: 4.48
1225 - ['Amadeus (1984)']: 4.48
527 - ["Schindler's List (1993)"]: 4.46
1193 - ["One Flew Over the Cuckoo's Nest (1975)"]: 4.46
1288 - ['This Is Spinal Tap (1984)']: 4.46
1704 - ['Good Will Hunting (1997)']: 4.45
318 - ['Shawshank Redemption, The (1994)']: 4.44
4973 - ["Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)"]: 4.44
1090 - ['Platoon (1986)']: 4.44
1204 - ['Lawrence of Arabia (1962)']: 4.44
2501 - ['October Sky (1999)']: 4.42
60069 - ['WALL·E (2008)']: 4.42
1206 - ['Clockwork Orange, A (1971)']: 4.42
2804 - ['Christmas Story, A (1983)']: 4.42
