# SVD 활용

## #01. 준비작업

### [1] 패키지 가져오기

In [1]:
# Suppress every warning so library notices do not clutter the notebook output.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
import warnings
warnings.filterwarnings('ignore')

# Apply the Intel scikit-learn hardware-acceleration patch (Windows only).
import sys
if sys.platform == 'win32':
    from sklearnex import patch_sklearn
    patch_sklearn()

# Project helper modules, imported wholesale.
# NOTE(review): my_read_excel used below presumably comes from one of these
# wildcard imports -- confirm which module provides it.
from hossam.util import *
from hossam.plot import *
from hossam.analysis import *

# Surprise recommender library: data loading, the SVD algorithm,
# model-selection utilities and accuracy metrics.
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from surprise.accuracy import rmse, mae

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### [2] 데이터 가져오기

In [6]:
# Load the `ratings` sheet (one row per user/movie rating).
# info=False appears to suppress the loader's summary printout -- the cell
# that omits it prints DataFrame info and the first rows.
origin = my_read_excel(
    "https://data.hossam.kr/mldata/movie_ratings.xlsx",
    sheet_name="ratings",
    info=False,
)


In [7]:
# Load the `movies` sheet: movieId, title and genres for every movie.
movies = my_read_excel(
    "https://data.hossam.kr/mldata/movie_ratings.xlsx",
    sheet_name="movies",
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

데이터프레임 상위 5개 행
+----+-----------+------------------------------------+---------------------------------------------+
|    |   movieId | title                              | genres                                      |
|----+-----------+------------------------------------+---------------------------------------------|
|  0 |         1 | Toy Story (1995)                   | Adventure|Animation|Children|Comedy|Fantasy |
|  1 |         2 | Jumanji (1995)                     | Adventure|Children|Fantasy                  |
|  2 |         3 | Grumpier Old Men (1995)            | Comedy|Romance                              |
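|  3 |         4 | Waiting to Exhale (1995)           | Comedy|Drama|Romance                        |
|  4 |         5 | Father of the Bride Part II (1995) | Comedy                                      |
+----+-----------+------------------------------------+---------------------------------------------+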

## #02. 데이터 전처리

### [1] Surprise 형식의 데이터로 변환

Surprise는 `사용자 번호, 아이템 번호, 평점` 순서의 세 개 컬럼으로 구성된 데이터를 요구하므로, 그 외의 컬럼(`timestamp`)은 제거해야 한다.

In [8]:
# Surprise's load_from_df expects exactly three columns in
# (user id, item id, rating) order, so the timestamp column is removed.
df = origin.drop(columns='timestamp')

# Declare the rating scale as (lowest, highest) possible rating.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap the DataFrame in a Surprise dataset and display it.
data = Dataset.load_from_df(df, reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x1f3259c4c10>

## #03. 추천 모형 구현

### [1] 최적 하이퍼 파라미터 찾기

In [9]:
# Hyperparameter candidates for SVD; the grid search tries every
# combination (3 x 3 = 9 settings).
params = dict(
    n_epochs=[20, 40, 50],         # number of training epochs (default: 20)
    n_factors=[100, 200, 300],     # number of latent factors (default: 100)
)

# NOTE: RandomizedSearchCV raised an error when it was tried here, possibly
# because its parameters are specified differently (Surprise's version takes
# `param_distributions` rather than `param_grid`) -- TODO confirm the cause.
grid = GridSearchCV(
    SVD,
    param_grid=params,
    measures=['RMSE'],   # score each setting by RMSE
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # use all available CPUs
)

grid.fit(data)

# SVD instance configured with the parameters that gave the best RMSE;
# displayed as the cell output.
estimator = grid.best_estimator['rmse']
estimator

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f32e7ab8d0>

### [2] 최적 모형을 전체 데이터로 다시 학습

In [10]:
# Build a trainset from every rating in `data` (no hold-out split).
train = data.build_full_trainset()
# Train the selected SVD model on the full trainset; fit() returns the
# estimator itself, which is what the cell displays.
estimator.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f32e7ab8d0>

## #04. 특정 사용자에 대한 추천 영화 목록 만들기

### [1] `9`번 사용자가 본 영화 목록

In [11]:
# IDs of the movies that user 9 has already rated, selected in one .loc step.
seen_movies = origin.loc[origin['userId'] == 9, 'movieId'].tolist()

print(seen_movies)
print('9번 사용자가 본 영화 수:', len(seen_movies))
                                        

[41, 187, 223, 371, 627, 922, 923, 1037, 1095, 1198, 1270, 1674, 1987, 2011, 2012, 2023, 2300, 2877, 2901, 3173, 3328, 3735, 4131, 4558, 4993, 5218, 5378, 5445, 5447, 5451, 5481, 5507, 5841, 5843, 5872, 5890, 5891, 5893, 5902, 5952, 5956, 5962, 5965, 5988, 6001, 6044]
9번 사용자가 본 영화 수: 46


### [2] 9번 사용자가 보지 않은 영화 목록 (추천대상)

In [12]:
# Recommendation candidates: every movie that user 9 has not rated yet.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later is not treated by pandas as a write into a slice of `movies`
# (SettingWithCopyWarning).
unseen_movies = movies[~movies['movieId'].isin(seen_movies)].copy()
unseen_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### [3] 예상평점 구하기

In [16]:
# Predict user 9's rating for each candidate movie. estimator.predict returns
# a prediction object whose `est` attribute is the estimated rating.
# assign() builds a new DataFrame with the extra column instead of writing
# into a filtered slice of `movies`, which avoids SettingWithCopyWarning.
unseen_movies = unseen_movies.assign(
    ratings=unseen_movies['movieId'].apply(
        lambda movie_id: estimator.predict(uid=9, iid=movie_id).est
    )
)
unseen_movies

Unnamed: 0,movieId,title,genres,ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.903178
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.359101
2,3,Grumpier Old Men (1995),Comedy|Romance,3.007464
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.653063
4,5,Father of the Bride Part II (1995),Comedy,2.520343
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,3.094958
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.006630
9739,193585,Flint (2017),Drama,3.186471
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.281308


### [4] 상위 10개 선정

In [17]:
# Ten candidates with the highest predicted rating, best first.
unseen_movies.sort_values(by='ratings', ascending=False).iloc[:10]

Unnamed: 0,movieId,title,genres,ratings
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.331469
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.262443
899,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4.260533
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.254313
828,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller,4.197305
680,898,"Philadelphia Story, The (1940)",Comedy|Drama|Romance,4.197053
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.193412
933,1233,"Boot, Das (Boat, The) (1981)",Action|Drama|War,4.167237
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.165874
3141,4226,Memento (2000),Mystery|Thriller,4.160166
