# SVD 활용

## #01. 준비작업

### [1] 패키지 가져오기

In [113]:
import warnings
warnings.filterwarnings('ignore')

# Intel SKlearn 하드웨어 가속 패치 설정
import sys
if sys.platform == 'win32':
    from sklearnex import patch_sklearn
    patch_sklearn()

from helper.util import *
from helper.plot import *
from helper.analysis import *

from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from surprise.accuracy import rmse, mae

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### [2] 데이터 가져오기

In [114]:
# Load the 'ratings' sheet of the workbook into `origin` (the raw rating records).
# NOTE(review): info=False presumably suppresses the helper's summary printout
# (the call below without it prints one) - confirm in helper.util.
origin = my_read_excel("https://data.hossam.kr/mldata/movie_ratings.xlsx", 
                    sheet_name='ratings', info=False)

In [115]:
# Load the 'movies' sheet of the same workbook; per the summary printed below
# it has the columns movieId, title and genres.
movies = my_read_excel("https://data.hossam.kr/mldata/movie_ratings.xlsx", 
                    sheet_name='movies')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

데이터프레임 상위 5개 행
+----+-----------+------------------------------------+---------------------------------------------+
|    |   movieId | title                              | genres                                      |
|----+-----------+------------------------------------+---------------------------------------------|
|  0 |         1 | Toy Story (1995)                   | Adventure|Animation|Children|Comedy|Fantasy |
|  1 |         2 | Jumanji (1995)                     | Adventure|Children|Fantasy                  |
|  2 |         3 | Grumpier Old Men (1995)            | Comedy|Romance                              |
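|  3 |         4 | Waiting to Exhale (1995)           | Comedy|Drama|Romance                        |
|  4 |         5 | Father of the Bride Part II (1995) | Comedy                                      |
+----+-----------+------------------------------------+---------------------------------------------+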

## #02. 데이터 전처리

### [1] Surprise 형식의 데이터로 변환

Surprise에 전달하는 데이터는 `사용자 번호, 아이템 번호, 평점` 순서의 세 컬럼으로 구성되어야 한다.

In [116]:
# Surprise loads a frame of exactly (user, item, rating) columns, so the
# timestamp column is removed before conversion.
df = origin.drop(columns=['timestamp'])

# rating_scale is the (lowest, highest) rating range, here 0.5 to 5.0.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap the frame as a Surprise dataset; the bare name displays it in the notebook.
data = Dataset.load_from_df(df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x25b6c3e5b50>

## #03. 추천 모형 구현

### [1] 최적 하이퍼 파라미터 찾기

In [117]:
# Candidate SVD hyperparameters; the grid search evaluates every combination.
params = dict(
    n_epochs=[20, 40, 50],       # number of training iterations (default 20)
    n_factors=[100, 200, 300],   # number of latent factors (default 100)
    random_state=[123],          # single fixed seed
)

# NOTE: RandomizedSearchCV appeared to raise an error here (or it needs its
# parameters specified differently), so an exhaustive grid search is used.
# NOTE(review): Surprise's RandomizedSearchCV takes `param_distributions`
# rather than `param_grid` - that may have been the cause; confirm.
grid = GridSearchCV(SVD, param_grid=params, measures=['RMSE'], cv=5, n_jobs=-1)
grid.fit(data)

# Algorithm instance configured with the best-scoring parameters for RMSE.
estimator = grid.best_estimator['rmse']
estimator

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25b6ffe7a10>

### [2] 최적 모형을 전체 데이터로 다시 학습시키기

In [118]:
# Build a trainset from the full dataset and fit the estimator chosen by the
# grid search on it; the fit call is the last expression so its return value
# is what the notebook displays.
train = data.build_full_trainset()
estimator.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25b6ffe7a10>

## #04. 특정 사용자에 대한 추천 영화 목록 만들기

### [1] 장르 세트 구하기

In [119]:
# Collect every distinct genre label: each movie's `genres` value is a
# '|'-separated string, so split it and pool the pieces into one set.
genres_set = {
    genre
    for genre_field in movies['genres']
    for genre in genre_field.split('|')
}
genres_set

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

### [2] 특정 사용자와 장르 지정하기

In [120]:
# Recommendation target: the user to recommend for, and the genres of
# interest (a movie qualifies if it carries at least one of them).
select_id = 9
select_genres = [
    'Adventure',
    'Fantasy',
]

### [3] 해당 사용자가 본 영화 목록

In [121]:

# IDs of every movie the selected user has already rated.
is_selected_user = origin['userId'] == select_id
seen_movies = origin[is_selected_user]['movieId'].tolist()
print(seen_movies)
# The label is built from select_id (it used to hard-code "9"), so the message
# stays correct when a different user is selected.
print(f"{select_id}번 사용자가 본 영화 수:", len(seen_movies))

[41, 187, 223, 371, 627, 922, 923, 1037, 1095, 1198, 1270, 1674, 1987, 2011, 2012, 2023, 2300, 2877, 2901, 3173, 3328, 3735, 4131, 4558, 4993, 5218, 5378, 5445, 5447, 5451, 5481, 5507, 5841, 5843, 5872, 5890, 5891, 5893, 5902, 5952, 5956, 5962, 5965, 5988, 6001, 6044]
9번 사용자가 본 영화 수: 46


### [4] 해당 사용자가 보지 않은 영화 목록 (추천대상)

In [122]:
# Movies the selected user has not rated are the recommendation candidates
# ("~" negates the boolean mask).
# .copy() gives an independent frame: the column assignment below would
# otherwise write into a filtered slice of `movies`, which pandas flags with
# SettingWithCopyWarning (silenced in this notebook by the global warnings
# filter) and which may not behave as intended.
unseen_movies = movies[~movies['movieId'].isin(seen_movies)].copy()
# Turn each '|'-separated genre string into a list of genre labels.
unseen_movies['genres'] = unseen_movies['genres'].apply(lambda text: text.split('|'))
unseen_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


### [5] 예상평점 구하기

In [123]:
# Predict the selected user's rating for every unseen movie.
# uid comes from select_id (it used to be hard-coded to 9), so changing the
# selected user actually changes whose ratings are predicted.
unseen_movies['ratings'] = unseen_movies['movieId'].apply(
    lambda movie_id: estimator.predict(uid=select_id, iid=movie_id).est
)
# Highest predicted rating first; rebinding instead of inplace=True avoids
# mutating a frame that may be a slice of `movies`.
unseen_movies = unseen_movies.sort_values('ratings', ascending=False)
unseen_movies

Unnamed: 0,movieId,title,genres,ratings
1734,2329,American History X (1998),"[Crime, Drama]",4.274520
863,1136,Monty Python and the Holy Grail (1975),"[Adventure, Comedy, Fantasy]",4.237088
916,1215,Army of Darkness (1993),"[Action, Adventure, Comedy, Fantasy, Horror]",4.231220
899,1197,"Princess Bride, The (1987)","[Action, Adventure, Comedy, Fantasy, Romance]",4.226809
510,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]",4.218979
...,...,...,...,...
2042,2720,Inspector Gadget (1999),"[Action, Adventure, Children, Comedy]",1.903348
5270,8666,Catwoman (2004),"[Action, Crime, Fantasy]",1.850484
1743,2338,I Still Know What You Did Last Summer (1998),"[Horror, Mystery, Thriller]",1.850384
1173,1556,Speed 2: Cruise Control (1997),"[Action, Romance, Thriller]",1.747354


### [6] 원하는 장르로 10개 선정하기

In [124]:
check = []
for num,i in enumerate(unseen_movies['genres']):
    for j in select_genres:
        if j in i: 
            check.append(num)
            break
    if len(check) ==10: break
check

[1, 2, 3, 5, 8, 13, 23, 24, 29, 30]

### [7] 결과확인

In [125]:
# `check` holds positional offsets produced by enumerate(), so rows are picked
# with iloc (position-based) rather than loc (label-based).
unseen_movies.iloc[check]

Unnamed: 0,movieId,title,genres,ratings
863,1136,Monty Python and the Holy Grail (1975),"[Adventure, Comedy, Fantasy]",4.237088
916,1215,Army of Darkness (1993),"[Action, Adventure, Comedy, Fantasy, Horror]",4.23122
899,1197,"Princess Bride, The (1987)","[Action, Adventure, Comedy, Fantasy, Romance]",4.226809
224,260,Star Wars: Episode IV - A New Hope (1977),"[Action, Adventure, Sci-Fi]",4.218799
924,1223,"Grand Day Out with Wallace and Gromit, A (1989)","[Adventure, Animation, Children, Comedy, Sci-Fi]",4.206999
898,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]",4.175422
903,1201,"Good, the Bad and the Ugly, The (Buono, il bru...","[Action, Adventure, Western]",4.098012
4800,7153,"Lord of the Rings: The Return of the King, The...","[Action, Adventure, Drama, Fantasy]",4.078465
3194,4306,Shrek (2001),"[Adventure, Animation, Children, Comedy, Fanta...",4.056458
964,1265,Groundhog Day (1993),"[Comedy, Fantasy, Romance]",4.054793
