# SVD 알고리즘 기본 코드

## #01. 준비작업

### [1] 패키지 가져오기

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Intel SKlearn 하드웨어 가속 패치 설정
import sys
if sys.platform == 'win32':
    from sklearnex import patch_sklearn
    patch_sklearn()

from hossam.util import *
from hossam.plot import *
from hossam.analysis import *
from hossam.classification import *

from sklearn.linear_model import SGDClassifier


from surprise import Dataset,  Reader, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from surprise.accuracy import rmse, mae

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
origin = my_read_excel('C:/Users/Jihwan/Desktop/01Class/E.추론통계,머신러닝/E.InferentialStatistics/수업자료/movie_ratings.xlsx',sheet_name='ratings')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

데이터프레임 상위 5개 행
+----+----------+-----------+----------+-------------+
|    |   userId |   movieId |   rating |   timestamp |
|----+----------+-----------+----------+-------------|
|  0 |        1 |         1 |        4 | 9.64983e+08 |
|  1 |        1 |         3 |        4 | 9.64981e+08 |
|  2 |        1 |         6 |        4 | 9.64982e+08 |
|  3 |        1 |        47 |        5 | 9.64984e+08 |
|  4 |        1 |        50 |        5 | 9.64983e+08 |
+----+----------+-----------+----------+-------------+

데이터프레임 하위 5개 행
+--------+----------+-----------+----------+-------------+
|  

## #02. 데이터 전처리

### [1] Surprise 형식의 데이터로 변환

`사용자 번호, 아이템 번호, 평점` 구조의 데이터를 만족해야 한다.

In [3]:
df = origin.drop('timestamp', axis=1)

reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(df, reader)

## #03. 추천 모형 구현

### [1] 하이퍼파라미터 튜닝

In [4]:
params = {
        "n_epochs": [20, 40, 50],
        "n_factors": [100, 200, 300]

    }

grid = GridSearchCV(SVD, 
                          param_grid=params,
                          measures=['RMSE', 'MAE'],
                          cv=5,
                          n_jobs=-1,
                          )

grid.fit(data)                                           

### [2] 결과확인

#### (1) 하이퍼파라미터와 확인

In [5]:
grid.cv_results['params']

[{'n_epochs': 20, 'n_factors': 100},
 {'n_epochs': 20, 'n_factors': 200},
 {'n_epochs': 20, 'n_factors': 300},
 {'n_epochs': 40, 'n_factors': 100},
 {'n_epochs': 40, 'n_factors': 200},
 {'n_epochs': 40, 'n_factors': 300},
 {'n_epochs': 50, 'n_factors': 100},
 {'n_epochs': 50, 'n_factors': 200},
 {'n_epochs': 50, 'n_factors': 300}]

#### (3) 성능평가지표 확인

In [8]:
df = DataFrame(grid.cv_results['params'])
df['rmse'] = grid.best_score['rmse']
df['mae'] = grid.best_score['mae']
df.sort_values('rmse', ascending=False, inplace=True)
my_pretty_table(df)

+----+------------+-------------+----------+----------+
|    |   n_epochs |   n_factors |     rmse |      mae |
|----+------------+-------------+----------+----------|
|  0 |         20 |         100 | 0.875099 | 0.671894 |
|  1 |         20 |         200 | 0.875099 | 0.671894 |
|  2 |         20 |         300 | 0.875099 | 0.671894 |
|  3 |         40 |         100 | 0.875099 | 0.671894 |
|  4 |         40 |         200 | 0.875099 | 0.671894 |
|  5 |         40 |         300 | 0.875099 | 0.671894 |
|  6 |         50 |         100 | 0.875099 | 0.671894 |
|  7 |         50 |         200 | 0.875099 | 0.671894 |
|  8 |         50 |         300 | 0.875099 | 0.671894 |
+----+------------+-------------+----------+----------+


#### (4) 최적의 하이퍼파라미터 확인

In [10]:
grid.best_params

{'rmse': {'n_epochs': 20, 'n_factors': 100},
 'mae': {'n_epochs': 20, 'n_factors': 100}}

#### (5) 최적 추정기

In [11]:
grid.best_estimator

{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x14ef6177ed0>,
 'mae': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x14ef6177ad0>}

#### (6) 최적 추정기를 활용한 컨텐츠 추천

학습을 다시 시켜야 한다.

In [12]:
train, test = train_test_split(data, test_size=0.2, random_state=1234)

In [13]:
estimator = grid.best_estimator['rmse']
estimator.fit(train)
pred = estimator.test(test)
pred[:5]

[Prediction(uid=603, iid=3996, r_ui=5.0, est=3.439236152725872, details={'was_impossible': False}),
 Prediction(uid=199, iid=2912, r_ui=4.0, est=3.5170501422974993, details={'was_impossible': False}),
 Prediction(uid=416, iid=2716, r_ui=2.0, est=3.0815827230864317, details={'was_impossible': False}),
 Prediction(uid=589, iid=150, r_ui=4.0, est=4.311212149203156, details={'was_impossible': False}),
 Prediction(uid=307, iid=6755, r_ui=4.0, est=2.825910237186873, details={'was_impossible': False})]

In [14]:
estimator.predict(uid=68, iid=434)

Prediction(uid=68, iid=434, r_ui=None, est=2.5535525980117835, details={'was_impossible': False})