# Baseline 알고리즘 기본 코드

## #01. 준비작업

### [1] 패키지 가져오기

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Intel SKlearn 하드웨어 가속 패치 설정
import sys
if sys.platform == 'win32':
    from sklearnex import patch_sklearn
    patch_sklearn()

from hossam.util import *
from hossam.plot import *
from hossam.analysis import *
from hossam.classification import *

from sklearn.linear_model import SGDClassifier


from surprise import Dataset, BaselineOnly, Reader, KNNBasic
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from surprise.accuracy import rmse, mae

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
origin = my_read_excel('C:/Users/Jihwan/Desktop/01Class/E.추론통계,머신러닝/E.InferentialStatistics/수업자료/movie_ratings.xlsx',sheet_name='ratings')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

데이터프레임 상위 5개 행
+----+----------+-----------+----------+-------------+
|    |   userId |   movieId |   rating |   timestamp |
|----+----------+-----------+----------+-------------|
|  0 |        1 |         1 |        4 | 9.64983e+08 |
|  1 |        1 |         3 |        4 | 9.64981e+08 |
|  2 |        1 |         6 |        4 | 9.64982e+08 |
|  3 |        1 |        47 |        5 | 9.64984e+08 |
|  4 |        1 |        50 |        5 | 9.64983e+08 |
+----+----------+-----------+----------+-------------+

데이터프레임 하위 5개 행
+--------+----------+-----------+----------+-------------+
|  

## #02. 데이터 전처리

### [1] Surprise 형식의 데이터로 변환

`사용자 번호, 아이템 번호, 평점` 구조의 데이터를 만족해야 한다.

In [3]:
df = origin.drop('timestamp', axis=1)

reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(df, reader)

## #03. 추천 모형 구현

### [1] 기본 코드

#### (1) 훈련, 검증 데이터 분리

sklearn이 아닌 surprise 자체 함수 사용

In [4]:
train, test = train_test_split(data, test_size=0.2, random_state=1234)

#### (2) 추천 모형 학습 후 성능 평가

In [5]:
estimator = BaselineOnly()
estimator.fit(train)
pred = estimator.test(test)
pred[:]

Estimating biases using als...


[Prediction(uid=603, iid=3996, r_ui=5.0, est=3.865470094018238, details={'was_impossible': False}),
 Prediction(uid=199, iid=2912, r_ui=4.0, est=3.5270186068257785, details={'was_impossible': False}),
 Prediction(uid=416, iid=2716, r_ui=2.0, est=3.2531312891488335, details={'was_impossible': False}),
 Prediction(uid=589, iid=150, r_ui=4.0, est=4.143871102075767, details={'was_impossible': False}),
 Prediction(uid=307, iid=6755, r_ui=4.0, est=2.6344308636371943, details={'was_impossible': False}),
 Prediction(uid=514, iid=1326, r_ui=1.5, est=2.6953946830811386, details={'was_impossible': False}),
 Prediction(uid=587, iid=11, r_ui=4.0, est=3.998515320714829, details={'was_impossible': False}),
 Prediction(uid=177, iid=520, r_ui=4.0, est=2.958085745905209, details={'was_impossible': False}),
 Prediction(uid=234, iid=438, r_ui=3.0, est=3.5463388099721755, details={'was_impossible': False}),
 Prediction(uid=610, iid=7090, r_ui=3.0, est=4.130115679218677, details={'was_impossible': False}),


> uid: 사용자 번호, iid: 아이템 번호, r_ui: 해당 사용자가 실제로 부여한 평점, est: 예측평점

#### (3) 특정 유저가 특정 영화에 부여할 평점 예상

In [16]:
upred = estimator.predict(uid=68, iid=434)
upred

Prediction(uid=68, iid=434, r_ui=None, est=2.8605339615297294, details={'was_impossible': False})

In [7]:
upred.est

3.3150060582304923

#### (4) 성능평가

In [8]:
rmse(pred), mae(pred)

RMSE: 0.8715
MAE:  0.6706


(0.8715309792778995, 0.6706040327595953)

### [2] 교차검증

#### (1) 교차검증을 위한 하이퍼파라미터 설정

In [9]:
estimator = BaselineOnly(bsl_options={
    "method": "als", 
    "n_epochs": 10, 
    "reg_u": 10, 
    "reg_i": 15,
})
cv_result = cross_validate(estimator, data, measures = ['RMSE', 'MAE'], cv=5, verbose=True)
    

Estimating biases using als...


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8810  0.8744  0.8832  0.8766  0.8692  0.8769  0.0049  
MAE (testset)     0.6814  0.6738  0.6830  0.6744  0.6712  0.6768  0.0046  
Fit time          0.22    0.24    0.24    0.24    0.25    0.24    0.01    
Test time         0.04    0.05    0.14    0.05    0.05    0.06    0.04    


#### (2) 교차검증 결과 확인

In [10]:
cv_result

{'test_rmse': array([0.88098856, 0.87438807, 0.88319261, 0.87659922, 0.86921872]),
 'test_mae': array([0.68139527, 0.67377822, 0.68299949, 0.67436852, 0.67122956]),
 'fit_time': (0.21834540367126465,
  0.24388551712036133,
  0.2374560832977295,
  0.2379450798034668,
  0.25442934036254883),
 'test_time': (0.03521442413330078,
  0.04800605773925781,
  0.14065003395080566,
  0.048418521881103516,
  0.04778575897216797)}

#### (3) 교차검증 성능 평가 지표 출력

In [11]:
print("RMSE(mean):", cv_result['test_rmse'].mean())
print("MAE(mean):", cv_result['test_mae'].mean())

RMSE(mean): 0.876877435120134
MAE(mean): 0.6767542128117379


### [3] 하이퍼파라미터 튜닝

#### (1) 학습 모형 구성

In [12]:
params = {
    'bsl_options': {
        "method": ['als', 'sgd'],
        "n_epochs": [10, 20],
        "reg_u": [10, 12],
        "reg_i": [15, 20]
    }
}

grid = RandomizedSearchCV(BaselineOnly, 
                          param_distributions=params,
                          measures=['RMSE', 'MAE'],
                          cv=5,
                          n_jobs=-1,
                          random_state=123)

grid.fit(data)                                           

#### (2) 성능 평가 지표 확인

In [13]:
grid.best_score

{'rmse': 0.869653973011921, 'mae': 0.6686546523743646}

#### (3) 최적 하이퍼파라미터 확인

In [14]:
grid.best_params

{'rmse': {'bsl_options': {'method': 'sgd',
   'n_epochs': 20,
   'reg_u': 12,
   'reg_i': 15}},
 'mae': {'bsl_options': {'method': 'sgd',
   'n_epochs': 20,
   'reg_u': 12,
   'reg_i': 15}}}

#### (4) 최적 추정기

In [15]:
grid.best_estimator

{'rmse': <surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x25f939a7bd0>,
 'mae': <surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x25f939a7890>}