# KNNBasic 알고리즘 기본 코드

## #01. 준비작업

### [1] 패키지 가져오기

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Intel SKlearn 하드웨어 가속 패치 설정
import sys
if sys.platform == 'win32':
    from sklearnex import patch_sklearn
    patch_sklearn()

from hossam.util import *
from hossam.plot import *
from hossam.analysis import *
from hossam.classification import *

from sklearn.linear_model import SGDClassifier


from surprise import Dataset,  Reader, KNNBasic
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from surprise.accuracy import rmse, mae

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
origin = my_read_excel('C:/Users/Jihwan/Desktop/01Class/E.추론통계,머신러닝/E.InferentialStatistics/수업자료/movie_ratings.xlsx',sheet_name='ratings')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

데이터프레임 상위 5개 행
+----+----------+-----------+----------+-------------+
|    |   userId |   movieId |   rating |   timestamp |
|----+----------+-----------+----------+-------------|
|  0 |        1 |         1 |        4 | 9.64983e+08 |
|  1 |        1 |         3 |        4 | 9.64981e+08 |
|  2 |        1 |         6 |        4 | 9.64982e+08 |
|  3 |        1 |        47 |        5 | 9.64984e+08 |
|  4 |        1 |        50 |        5 | 9.64983e+08 |
+----+----------+-----------+----------+-------------+

데이터프레임 하위 5개 행
+--------+----------+-----------+----------+-------------+
|  

## #02. 데이터 전처리

### [1] Surprise 형식의 데이터로 변환

`사용자 번호, 아이템 번호, 평점` 구조의 데이터를 만족해야 한다.

In [4]:
df = origin.drop('timestamp', axis=1)

reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(df, reader)

## #03. 추천 모형 구현

### [1] 하이퍼파라미터 튜닝

In [5]:
params = {
    'bsl_options': {
        "method": ['als', 'sgd'],
        "n_epochs": [10, 20],
        "k": [30, 40, 50],
        "min_k": [1, 2, 3]
    },
    "sim_options": {
        # 코사인 유사도(msd), 피어슨 유사도(pearson), 피어슨-베이스라인 유사도
        "name": ['msd', 'pearson', 'pearson_baseline']
    }

}

grid = RandomizedSearchCV(KNNBasic, 
                          param_distributions=params,
                          measures=['RMSE', 'MAE'],
                          cv=5,
                          n_jobs=-1,
                          random_state=123)

grid.fit(data)                                           

### [2] 결과확인

#### (1) 하이퍼파라미터와 확인

In [6]:
grid.cv_results['params']

array([{'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'k': 50, 'min_k': 1}, 'sim_options': {'name': 'pearson', 'user_based': True}},
       {'bsl_options': {'method': 'sgd', 'n_epochs': 10, 'k': 50, 'min_k': 2}, 'sim_options': {'name': 'pearson', 'user_based': True}},
       {'bsl_options': {'method': 'als', 'n_epochs': 10, 'k': 40, 'min_k': 3}, 'sim_options': {'name': 'pearson', 'user_based': True}},
       {'bsl_options': {'method': 'als', 'n_epochs': 20, 'k': 50, 'min_k': 2}, 'sim_options': {'name': 'msd', 'user_based': True}},
       {'bsl_options': {'method': 'sgd', 'n_epochs': 10, 'k': 50, 'min_k': 3}, 'sim_options': {'name': 'msd', 'user_based': True}},
       {'bsl_options': {'method': 'sgd', 'n_epochs': 10, 'k': 50, 'min_k': 2}, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}},
       {'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'k': 50, 'min_k': 2}, 'sim_options': {'name': 'pearson', 'user_based': True}},
       {'bsl_options': {'method': 'als', 'n_epo

#### (3) 성능평가지표 확인

In [18]:
grid.best_score

{'rmse': 0.9486791375212841, 'mae': 0.7268626433149616}

#### (4) 최적 하이퍼파라미터 확인

In [7]:
grid.best_params

{'rmse': {'bsl_options': {'method': 'als',
   'n_epochs': 20,
   'k': 50,
   'min_k': 2},
  'sim_options': {'name': 'msd', 'user_based': True}},
 'mae': {'bsl_options': {'method': 'als', 'n_epochs': 20, 'k': 50, 'min_k': 2},
  'sim_options': {'name': 'msd', 'user_based': True}}}

#### (5) 최적 추정기

In [8]:
grid.best_estimator

{'rmse': <surprise.prediction_algorithms.knns.KNNBasic at 0x20f4edec3d0>,
 'mae': <surprise.prediction_algorithms.knns.KNNBasic at 0x20f4edec410>}

#### (6) 최적 추정기를 활용한 컨텐츠 추천

학습을 다시 시켜야 한다.

In [9]:
train, test = train_test_split(data, test_size=0.2, random_state=1234)

In [16]:
estimator = grid.best_estimator['rmse']
estimator.fit(train)
pred = estimator.test(test)
pred[:]

Computing the msd similarity matrix...
Done computing similarity matrix.


[Prediction(uid=603, iid=3996, r_ui=5.0, est=4.104807871091812, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=199, iid=2912, r_ui=4.0, est=4.085720357369826, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=416, iid=2716, r_ui=2.0, est=3.5240370370964937, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=589, iid=150, r_ui=4.0, est=3.9733299749945528, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=307, iid=6755, r_ui=4.0, est=3.144829027233701, details={'actual_k': 9, 'was_impossible': False}),
 Prediction(uid=514, iid=1326, r_ui=1.5, est=0.9277665687937735, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid=587, iid=11, r_ui=4.0, est=3.9583636282737125, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=177, iid=520, r_ui=4.0, est=2.9094829741939017, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=234, iid=438, r_ui=3.0, est=2.778236564994723, details={'

In [17]:
estimator.predict(uid=68, iid=434)

Prediction(uid=68, iid=434, r_ui=None, est=2.8634256803992257, details={'actual_k': 40, 'was_impossible': False})