## 유사도가 높은 K 사용자(KNN)의 평점을 이용한 협업 필터링

- 💬 REMIND
    - Memory-based : Matrix를 이용하는 추천시스템 중 사용자의 평점 혹은 사용 여부를 바탕으로 구매 패턴을 파악해 그 기억을 바탕으로 추천을 진행하는 방법

- KNN(K- Nearest Neighbors) : K명의 최근접 이웃에 기반해서 찾는 방법, 사용자가 준 평점으로 유사한 사람의 아이템을 찾거나, 유사한 아이템을 찾아 추천을 한다.
    - 편향을 제거(전반적으로 평점을 후하게 주거나 적게 주는 경우를 방지) 해주기 위해 비교군의 평점을 더해주거나 빼주어 동일하게 해준다.
    - 방법이 간단하고 직관적이어서 접근이 용이
    - 유저 기반의 방법 및 속도, 메모리가 많이 든다.
    - 희소성으로 인한 제약이 발생한다. (유사한 이웃이 사용한 경험이 없으면 추천 불가능하다)


In [100]:
import numpy as np
import pandas as pd

### Data Load

In [101]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/Users/jun/Library/Mobile Documents/com~apple~CloudDocs/Github/ai _recommendation _system/data/u.user', sep='|', names=u_cols, encoding='latin-1')

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('/Users/jun/Library/Mobile Documents/com~apple~CloudDocs/Github/ai _recommendation _system/data/u.item', sep='|', names=i_cols, encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/Users/jun/Library/Mobile Documents/com~apple~CloudDocs/Github/ai _recommendation _system/data/u.data', sep='\t', names=r_cols, encoding='latin-1')

In [102]:
# rating df, timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)

In [103]:
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

### RMSE
- $RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y_i})^2}$
- 예측 모델의 성능을 평가하는 지표 중 하나 / 예측 오차의 크기를 제곱한 후 평균을 내고, 다시 제곱근을 취한 값

In [104]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

### 모델별로 테스트 데이터의 예측 및 실데이터 간의 정확도 계산

In [105]:
# 모델별 RMSE를 계산하는 함수
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

### train, test set 분리

In [106]:
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state= 42, stratify=y)

### train 데이터 (사용자 X 영화 X 평점) Full matrix

In [107]:
# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


### train set 사용자간의 유사도 (cosie similarity) 계산
- 평가를 진행하지 않은 영화는 NaN값으로 되어있다.

In [108]:
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0) # 평가를 진행하지 않은 영화 -> 0으로 수정
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.000000,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.000000,0.139805,0.000000,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.000000,0.175060,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.000000
4,0.029577,0.130237,0.139805,1.000000,0.000000,0.045190,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.000000,0.130343,0.077357,0.157890,0.063911
5,0.245753,0.054918,0.000000,0.000000,1.000000,0.176443,0.281860,0.132205,0.038790,0.134200,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.104799,0.149088,0.019052,0.000000,0.029743,0.086464,0.075012,0.095736,0.000000,0.080883,...,0.061061,0.299811,0.158064,0.221251,0.323989,1.000000,0.047368,0.162173,0.058828,0.124548
940,0.232472,0.102807,0.127099,0.130343,0.188392,0.230566,0.270071,0.164157,0.131458,0.255758,...,0.195863,0.113346,0.144570,0.173568,0.139877,0.047368,1.000000,0.092911,0.199881,0.135868
941,0.051528,0.062386,0.023917,0.077357,0.068342,0.095478,0.020036,0.076269,0.106763,0.063461,...,0.021901,0.055348,0.226017,0.170493,0.249612,0.162173,0.092911,1.000000,0.072402,0.099200
942,0.129555,0.109143,0.060392,0.157890,0.055557,0.197307,0.236086,0.089871,0.089297,0.169309,...,0.111291,0.078263,0.051882,0.137759,0.069516,0.058828,0.199881,0.072402,1.000000,0.142812


### train 데이터의 user의 rating 평균과 영화의 평점편차 계산
- `rating_mean`: 각 사용자의 영화 평점의 평균
- `rating_matrix.T` : 사용자들이 영화들에 대해 평가한 정보를 열 기준으로 바꾸는 것
- `rating_matrix.T - rating_mean` : 각 사용자가 영화에 부여한 평점에서 해당 사용자의 평균 평점을 빼는 작업
    - 결과적으로 사용자별로 편차가 계산된 상태

##### 왜 해당 작업을 하는가?
- 협업 필터링에서는 사용자의 **개인적인 편향**을 고려하는 것이 중요
- 어떤 사용자는 대부분의 영화에 대해 높은 평점을 부여하고, 다른 사용자는 더 낮은 평점을 주는 경향이 있을 수 있다.
- 사용자 간의 평점이 일관되지 않으면 단순히 평점만을 이용해 유사도를 계산하거나 추천을 수행하는 것은 부정확한 결과를 초래한다.
    - 이를 해결하기 위해, 각 사용자의 영화 평점에서 **개인 평균을 빼서 편차를 계산**하고, 이를 기반으로 추천을 수행 -> 사용자 간의 차이를 보정할 수 있으며, 사용자 간 유사도 계산이 더 정확해지고 추천 결과의 품질이 향상
    - 따라서 `rating_bias`는 평점의 절대적인 값이 아니라, 사용자 평균 대비 상대적인 편차를 나타낸다.

In [109]:
# train 데이터의 user의 rating 평균과 영화의 평점편차 계산 
rating_mean = rating_matrix.mean(axis=1)   # <사용자별 영화평점 평균>, axis=1이면 column에 행의 평균
rating_bias = (rating_matrix.T - rating_mean).T   # 각 영화 사용자 평점 - <사용자별 영화평점 평균>

In [110]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.401961,-0.598039,0.401961,,-0.598039,1.401961,0.401961,-2.598039,1.401961,-0.598039,...,,,,,,,,,,
2,0.212766,,,,,,,,,-1.787234,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.190840,0.190840,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,-1.45,,,0.550000,1.550000,-0.450000,,...,,,,,,,,,,
941,,,,,,,0.058824,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


### 주어진 영화의 (movie_id) 가중 평균 rating을 계산하는 함수

In [111]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0): # 특정 유저, 특정 유저와 유사한 사람
    if movie_id in rating_bias:
        # 현 user와 다른 사용자 간의 유사도 가져오기
        sim_scores = user_similarity[user_id].copy() 
        # 현 movie의 평점편차 가져오기
        movie_ratings = rating_bias[movie_id].copy()
        # 현 movie에 대한 rating이 없는 사용자 삭제
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
##### (2) Neighbor size가 지정되지 않은 경우        
        if neighbor_size == 0:
            if sim_scores.sum() == 0:    # user_id와 유사도가 0인 경우 있음.
                prediction = rating_mean[user_id]  # 사용자별 영화평점 평균
            else:
                # 편차로 예측값(편차 예측값) 계산
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                # 편차 예측값에 현 사용자의 평균 더하기
                prediction = prediction + rating_mean[user_id]
                
##### (3) Neighbor size가 지정된 경우            
        else:
            # 해당 영화를 평가한 사용자가 최소 2명이 되는 경우에만 계산            
            if len(sim_scores) > 1:
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                # 유사도와 rating을 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                if sim_scores.sum() == 0:    # user_id와 유사도가 0인 경우 있음.
                    prediction = rating_mean[user_id]
                else:
                    # 편차로 예측치 계산
                    prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                    # 예측값에 현 사용자의 평균 더하기
                    prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction

In [112]:
print(score(CF_knn_bias))

0.957674455114373


In [113]:
print(score(CF_knn_bias, neighbor_size=30))

0.9485429509551335


### 전체 데이터에서 추천

In [80]:
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [81]:
def recommender(user, n_items=10, neighbor_size=20):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index    # 이미 평가한 영화 확인
    items = rating_matrix.loc[user].drop(rated_index)
    for item in items.index:
        predictions.append(CF_knn_bias(user, item, neighbor_size))                   # 예상평점 계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]    # 예상평점이 가장 높은 영화 선택
    recommended_items = movies.loc[recommendations.index]['title']
    return recommended_items

In [82]:
recommender(user=2, n_items=5, neighbor_size=30)

movie_id
851             Bloody Child, The (1996)
1512                       Sprung (1997)
1467                    Cure, The (1995)
1591              Magic Hour, The (1998)
1293    Ayn Rand: A Sense of Life (1997)
Name: title, dtype: object

---
### 학습데이터의 전체 사용자간의 유사도를 상관계수로 계산

In [26]:
# user_similarity = rating_matrix.T.corr(method='pearson')
# user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,1.608412e-01,0.112780,0.500000,0.420809,0.287159,0.258137,0.692086,-0.102062,-0.092344,...,0.061695,-2.602417e-01,0.383733,2.899974e-02,0.326744,5.343904e-01,0.263289,0.205616,-0.180784,0.067549
2,0.160841,1.000000e+00,0.067420,0.148522,0.327327,0.446269,0.643675,0.585491,0.242536,0.668145,...,0.021007,-2.711631e-01,0.214017,5.616449e-01,0.331587,-6.694897e-18,-0.011682,-0.062017,0.085960,0.479702
3,0.112780,6.741999e-02,1.000000,-0.262600,,-0.109109,0.064803,0.291937,,0.311086,...,,,-0.045162,-5.233642e-17,-0.137523,,-0.104678,1.000000,-0.011792,
4,0.500000,1.485221e-01,-0.262600,1.000000,1.000000,-0.581318,-0.266632,0.642938,,-0.301511,...,0.500000,,-0.203653,,0.375000,,0.850992,1.000000,0.412568,
5,0.420809,3.273268e-01,,1.000000,1.000000,0.241817,0.175630,0.537400,0.577350,0.087343,...,0.229532,-5.000000e-01,0.439286,6.085806e-01,0.484211,8.807048e-01,0.027038,0.468521,0.318163,0.346234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.534390,-6.694897e-18,,,0.880705,0.206315,0.142404,-0.333333,,0.316228,...,0.374351,-3.305898e-02,0.471172,-2.758386e-01,-0.073374,1.000000e+00,-0.534522,-0.131306,-0.500000,-0.187317
940,0.263289,-1.168173e-02,-0.104678,0.850992,0.027038,-0.024419,0.000931,0.320487,0.171499,0.158976,...,-0.125059,4.352858e-01,-0.338327,-1.486075e-01,0.110022,-5.345225e-01,1.000000,0.632746,-0.022813,0.332497
941,0.205616,-6.201737e-02,1.000000,1.000000,0.468521,0.399186,0.000000,0.166667,1.000000,0.420084,...,-0.500000,-2.355139e-16,0.273060,3.929526e-01,-0.214147,-1.313064e-01,0.632746,1.000000,-0.577350,-0.395285
942,-0.180784,8.596024e-02,-0.011792,0.412568,0.318163,0.092349,0.452075,0.201328,0.707107,0.408994,...,0.438252,-8.703883e-01,-0.216119,4.472136e-01,0.244989,-5.000000e-01,-0.022813,-0.577350,1.000000,0.277433


- 위 내용처럼 처음에 상관계수 행렬을 구했을때 NaN값이 존재해서 train 데이터에 대한 상관계수 유사도를 구했을 때 score가 NaN으로 나왔다.
- 이를 해결하기 위해 아래처럼 `.fillna(0)을 추가해줌으로써 NaN값을 0으로 채워주었다.

In [83]:
user_similarity = rating_matrix.T.corr(method='pearson').fillna(0)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,1.608412e-01,0.112780,0.500000,0.420809,0.287159,0.258137,0.692086,-0.102062,-0.092344,...,0.061695,-2.602417e-01,0.383733,2.899974e-02,0.326744,5.343904e-01,0.263289,0.205616,-0.180784,0.067549
2,0.160841,1.000000e+00,0.067420,0.148522,0.327327,0.446269,0.643675,0.585491,0.242536,0.668145,...,0.021007,-2.711631e-01,0.214017,5.616449e-01,0.331587,-6.694897e-18,-0.011682,-0.062017,0.085960,0.479702
3,0.112780,6.741999e-02,1.000000,-0.262600,0.000000,-0.109109,0.064803,0.291937,0.000000,0.311086,...,0.000000,0.000000e+00,-0.045162,-5.233642e-17,-0.137523,0.000000e+00,-0.104678,1.000000,-0.011792,0.000000
4,0.500000,1.485221e-01,-0.262600,1.000000,1.000000,-0.581318,-0.266632,0.642938,0.000000,-0.301511,...,0.500000,0.000000e+00,-0.203653,0.000000e+00,0.375000,0.000000e+00,0.850992,1.000000,0.412568,0.000000
5,0.420809,3.273268e-01,0.000000,1.000000,1.000000,0.241817,0.175630,0.537400,0.577350,0.087343,...,0.229532,-5.000000e-01,0.439286,6.085806e-01,0.484211,8.807048e-01,0.027038,0.468521,0.318163,0.346234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.534390,-6.694897e-18,0.000000,0.000000,0.880705,0.206315,0.142404,-0.333333,0.000000,0.316228,...,0.374351,-3.305898e-02,0.471172,-2.758386e-01,-0.073374,1.000000e+00,-0.534522,-0.131306,-0.500000,-0.187317
940,0.263289,-1.168173e-02,-0.104678,0.850992,0.027038,-0.024419,0.000931,0.320487,0.171499,0.158976,...,-0.125059,4.352858e-01,-0.338327,-1.486075e-01,0.110022,-5.345225e-01,1.000000,0.632746,-0.022813,0.332497
941,0.205616,-6.201737e-02,1.000000,1.000000,0.468521,0.399186,0.000000,0.166667,1.000000,0.420084,...,-0.500000,-2.355139e-16,0.273060,3.929526e-01,-0.214147,-1.313064e-01,0.632746,1.000000,-0.577350,-0.395285
942,-0.180784,8.596024e-02,-0.011792,0.412568,0.318163,0.092349,0.452075,0.201328,0.707107,0.408994,...,0.438252,-8.703883e-01,-0.216119,4.472136e-01,0.244989,-5.000000e-01,-0.022813,-0.577350,1.000000,0.277433


- 사용자별 평가점수평균을 구해야한다.
### 각 user별 평점 평균
- 데이터프레임 transpose 시킨 과정을 상세히 살펴볼 필요가 있다.

In [84]:
# train 데이터의 user의 rating 평균과 영화의 평점편차 계산 
rating_mean = rating_matrix.mean(axis=1)   # 사용자별 영화평점 평균, axis=1이면 column에 행의 평균
rating_bias = (rating_matrix.T - rating_mean).T   # 각 영화 사용자 평점 - 사용자별 영화평점평균

In [85]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,,,,,,,,,,
2,0.290323,,,,,,,,,-1.709677,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.125714,0.125714,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.734694,,...,,,,,,,,,,
940,,,,-1.457944,,,0.542056,1.542056,-0.457944,,...,,,,,,,,,,
941,0.954545,,,,,,-0.045455,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [86]:
rating_mean # series 데이터, pandas 데이터가 아니다.

user_id
1      3.610294
2      3.709677
3      2.796296
4      4.333333
5      2.874286
         ...   
939    4.265306
940    3.457944
941    4.045455
942    4.265823
943    3.410714
Length: 943, dtype: float64

In [87]:
# 위 함수로 score 정의 
print(score(CF_knn_bias))

96.55158048803142


In [88]:
print(score(CF_knn_bias, neighbor_size=30))

1.269812663763227


### 전체 데이터에서 추천(상관계수 유사도)

In [89]:
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')
user_similarity = rating_matrix.T.corr(method='pearson')

In [94]:
def recommender(user, n_items=10, neighbor_size=20):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    
    # 이미 사용자가 평가한 영화들의 인덱스를 찾기
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index
    # 사용자가 아직 평가하지 않은 영화들만 가져오기
    items = rating_matrix.loc[user].drop(rated_index)
    
    # 사용자가 평가하지 않은 영화들에 대해 예상 평점을 계산
    for item in items.index:
        predictions.append(CF_knn_bias(user, item, neighbor_size))  # CF_knn_bias 함수를 사용하여 예상 평점 계산
    
    # 예상 평점 결과를 시리즈 형태로 저장 (아이템 인덱스와 매핑)
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    
    # 예상 평점이 가장 높은 n_items 개의 영화 선택
    recommendations = recommendations.sort_values(ascending=False)[:n_items]
    # 추천 영화의 제목을 가져오기
    recommended_items = movies.loc[recommendations.index]['title']
    
    return recommended_items


In [95]:
recommender(user=2, n_items=5, neighbor_size=30)

movie_id
1402    Caro Diario (Dear Diary) (1994)
1321      Metisse (Café au Lait) (1993)
788         Swimming with Sharks (1995)
1288              Jack and Sarah (1995)
1389         Innocent Sleep, The (1995)
Name: title, dtype: object

In [96]:
# user 2가 평가한 영화 데이터
rating_matrix.loc[2]

movie_id
1       4.0
2       NaN
3       NaN
4       NaN
5       NaN
       ... 
1678    NaN
1679    NaN
1680    NaN
1681    NaN
1682    NaN
Name: 2, Length: 1682, dtype: float64

In [97]:
recommender(user=5, n_items=5, neighbor_size=30)

movie_id
1255    Designated Mourner, The (1997)
1275                Sunset Park (1996)
1503          Bewegte Mann, Der (1994)
1639            Eighth Day, The (1996)
1080                    Curdled (1996)
Name: title, dtype: object