# Python을 이용한 개인화 추천시스템

## 3.1 협업 필터링의 원리
어떤 아이템에 대해 비슷한 취향을 가진 사람들은
다른 아이템에 대해서도 비슷한 취향을 가질 것이다.

## 3.2 유사도 지표
CF에서 사용자간 유사도를 구하는 것이 핵심

상관계수
1. 가장 이해하기 쉬운 유사도
2. -1 ~ 1 사이 값

코사인 유사도
1. 협업 필터링에서 가장 널리 쓰이는 유사도
2. 각 아이템 -> 하나의 차원, 사용자의 평가값 -> 좌표값
3. 두 사용자의 평가값 유사 -> theta는 작아지고, 코사인 값은 커짐.
4. -1 ~ 1사이의 값
5. 데이터 이진값(binary) -> 타니모토 계수 사용 권장

자카드 계수
1. 타니모토 계수의 변형 -> 자카드 계수
2. 이진수 데이터 -> 좋은 결과

## 3.3 기본 CF 알고리즘
모든 사용자 간 평가의 유사도 계산
-> 추천 대상과 다른 사용자간 유사도 추출
-> 추천 대상이 평가하지 않은 아이템에 대한 예상 평가값 계산 (예상 평가값 = Σ(다른 사용자 유사도 × 다른 사용자 평가) / Σ(다른 사용자 유사도), 즉 유사도를 가중치로 한 가중평균)
-> 아이템 중에서 예상 평가값 가장 높은 N개 추천

In [42]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Users table: pipe-separated file read with explicit column names.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user',
                    sep='|',
                    names=u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

# Movies table: five descriptive columns followed by the genre columns.
# The genre is spelled "Children's"; the previous literal 'Children\s' was an
# invalid escape sequence and produced a column name containing a backslash.
i_cols = ['movie_id', 'title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item',
                     sep='|',
                     names=i_cols,
                     encoding='latin-1')
movies = movies.set_index('movie_id')

# Ratings table: tab-separated (user_id, movie_id, rating, timestamp) rows.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data',
                      sep='\t',
                      names=r_cols,
                      encoding='latin-1')

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

def score(model):
    """Return the RMSE of ``model`` over the rows of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id)`` once per test row and
    its outputs are compared with the ``rating`` column of ``x_test``.
    """
    predicted = np.array([
        model(uid, mid)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    observed = np.array(x_test['rating'])
    return RMSE(observed, predicted)

from sklearn.metrics.pairwise import cosine_similarity

# Hold out 25% of the ratings as a test set, stratified on user_id.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y)

# user_id x movie_id table of training ratings.
ratings_matrix = x_train.pivot(
    index='user_id', columns='movie_id', values='rating')

### Cosine similarity between users (missing ratings filled with 0) ###
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

#### Similarity-weighted mean rating for a given movie (movie_id) ####
def CF_simple(user_id, movie_id):
    """Predict ``user_id``'s rating of ``movie_id`` from other users' ratings.

    The prediction is the mean of the ratings in ``ratings_matrix[movie_id]``,
    weighted by each rater's similarity to ``user_id`` taken from
    ``user_similarity``.

    Returns 3.0 when ``movie_id`` is not a column of ``ratings_matrix`` or
    when the raters' similarity weights sum to zero (the weighted mean would
    otherwise be 0/0 = NaN).
    """
    if movie_id not in ratings_matrix.columns:
        return 3.0

    movie_ratings = ratings_matrix[movie_id]
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.dropna()
    # Keep only the similarities of users who actually rated this movie.
    sim_scores = user_similarity[user_id].drop(none_rating_idx)

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # No rater has any similarity to this user; avoid dividing by zero.
        return 3.0
    return np.dot(sim_scores, movie_ratings) / weight_sum

# Compute the accuracy (RMSE) of the simple CF model
score(model=CF_simple)

1.0207680686123242

In [36]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.153588,0.034343,0.025147,0.291733,0.334985,0.357728,0.206699,0.067364,0.296527,...,0.304296,0.089391,0.203952,0.183417,0.165031,0.121728,0.235664,0.106015,0.090946,0.266403
2,0.153588,1.000000,0.095934,0.102442,0.058499,0.197393,0.079167,0.074630,0.155207,0.125954,...,0.140095,0.261094,0.234133,0.257326,0.198946,0.221628,0.165253,0.042738,0.112710,0.058466
3,0.034343,0.095934,1.000000,0.319918,0.030029,0.067080,0.048767,0.065922,0.092143,0.015086,...,0.029812,0.020104,0.154607,0.041830,0.078619,0.036477,0.141440,0.092143,0.042448,0.037421
4,0.025147,0.102442,0.319918,1.000000,0.000000,0.034741,0.066846,0.073101,0.000000,0.038239,...,0.024488,0.049541,0.109042,0.054975,0.064304,0.000000,0.097267,0.012975,0.122304,0.000000
5,0.291733,0.058499,0.030029,0.000000,1.000000,0.142454,0.282504,0.172428,0.040556,0.139998,...,0.252593,0.036132,0.065703,0.017184,0.090191,0.045787,0.173129,0.116600,0.070422,0.208631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.121728,0.221628,0.036477,0.000000,0.045787,0.085102,0.107733,0.092803,0.000000,0.041823,...,0.055098,0.320470,0.224466,0.224194,0.410398,1.000000,0.062692,0.079826,0.039603,0.118562
940,0.235664,0.165253,0.141440,0.097267,0.173129,0.269230,0.219868,0.169173,0.153633,0.263056,...,0.238951,0.111667,0.159680,0.106660,0.148657,0.062692,1.000000,0.120315,0.154274,0.173650
941,0.106015,0.042738,0.092143,0.012975,0.116600,0.093410,0.017152,0.060281,0.118519,0.043661,...,0.027960,0.098991,0.253507,0.081601,0.152491,0.079826,0.120315,1.000000,0.036749,0.108299
942,0.090946,0.112710,0.042448,0.122304,0.070422,0.220282,0.186775,0.058662,0.058798,0.135379,...,0.131779,0.078577,0.066233,0.080967,0.059775,0.039603,0.154274,0.036749,1.000000,0.109696


## 3.4 이웃을 고려한 CF
단순 CF 알고리즘 개선 방법
1. K Nearest Neighbors (KNN 방법)
2. Thresholding 방법

In [58]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Users table: pipe-separated file read with explicit column names.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user',
                    sep='|',
                    names=u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

# Movies table: five descriptive columns followed by the genre columns.
# The genre is spelled "Children's"; the previous literal 'Children\s' was an
# invalid escape sequence and produced a column name containing a backslash.
i_cols = ['movie_id', 'title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item',
                     sep='|',
                     names=i_cols,
                     encoding='latin-1')
movies = movies.set_index('movie_id')

# Ratings table: tab-separated (user_id, movie_id, rating, timestamp) rows.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data',
                      sep='\t',
                      names=r_cols,
                      encoding='latin-1')

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

def score(model, neighbor_size=0):
    """Return the RMSE of ``model`` over the rows of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id, neighbor_size)`` once
    per test row and its outputs are compared with the ``rating`` column of
    ``x_test``.
    """
    predicted = np.array([
        model(uid, mid, neighbor_size)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    observed = np.array(x_test['rating'])
    return RMSE(observed, predicted)

from sklearn.metrics.pairwise import cosine_similarity

# Hold out 25% of the ratings as a test set, stratified on user_id.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y)

# user_id x movie_id table of training ratings.
ratings_matrix = x_train.pivot(
    index='user_id', columns='movie_id', values='rating')

### Cosine similarity between users (missing ratings filled with 0) ###
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

### Predict a rating using a fixed neighbor size ####
def CF_knn(user_id, movie_id, neighbor_size=0):
    """Predict ``user_id``'s rating of ``movie_id`` from the nearest raters.

    The prediction is the similarity-weighted mean of the ratings in
    ``ratings_matrix[movie_id]``, with weights from ``user_similarity``.

    Args:
        user_id: label of the target user in ``user_similarity``.
        movie_id: label of the movie in ``ratings_matrix``.
        neighbor_size: 0 uses every user who rated the movie; otherwise only
            the ``neighbor_size`` most similar raters are used.

    Returns:
        The weighted mean rating, or 3.0 when the movie is not a column of
        ``ratings_matrix``, when a non-zero ``neighbor_size`` is requested
        but at most one user rated the movie, or when the selected
        similarity weights sum to zero (which would otherwise give NaN).
    """
    if movie_id not in ratings_matrix.columns:
        return 3.0

    movie_ratings = ratings_matrix[movie_id]
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.dropna()
    # Keep only the similarities of users who actually rated this movie.
    sim_scores = user_similarity[user_id].drop(none_rating_idx)

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return 3.0
        k = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # argsort is ascending, so the last k positions are the most similar.
        nearest = np.argsort(sim_scores)[-k:]
        sim_scores = sim_scores[nearest]
        movie_ratings = movie_ratings[nearest]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # No selected rater has any similarity to this user; avoid 0/0.
        return 3.0
    return np.dot(sim_scores, movie_ratings) / weight_sum

# Compute the accuracy (RMSE) of the KNN model with 30 neighbors
score(model=CF_knn, neighbor_size=30)

1.0136360675361538

In [76]:
# Implement recommendations for an actual user: build the rating matrix and
# the user-user cosine similarity from the complete `ratings` table.
rating_matrix = ratings.pivot(
    index='user_id', columns='movie_id', values='rating')
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

def recom_movie(user_id, n_items, neighbor_size=30):
    """Recommend up to ``n_items`` movies that ``user_id`` has not rated.

    Every movie without a rating in ``rating_matrix.loc[user_id]`` is scored
    with ``CF_knn(user_id, movie, neighbor_size)``; the highest-scoring ones
    are returned.

    Args:
        user_id: row label of the target user in ``rating_matrix``.
        n_items: maximum number of titles to return.
        neighbor_size: forwarded to ``CF_knn``.

    Returns:
        The ``title`` column of ``movies`` for the selected movie ids,
        ordered from highest to lowest predicted rating. Movies the user
        already rated are never included, so fewer than ``n_items`` titles
        are returned when the user has fewer unrated movies than that.
    """
    user_ratings = rating_matrix.loc[user_id]
    unseen = user_ratings[user_ratings.isnull()].index

    predicted = pd.Series(
        [CF_knn(user_id, movie, neighbor_size) for movie in unseen],
        index=unseen,
        dtype=float,
    )

    # The index holds integer movie ids, so slice by position explicitly
    # with .iloc rather than the ambiguous plain [:n_items].
    top = predicted.sort_values(ascending=False).iloc[:n_items]
    return movies.loc[top.index, 'title']

# Top-5 recommendations for user 729 using 30 neighbors
recom_movie(729, n_items=5, neighbor_size=30)

  movie_sort = user_movie.sort_values(ascending = False)[:n_items]


movie_id
1467    Saint of Fort Washington, The (1993)
1367                            Faust (1994)
1191         Letter From Death Row, A (1998)
1500               Santa with Muscles (1996)
22                         Braveheart (1995)
Name: title, dtype: object

## 3.5 최적의 이웃 크기 결정

In [77]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Users table: pipe-separated file read with explicit column names.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user',
                    sep='|',
                    names=u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

# Movies table: five descriptive columns followed by the genre columns.
# The genre is spelled "Children's"; the previous literal 'Children\s' was an
# invalid escape sequence and produced a column name containing a backslash.
i_cols = ['movie_id', 'title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item',
                     sep='|',
                     names=i_cols,
                     encoding='latin-1')
movies = movies.set_index('movie_id')

# Ratings table: tab-separated (user_id, movie_id, rating, timestamp) rows.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data',
                      sep='\t',
                      names=r_cols,
                      encoding='latin-1')

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

def score(model, neighbor_size=0):
    """Return the RMSE of ``model`` over the rows of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id, neighbor_size)`` once
    per test row and its outputs are compared with the ``rating`` column of
    ``x_test``.
    """
    predicted = np.array([
        model(uid, mid, neighbor_size)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    observed = np.array(x_test['rating'])
    return RMSE(observed, predicted)

from sklearn.metrics.pairwise import cosine_similarity

# Hold out 25% of the ratings as a test set, stratified on user_id.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y)

# user_id x movie_id table of training ratings.
ratings_matrix = x_train.pivot(
    index='user_id', columns='movie_id', values='rating')

### Cosine similarity between users (missing ratings filled with 0) ###
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

### Predict a rating using a fixed neighbor size ####
def CF_knn(user_id, movie_id, neighbor_size=0):
    """Predict ``user_id``'s rating of ``movie_id`` from the nearest raters.

    The prediction is the similarity-weighted mean of the ratings in
    ``ratings_matrix[movie_id]``, with weights from ``user_similarity``.

    Args:
        user_id: label of the target user in ``user_similarity``.
        movie_id: label of the movie in ``ratings_matrix``.
        neighbor_size: 0 uses every user who rated the movie; otherwise only
            the ``neighbor_size`` most similar raters are used.

    Returns:
        The weighted mean rating, or 3.0 when the movie is not a column of
        ``ratings_matrix``, when a non-zero ``neighbor_size`` is requested
        but at most one user rated the movie, or when the selected
        similarity weights sum to zero (which would otherwise give NaN).
    """
    if movie_id not in ratings_matrix.columns:
        return 3.0

    movie_ratings = ratings_matrix[movie_id]
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.dropna()
    # Keep only the similarities of users who actually rated this movie.
    sim_scores = user_similarity[user_id].drop(none_rating_idx)

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return 3.0
        k = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # argsort is ascending, so the last k positions are the most similar.
        nearest = np.argsort(sim_scores)[-k:]
        sim_scores = sim_scores[nearest]
        movie_ratings = movie_ratings[nearest]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # No selected rater has any similarity to this user; avoid 0/0.
        return 3.0
    return np.dot(sim_scores, movie_ratings) / weight_sum

# Report the test RMSE of CF_knn for each candidate neighbor size
for neighbor_size in (10, 20, 30, 40, 50, 60):
    rmse = score(CF_knn, neighbor_size)
    print('Neighbor size = %d : RMSE = %.4f' % (neighbor_size, rmse))

Neighbor size = 10 : RMSE = 1.0277
Neighbor size = 20 : RMSE = 1.0132
Neighbor size = 30 : RMSE = 1.0098
Neighbor size = 40 : RMSE = 1.0093
Neighbor size = 50 : RMSE = 1.0097
Neighbor size = 60 : RMSE = 1.0104
