# Contents-Based Recommender System

## 콘텐츠 기반 추천 시스템

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

In [2]:
movies_df = pd.read_csv("datasets/moviedataset/movies.csv")

In [3]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


## Item-Feature matrix 작성

- title 의 year 를 별도 column 으로 분리
- title 에서 white space 제거
- genres 를 별도 column 으로 분리 후 one-hot-encoding

In [4]:
# year column 추가
movies_df['year'] = movies_df.title.str.extract('(\d\d\d\d)', expand=False)
movies_df.head(2)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995


In [5]:
# title 에서 (year) 제거
movies_df['title'] = movies_df['title'].str.replace('(\(\d\d\d\d\))', '')

# title data cleansing
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

# genres 를 list element로 분리
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head(3)

  


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995


### 영화를 장르별로 one-hot-encoding

In [6]:
movies_genre_df = movies_df.copy()

for i, row in movies_df.iterrows():
    for genre in row['genres']:
        movies_genre_df.at[i, genre] = 1

movies_genre_df.fillna(0, inplace=True)
movies_genre_df.head(3)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Content-based (Item-item) recommendation systems for Single User

- 위에서 구한 item-feature matrix 를 user-item rating과 내적하여 유사 item 추천
- 사용자가 평가한 contents (user profile) 를 이용하여 user-item rating 작성 
- genre 가 같은 movie 를 유사 item 으로 간주

In [7]:
# user profile (user input) 작성 

userInput = [
    {'title': 'Breakfast Club, The', 'rating': 5},
    {'title': 'Toy Story', 'rating': 3.5},
    {'title': 'Jumanji', 'rating': 2},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 4.5}
]

# 특정 user 가 평가한 movie table
user_rated_movies = pd.DataFrame(userInput)
user_rated_movies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [8]:
user_rated_movies = pd.merge(user_rated_movies, movies_genre_df, on='title')
user_rated_movies

Unnamed: 0,title,rating,movieId,genres,year,Adventure,Animation,Children,Comedy,Fantasy,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,"Breakfast Club, The",5.0,1968,"[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Toy Story,3.5,1,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji,2.0,2,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pulp Fiction,5.0,296,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Akira,4.5,1274,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# weighted genre matrix (user profile) 작성에 불필요한 columns 제거
user_genre_table = user_rated_movies.drop(['title', 'movieId', 'genres', 'year'], axis=1)
user_genre_table

Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.5,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 각 장르에 대한 user 의 선호도 profile 작성

- user-item rating 과 item-feature matrix 를 내적하여 user의 feature 에 대한 선호도 작성

In [10]:
user_profile = user_genre_table.iloc[:, 1:].transpose() \
                .dot(user_genre_table.iloc[:, 0])
user_profile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

### movies_genre_df 와 user_profile 을 이용하여 조건에 맞는 top 20 moives 를 추천한다.

- 사용자가 부여한 영화에 대한 평점을 genre 에 대한 평점으로 전환하여 사용자의 장르별 선호도를 user profile 로 작성

In [11]:
# movieId 로 index reset
movies_genre_df.set_index(movies_genre_df['movieId'], inplace=True)

# 불필요한 columns 삭제
genre_table = movies_genre_df.drop(['title', 'movieId', 'genres', 'year'], axis=1)
print(genre_table.shape)
genre_table.head(3)

(34208, 20)


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### genre_table 을 weight(user_profile) 로 곱하고 가중평균을 구한다. 

In [12]:
final_recommend = (genre_table * user_profile).sum(axis=1) / user_profile.sum()
final_recommend.tail()

movieId
151697    0.069930
151701    0.000000
151703    0.139860
151709    0.202797
151711    0.000000
dtype: float64

## final_recommend 를 내림차순으로 정렬 하여 top 10 선정

In [13]:
movies_df.loc[movies_df['movieId']\
              .isin(final_recommend.sort_values(ascending=False)\
              .head(10).index)]

Unnamed: 0,movieId,title,genres,year
2902,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
4923,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991
6793,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002
8605,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
13250,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
16055,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
25218,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000
26442,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959
33509,148775,Wizards of Waverly Place: The Movie,"[Adventure, Children, Comedy, Drama, Fantasy, ...",2009


# Tensorflow 를 이용한 Content-based Filtering 구현 for Many Users

- 사용자, 영화 및 feature 목록은 위에서 가져옵니다.

In [14]:
users = ['Ryan', 'Danielle',  'Vijay', 'Chris']

movies = [
    'Jumanji', 'The Dark Knight', 'Shrek',
    'Grand Slam', 'Toy Story', 'Memento'
]

# Each row represents a user's rating for the different movies.
users_movies = [[4,  6,  8,  0, 0, 0],
                [0,  0, 10,  0, 8, 3],
                [0,  6,  0,  0, 3, 7],
                [10, 9,  0,  5, 0, 2]]

In [15]:
pd.DataFrame(data=users_movies, columns=movies, index=users)

Unnamed: 0,Jumanji,The Dark Knight,Shrek,Grand Slam,Toy Story,Memento
Ryan,4,6,8,0,0,0
Danielle,0,0,10,0,8,3
Vijay,0,6,0,0,3,7
Chris,10,9,0,5,0,2


- item-feature matrix 작성

In [44]:
features = movies_genre_df[movies_genre_df['title']\
                .isin(movies)].iloc[:, 4:].columns
features

Index(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance',
       'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi',
       'IMAX', 'Documentary', 'War', 'Musical', 'Western', 'Film-Noir',
       '(no genres listed)'],
      dtype='object')

In [17]:
movie_feats = movies_genre_df[movies_genre_df['title']\
                    .isin(movies)].iloc[:, 4:].values
print(movie_feats)

[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [19]:
users_movies    = tf.constant(users_movies, dtype=tf.float32)
movie_feats = tf.constant(movie_feats, dtype=tf.float32)
print(users_movies)
print(movie_feats)

tf.Tensor(
[[ 4.  6.  8.  0.  0.  0.]
 [ 0.  0. 10.  0.  8.  3.]
 [ 0.  6.  0.  0.  3.  7.]
 [10.  9.  0.  5.  0.  2.]], shape=(4, 6), dtype=float32)
tf.Tensor(
[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(6, 20), dtype=float32)


### Computing the user feature matrix

사용자-특성 매트릭스를 계산합니다. 즉, 5 차원 특성 공간에 각 사용자의 임베딩을 포함하는 행렬입니다.  

이것을 `users_movies` 텐서와 `movies_feats` 텐서의 행렬 곱셈으로 계산합니다.

In [24]:
users_feats = tf.matmul(users_movies, movie_feats)

print(users_feats)

tf.Tensor(
[[10.  4. 10.  4. 10.  0.  0.  0.  0.  8.  0.  8.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  8.  8.  8. 21.  0. 10.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 6.  0.  6.  0.  6.  0.  3.  3.  3. 10.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [24. 15. 24. 15. 24.  5.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]], shape=(4, 20), dtype=float32)


다음으로 각 사용자 특성 벡터를 정규화하여 1 이 되도록 정규화합니다. 

정규화는 꼭 필요한 것은 아니지만 사용자간에 등급 규모를 비교할 수 있도록 합니다.

In [26]:
users_feats = users_feats / \
            tf.reduce_sum(users_feats, axis=1, keepdims=True)
users_feats

<tf.Tensor: shape=(4, 20), dtype=float32, numpy=
array([[0.18518518, 0.07407407, 0.18518518, 0.07407407, 0.18518518,
        0.        , 0.        , 0.        , 0.        , 0.14814815,
        0.        , 0.14814815, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.14545454, 0.14545454, 0.14545454, 0.38181818,
        0.        , 0.18181819, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.16216217, 0.        , 0.16216217, 0.        , 0.16216217,
        0.        , 0.08108108, 0.08108108, 0.08108108, 0.2702703 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.22018349, 0.13761468, 0.22018349, 0.13761468, 0.22018349,
        0.04587156, 0.        , 0.        , 0.        , 0.018348

#### Ranking feature relevance for each user

위에서 계산된 users_feats 를 사용하여 각 사용자에 대한 각 영화 카테고리의 상대적 중요성을 나타낼 수 있습니다.

In [49]:
# top_users_features = tf.nn.top_k(users_feats, movie_feats.shape[1])[1]
top_users_features = tf.nn.top_k(users_feats, 5)[1]
top_users_features

<tf.Tensor: shape=(4, 5), dtype=int32, numpy=
array([[ 0,  2,  4,  9, 11],
       [ 9, 11,  6,  7,  8],
       [ 9,  0,  2,  4,  6],
       [ 0,  2,  4,  1,  3]], dtype=int32)>

In [50]:
for i in range(len(users)):  
    feature_names = [features[int(idx)] for idx in top_users_features[i]]
    print(f"{users[i]} : {feature_names}")

Ryan : ['Adventure', 'Children', 'Fantasy', 'Thriller', 'Mystery']
Danielle : ['Thriller', 'Mystery', 'Drama', 'Action', 'Crime']
Vijay : ['Thriller', 'Adventure', 'Children', 'Fantasy', 'Drama']
Chris : ['Adventure', 'Children', 'Fantasy', 'Animation', 'Comedy']


### Determining movie recommendations. 

이제 위에서 계산 한 `users_feats` 텐서를 사용하여 각 사용자에 대한 영화 등급 및 추천을 결정합니다.

각 영화의 예상 등급을 계산하기 위해 user feature vector와 해당 movie feature vector 간의 유사성 측정값을 계산합니다.

내적을 유사성 척도로 사용할 것입니다. 본질적으로 이것은 각 사용자에 대한 가중 영화 평균입니다.

이것을 행렬 곱셈으로 구현합니다. 피연산자 중 하나를 전치해야합니다.

In [52]:
users_ratings = tf.matmul(users_feats, tf.transpose(movie_feats))
users_ratings

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[0.70370364, 0.5555555 , 0.2962963 , 0.70370364, 0.14814815,
        0.14814815],
       [0.        , 0.        , 0.56363636, 0.        , 0.81818175,
        0.38181818],
       [0.4864865 , 0.4864865 , 0.2702703 , 0.4864865 , 0.51351357,
        0.2702703 ],
       [0.9357798 , 0.6605505 , 0.01834862, 0.9816514 , 0.01834862,
        0.01834862]], dtype=float32)>

위의 계산은 데이터베이스의 각 사용자와 각 영화 간의 유사성 측정값을 찾습니다. 새 영화의 등급에만 집중하기 위해 all_users_ratings 매트릭스에 마스크를 적용합니다. 사용자가 이미 영화를 평가한 경우 해당 평가는 무시됩니다. 이렇게 하면 이전에 본 적이 없거나 등급이 지정되지 않은 영화에 대한 등급에만 집중합니다.

In [53]:
users_unseen_movies = tf.equal(users_movies, tf.zeros_like(users_movies))
users_unseen_movies

<tf.Tensor: shape=(4, 6), dtype=bool, numpy=
array([[False, False, False,  True,  True,  True],
       [ True,  True, False,  True, False, False],
       [ True, False,  True,  True, False, False],
       [False, False,  True, False,  True, False]])>

In [57]:
ignore_matrix = tf.zeros_like(tf.cast(users_movies, tf.float32))

users_ratings_new = tf.where(
    users_unseen_movies,
    users_ratings,
    ignore_matrix)

users_ratings_new

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[0.        , 0.        , 0.        , 0.70370364, 0.14814815,
        0.14814815],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.4864865 , 0.        , 0.2702703 , 0.4864865 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.01834862, 0.        , 0.01834862,
        0.        ]], dtype=float32)>

마지막으로 각 사용자에 대해 상위 2 개 등급의 영화를 가져와 인쇄해 보겠습니다.

In [61]:
top_movies = tf.nn.top_k(users_ratings_new, 2)[1]
top_movies

<tf.Tensor: shape=(4, 2), dtype=int32, numpy=
array([[3, 4],
       [0, 1],
       [0, 3],
       [2, 4]], dtype=int32)>

In [62]:
for i in range(len(users)):
    movie_names = [movies[idx] for idx in top_movies[i]]
    print(f"{users[i]} : {movie_names}")

Ryan : ['Grand Slam', 'Toy Story']
Danielle : ['Jumanji', 'The Dark Knight']
Vijay : ['Jumanji', 'Grand Slam']
Chris : ['Shrek', 'Toy Story']
