# 데이터 셋 다운로드
- 사용자-영화 평점 행렬 데이터가 필요
- GroupLens 사이트에서 만든 MovieLens 데이터 셋 사용 (축소 버전 사용)
- https://grouplens.org/datasets/movielens/latest

# 데이터 로딩

In [1]:
import numpy as np
import pandas as pd

# Load the movie metadata and the per-user ratings from the local dataset folder.
movies = pd.read_csv('./dataset/movies.csv')
ratings = pd.read_csv('./dataset/ratings.csv')

# Print (rows, columns) of each table as a quick sanity check.
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [2]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()
# Uncomment to count the distinct users instead:
# ratings['userId'].unique().size

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# 데이터 전처리
- 평점 데이터가 행 단위 데이터로 되어있어서 사용자-아이템(영화) 평점 행렬로 변환
- 테이블의 구조 바꾸기 (pivot table)

In [2]:
# Drop the column that is not needed for the rating matrix.
ratings = ratings.drop('timestamp', axis=1)

# Reshape the long table so that users are rows and movieIds are columns;
# user/movie pairs without a rating show up as NaN.
ratings_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
print(ratings_matrix.shape)
display(ratings_matrix.head())

(610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


- movieId를 영화 제목으로 변환하기 위해 ratings 데이터와 movies 데이터를 병합
- 사용자-아이템 평점 행렬로 변환
- NaN값은 0으로 변경

In [3]:
# Attach title and genres to every rating row by joining on movieId
# (pd.merge defaults to an inner join).
rating_movies = pd.merge(ratings, movies, on='movieId')
print(rating_movies.shape)
display(rating_movies.head())

(100836, 5)


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
# Reshape the merged table so that users are rows and movie titles are columns.
# NOTE: pivot_table aggregates duplicates with its default aggfunc (mean), so
# distinct movieIds that share one title end up in a single column -- presumably
# why this matrix has fewer columns than the movieId-based one.
ratings_matrix = rating_movies.pivot_table(index='userId', columns='title', values='rating')
print(ratings_matrix.shape)
display(ratings_matrix.head())

(610, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Replace NaN (no rating given) with 0 so the matrix is fully numeric.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(2)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 영화와 영화 간 평점 유사도 계산
- cosine_similarity()를 이용하여 영화와 영화 간 유사도를 산출하기 위해서는 ratings_matrix 데이터를 **영화를 행 기준으로 만들기 위해** 전치 필요
- 전치 없이 유사도 행렬을 만들면 사용자와 사용자 간의 유사도 행렬이 만들어짐 

In [7]:
# Transpose so that each row is a movie and each column is a user;
# row-wise similarity on this matrix is then movie-to-movie similarity.
ratings_matrix_T = ratings_matrix.T
ratings_matrix_T.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Compute the cosine similarity between every pair of movies.
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
# Label both axes with the movie titles (the columns of ratings_matrix are
# the rows of ratings_matrix_T).
item_sim_df = pd.DataFrame(item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 개인화된 영화 추천을 위한 예측 평점 계산

- ratings_arr: 사용자-영화 평점 행렬, shape(610, 9719)
- item_sim_arr: 영화 간 평점 유사도 행렬, shape(9719, 9719)

- ![image.png](attachment:1897f56b-6cd0-41b4-955a-c35b36ce2abe.png)

In [18]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict every user's rating for every item from its top-n similar items.

    For each item ``i`` the ``n`` items with the highest similarity to ``i``
    are selected (``N``), and the prediction for user ``u`` is the
    similarity-weighted average of that user's ratings on those items::

        pred[u, i] = sum(S[i, N] * R[u, N]) / sum(|S[i, N]|)

    Args:
        ratings_arr: 2-D array-like of ratings, one row per user and one
            column per item.
        item_sim_arr: 2-D array-like of item-item similarities, indexed
            ``[item, item]``.
        n: Number of most-similar items used for each prediction.

    Returns:
        A float64 ``numpy.ndarray`` with the same shape as ``ratings_arr``.
        Columns whose selected similarities sum to zero keep a prediction
        of 0 instead of becoming NaN.
    """
    ratings_arr = np.asarray(ratings_arr)
    item_sim_arr = np.asarray(item_sim_arr)

    # Allocate the result as float explicitly: np.zeros_like() would inherit
    # an integer dtype from the ratings and silently truncate predictions.
    pred = np.zeros(ratings_arr.shape, dtype=np.float64)

    # One pass per item (column).
    for col in range(ratings_arr.shape[1]):
        # Indices of the n items most similar to item `col`
        # (ascending argsort, reversed, then truncated).
        top_n_items = np.argsort(item_sim_arr[:, col])[::-1][:n]

        # Similarity weights S_i,N of those neighbours.
        sim_top = item_sim_arr[col, top_n_items]
        weight_sum = np.sum(np.abs(sim_top))
        if weight_sum == 0:
            # No usable neighbours: leave the prediction at 0 rather than
            # dividing by zero and filling the column with NaN.
            continue

        # R_u,N for all users at once: (n_users, n) @ (n,) -> (n_users,).
        # This replaces the per-user Python loop with one matrix-vector product.
        pred[:, col] = ratings_arr[:, top_n_items].dot(sim_top) / weight_sum
    return pred

In [15]:
#### Step-by-step check of what the function computes ####
ratings_arr = ratings_matrix.values
item_sim_arr = item_sim_df.values

# Indices of the 20 movies most similar to the movie at position 0
top_n_items = np.argsort(item_sim_arr[:, 0])[::-1][:20]
print(top_n_items)
print()

# Similarity values of those movies, looked up by index (S_i,n)
print(item_sim_arr[0, :][top_n_items])
print()

# Actual ratings the user at position 0 gave to those 20 movies (R_u,n)
print(ratings_arr[0, :][top_n_items])
print()

# Numerator of the prediction: dot product of the two vectors above
print(item_sim_arr[0, :][top_n_items].dot(ratings_arr[0, :][top_n_items].T))

[   0  179 7085 6471 2253 5591 7674 7095 2247 3584 4925 3565 7537 8267
 7676 5111  183 8251 3990  199]

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

0.0


In [None]:
# Compute the predicted rating for every user/movie pair.
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)

In [20]:
# Wrap the predicted ratings in a DataFrame that reuses the user index and
# title columns of ratings_matrix.
ratings_pred_matrix = pd.DataFrame(ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
print(ratings_pred_matrix.shape)

(610, 9719)


In [22]:
# Predicted ratings of the first user (userId 1) for all movies,
# followed by the largest and smallest predicted value.
user1 = ratings_pred_matrix.loc[1]
print(np.max(user1), np.min(user1))

4.595776354211019 0.0


In [23]:
# Number of movies whose predicted rating for the first user is not 0.
user1[user1 != 0].size

1133

# 평점을 주지 않은 영화 목록 반환 함수
- 영화 추천은 개인이 아직 관람하지 않은 영화를 추천하는 방식

In [24]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels the given user has a rating of 0 for.

    Args:
        ratings_matrix: DataFrame indexed by user id; the callers in this
            notebook pass the title-column matrix with unrated cells set to 0.
        userId: Index label of the user row to inspect.

    Returns:
        The labels (``Index.values``) of the columns whose value in the
        user's row equals 0.
    """
    # Row of the requested user: index = column labels, values = ratings.
    ratings_of_user = ratings_matrix.loc[userId]
    # True where the stored rating is exactly 0.
    not_rated = ratings_of_user.eq(0)
    return ratings_of_user[not_rated].index.values

# 특정 사용자의 관람하지 않은 영화에 대한 예측 평점 기반 추천

In [25]:
# Predicted ratings of userId 9 restricted to the movies that user has not rated.
unseen_list = get_unseen_movies(ratings_matrix, 9)
ratings_pred_matrix.loc[9, unseen_list]

title
'71 (2014)                                                0.0
'Hellboy': The Seeds of Creation (2004)                   0.0
'Round Midnight (1986)                                    0.0
'Salem's Lot (2004)                                       0.0
'Til There Was You (1997)                                 0.0
                                                         ... 
anohana: The Flower We Saw That Day - The Movie (2013)    0.0
eXistenZ (1999)                                           0.0
xXx: State of the Union (2005)                            0.0
¡Three Amigos! (1986)                                     0.0
À nous la liberté (Freedom for Us) (1931)                 0.0
Name: 9, Length: 9673, dtype: float64

In [30]:
# Return the highest predicted ratings among a user's unseen movies.
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Pick the best-scoring unseen movies for one user.

    Args:
        pred_df: DataFrame of predicted ratings (rows: users, columns: movies).
        userId: Index label of the user row in ``pred_df``.
        unseen_list: Column labels to choose from.
        top_n: Maximum number of entries to return.

    Returns:
        A Series (not a DataFrame) of predicted ratings, sorted in
        descending order and cut to the first ``top_n`` entries.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]

# Recommend ten movies to userId 9 out of the ones that user has not rated.
unseen_list = get_unseen_movies(ratings_matrix, 9)
# Turn the returned Series into a one-column table for display.
recomm_movies = recomm_movie_by_userid(
    ratings_pred_matrix, 9, unseen_list, top_n=10
).to_frame(name='predicted score')
recomm_movies

Unnamed: 0_level_0,predicted score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711


In [31]:
### Comparison ###
# Of the movies userId 9 actually rated, show the ten with the highest rating.
user_rating = ratings_matrix.loc[9]
user_rating[user_rating > 0].sort_values(ascending=False)[:10]

title
Adaptation (2002)                                                                 5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Producers, The (1968)                                                             5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Back to the Future (1985)                                                         5.0
Austin Powers in Goldmember (2002)                                                5.0
Minority Report (2002)                                                            4.0
Witness (1985)                                                                    4.0
Name: 9, dtype: float64