# 7-9. 프로젝트: Movielens 영화 추천 실습

movielens 1M dataset을 활용해 추천 시스템을 만들어봤다. 추천받을 사용자는 연습과 같은 이름 대신 movie_id 형태의 임의의 숫자 (9999) 로, 좋아하는 영화 5편은 movie_id로 입력했다. CSR Matrix를 만들기 위해 user_id와 movie_id에 대해 별도의 인덱싱을 진행했다. CSR Matrix를 만들고, AlternatingLeastSquares 모델을 훈련시켜 내가 좋아하는 영화 (토이 스토리 2) 와 비슷한 영화 15편과, 내가 가장 좋아할 만한 영화 20편을 추천받았다. 내가 가장 좋아할 만한 영화 20편을 추천받기 위해, Ratings에 9999 명의로 추가한 영화들은 다음과 같다.

* 1682: Truman Show, The (1998) (count: 5)
* 1721: Titanic (1997) (count: 5)
* 1707: Home Alone 3 (1997) (count: 5)
* 2409: Rocky II (1979) (count: 5)
* 3114: Toy Story 2 (1999) (count: 5)

movies.dat 데이터는 마지막에 이름을 출력할 때만 활용했다. 이를 위해 movies.dat 를 불러올 때 인덱스로 영화 제목을 찾을 수 있는 딕셔너리를 별도로 생성해두었다.

## 필요한 모듈 import

In [1]:
import os

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

## 데이터 준비와 전처리

In [2]:
# rating 데이터 로드
# rating_file_path = os.getenv('HOME') + '/Documents/aiffel_local/20200818/ml-1m/ratings.dat'
rating_file_path = 'ratings.dat'

ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')

orginal_data_size = len(ratings)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남기기
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# 컬럼 이름 변경
ratings.rename(columns={'rating':'count1'}, inplace=True)

In [5]:
# 영화 제목 데이터 로드
# movie_file_path = os.getenv('HOME') + '/Documents/aiffel_local/20200818/ml-1m/movies.dat'
movie_file_path = 'movies.dat'

cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')

# 영화 이름 출력을 위한 {idx: title, ...} 딕셔너리 임의 생성
idx_to_movie = {}

for idx, movie in enumerate(movies['movie_id']):
    idx_to_movie[movie] = movies['title'][idx]
    
# print(idx_to_movie)
    
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 분석

### ratings에 있는 유니크한 영화 수

In [6]:
ratings['movie_id'].nunique()

3628

### ratings에 있는 유니크한 사용자 수

In [7]:
ratings['user_id'].nunique()

6039

### 가장 인기 있는 영화 30개 (인기순)

In [8]:
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

## 내가 선호하는 영화 5가지 rating에 추가

In [9]:
# Truman Show, The (1998) / Titanic (1997) / Home Alone 3 (1997) / Rocky II (1979) / Toy Story 2 (1999)
my_favorite = [1682, 1721, 1707, 2409, 3114]

my_playlist = pd.DataFrame({'user_id': [9999]*5, 'movie_id': my_favorite, 'count1': [5]*5}) # 위에서 rating -> count1 로 변경했으므로

if not ratings.isin({'user_id':[9999]})['user_id'].any():
    ratings = ratings.append(my_playlist)

ratings.tail(10) # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,count1,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,9999,1682,5,
1,9999,1721,5,
2,9999,1707,5,
3,9999,2409,5,
4,9999,3114,5,


In [10]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [11]:
print(user_to_idx[9999])
print(movie_to_idx[1682]) # Truman Show, The

6039
385


In [12]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# movie_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')
    
ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,count1,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6039,385,5,
1,6039,27,5,
2,6039,1994,5,
3,6039,1123,5,


## CSR Matrix 만들기

In [13]:
num_user = ratings['user_id'].nunique()
num_movies = ratings['movie_id'].nunique()

print(num_user, num_movies)

csr_data = csr_matrix((ratings.count1, (ratings.user_id, ratings.movie_id)), shape=(num_user, num_movies))

6040 3628


In [14]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [15]:
als_model.fit(csr_data_transpose)

me = user_to_idx[9999]
truman_show = movie_to_idx[1682] # Truman Show, The (1998)

me_vector = als_model.user_factors[me]
truman_show_vector = als_model.item_factors[truman_show]

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [16]:
me_vector

array([-0.5023395 ,  0.47425917, -0.44555765, -0.23234725,  0.71560633,
        0.11919172,  0.24120735,  0.1134606 , -0.28699768,  0.23794885,
        0.15122478,  0.67456746,  0.45145684,  0.10664889,  0.08289762,
        0.40104854,  0.10766234, -0.54468405, -0.54212284,  0.35090855,
       -0.19838512, -0.13492206,  0.18157226, -0.40668172,  0.15376309,
        0.5420676 ,  0.09573819,  0.6261025 ,  0.04298294, -0.2915683 ,
       -0.11186088,  0.49985915, -0.5852413 ,  0.8637775 ,  0.26765132,
        0.32546225,  0.6263474 , -0.05404732,  0.26476517, -0.10040475,
       -1.062659  , -0.13486083,  0.46549273, -0.30968258,  0.26248175,
        0.27486387, -0.8336964 , -0.5985054 , -0.3621482 ,  0.03164356,
       -0.09827969,  0.198316  ,  0.6557962 , -0.7942498 ,  0.6897818 ,
        0.42427132, -0.37774047, -0.28196573,  0.26461554, -0.10634741,
       -0.62180686, -0.04492638, -0.07326837, -1.5329964 , -0.20814385,
        0.15416156, -0.0749208 , -0.4907188 , -0.07181611,  0.09

In [17]:
truman_show_vector

array([-5.65540791e-03,  9.58374795e-03, -8.79791938e-03,  8.02674505e-04,
        2.88799591e-02,  4.76889499e-03,  1.47020230e-02,  1.34645179e-02,
       -5.59536135e-03,  2.36092117e-02,  2.52423640e-02,  3.27905677e-02,
        5.48206689e-03, -3.06742731e-05,  9.01089329e-03,  2.72900183e-02,
        2.44526435e-02,  3.87912645e-04,  1.79112665e-02,  3.02854683e-02,
        1.41862677e-02, -1.35171153e-02,  9.90547519e-03, -1.66605096e-02,
       -1.61456049e-03,  7.83762429e-03,  3.22673633e-03,  2.64177583e-02,
        3.00891115e-03,  1.09127229e-02,  5.34542790e-03, -7.29669072e-03,
        1.79826003e-02,  1.65786110e-02, -1.62819691e-04,  3.26321716e-03,
        5.43234637e-03, -1.82644511e-03,  8.93633161e-03, -2.29558162e-03,
        7.39892945e-03, -1.07527813e-02, -1.00348156e-03,  7.84977246e-03,
        1.11419605e-02,  1.11303581e-02, -2.79943347e-02, -1.39470417e-02,
       -7.77342706e-04,  9.35642142e-03,  9.02976748e-03,  3.58549543e-02,
        1.84565187e-02,  

In [18]:
np.dot(me_vector, truman_show_vector)

0.3520363

## 내가 좋아하는 영화와 비슷한 영화 추천받기

In [19]:
favorite_movie_id = 3114 # Toy Story 2 (1999)
similar_movie = als_model.similar_items(favorite_movie_id, N=15)

for m in similar_movie:
    m = m + (idx_to_movie[m[0]],)
    print(m)

# similar_movie

(3114, 0.021781715, 'Toy Story 2 (1999)')
(2346, 0.015372159, 'Stepford Wives, The (1975)')
(1941, 0.015248158, 'Hamlet (1948)')
(1963, 0.015002224, 'Take the Money and Run (1969)')
(1672, 0.014914121, 'Rainmaker, The (1997)')
(1199, 0.014806441, 'Brazil (1985)')
(3062, 0.014660781, 'Longest Day, The (1962)')
(1962, 0.014557213, 'Driving Miss Daisy (1989)')
(1974, 0.014544688, 'Friday the 13th (1980)')
(1970, 0.014538508, 'Nightmare on Elm Street 3: Dream Warriors, A (1987)')
(2904, 0.014520839, 'Rain (1932)')
(1960, 0.014500704, 'Last Emperor, The (1987)')
(1973, 0.014121532, "Freddy's Dead: The Final Nightmare (1991)")
(2134, 0.014108021, 'Weird Science (1985)')
(2109, 0.013958392, 'Jerk, The (1979)')


## 내가 가장 좋아할 만한 영화 추천받기

In [20]:
user = user_to_idx[9999]
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)

for m in movie_recommended:
    m = m + (idx_to_movie[m[0]], )
    print(m)

# movie_recommended

(40, 0.3369577, 'Cry, the Beloved Country (1995)')
(4, 0.32402563, 'Waiting to Exhale (1995)')
(39, 0.3007994, 'Clueless (1995)')
(384, 0.22715054, 'Bad Company (1995)')
(851, 0.20881493, 'Basquiat (1996)')
(322, 0.20593917, 'Swimming with Sharks (1995)')
(126, 0.20396644, 'NeverEnding Story III, The (1994)')
(626, 0.20281257, 'Thin Line Between Love and Hate, A (1996)')
(60, 0.20138767, 'Indian in the Cupboard, The (1995)')
(1126, 0.19806126, 'Drop Dead Fred (1991)')
(463, 0.18555391, 'Guilty as Sin (1993)')
(38, 0.1794315, 'It Takes Two (1995)')
(474, 0.17410618, 'In the Line of Fire (1993)')
(139, 0.17175257, 'Target (1995)')
(330, 0.161925, 'Tales from the Hood (1995)')
(131, 0.16082264, 'Frankie Starlight (1995)')
(641, 0.15905504, 'Little Indian, Big City (Un indien dans la ville) (1994)')
(110, 0.15694457, 'Braveheart (1995)')
(248, 0.15378532, 'Houseguest (1994)')
(160, 0.15039498, 'Congo (1995)')
