# Movielens 데이터를 이용한 추천시스템

- 별점 데이터이므로 explicit data지만, 우리는 implicit data를 다뤄봤으므로 별점을 시청횟수로 간주하여 implicit data로 접근해보자.
- 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정

In [69]:
import os
import pandas as pd
import numpy as np

#### ratings.dat

In [46]:
rating_file_path = os.getenv('HOME')+'/aiffel/recommendation_system/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding='ISO-8859-1')
original_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [47]:
# 3점 이상만 남기기
ratings = ratings[ratings['rating'] >= 3]
filtered_data_size = len(ratings)

print(f'Ratio of remaining data is {filtered_data_size / original_data_size:.2%}')

Ratio of remaining data is 83.63%


In [49]:
# 우리는 rating을 시청횟수로 생각할 것이므로 rating 칼럼을 count로 바꿔주자.
ratings.rename(columns={'rating' : 'count'}, inplace=True)
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

#### movies.dat

In [51]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [54]:
# {movie_id : title}
movies_unique = movies['title'].unique()

title_to_idx = {k:v for k, v in enumerate(movies_unique)}

In [55]:
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

print(num_user, num_movie)

6039 3628


In [56]:
# 인기 많은 영화 30개
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

In [58]:
my_favorite = ['260', '2028', '608', '296', '1']

my_movies = pd.DataFrame({'user_id' : ['9999']*5, 'movie_id' : my_favorite, 'count':[5]*5, 'timestamp':[0]*5})

if not ratings.isin({'user_id':['9999']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_movies)     

In [59]:
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,9999,260,5,0
1,9999,2028,5,0
2,9999,608,5,0
3,9999,296,5,0
4,9999,1,5,0


In [60]:
from scipy.sparse import csr_matrix

In [63]:
csr_data = csr_matrix((ratings.count, (ratings.user_id, ratings.movie_id)))
csr_data

TypeError: '>=' not supported between instances of 'int' and 'str'

- 오류를 보아 int로 다 바꿔줘야한다.

In [64]:
ratings = ratings.astype('int64')

In [66]:
csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)))
csr_data

<10000x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [67]:
from implicit.als import AlternatingLeastSquares

os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'

In [70]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [71]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x10000 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [72]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [73]:
name_to_vector, movie_to_vector = als_model.user_factors[9999], als_model.item_factors[260]

In [74]:
np.dot(name_to_vector, movie_to_vector)

0.5198139

In [75]:
# 선호도 파악
similar_movie = als_model.similar_items(260, N=15)
similar_movie

[(260, 1.0000001),
 (1196, 0.8876241),
 (1210, 0.7562817),
 (1198, 0.7111374),
 (1214, 0.46726176),
 (1240, 0.4659844),
 (2628, 0.4614407),
 (1270, 0.44360092),
 (2571, 0.42550832),
 (1197, 0.40159968),
 (1097, 0.3939608),
 (1291, 0.35868925),
 (1387, 0.34333226),
 (589, 0.33526966),
 (541, 0.33388522)]

In [78]:
[title_to_idx[i[0]] for i in similar_movie]

['Ladybird Ladybird (1994)',
 'Alien (1979)',
 'Raging Bull (1980)',
 'Big Blue, The (Le Grand Bleu) (1988)',
 'Boat, The (Das Boot) (1981)',
 'M (1931)',
 'My Son the Fanatic (1998)',
 'Some Kind of Wonderful (1987)',
 'Superman (1978)',
 'Army of Darkness (1993)',
 'Associate, The (1996)',
 'Santa with Muscles (1996)',
 'Evening Star, The (1996)',
 'Silence of the Lambs, The (1991)',
 'Harlem (1993)']

In [79]:
# 영화 추천

movies_recommended = als_model.recommend(9999, csr_data, N=20, filter_already_liked_items=True)
movies_recommended

[(593, 0.62580645),
 (318, 0.5899359),
 (527, 0.5802281),
 (1196, 0.46954814),
 (3114, 0.4580477),
 (1213, 0.42548728),
 (1210, 0.41909817),
 (1198, 0.41574883),
 (1617, 0.39795887),
 (50, 0.3849525),
 (2858, 0.36790097),
 (1704, 0.33251455),
 (110, 0.3262998),
 (34, 0.325401),
 (2762, 0.28656784),
 (1097, 0.26302195),
 (2571, 0.26159346),
 (1197, 0.25286502),
 (2355, 0.25283933),
 (1265, 0.2380803)]

In [81]:
[title_to_idx[i[0]] for i in movies_recommended]

['Pretty Woman (1990)',
 'Strawberry and Chocolate (Fresa y chocolate) (1993)',
 'Secret Garden, The (1993)',
 'Alien (1979)',
 'Third Miracle, The (1999)',
 'Stalker (1979)',
 'Raging Bull (1980)',
 'Big Blue, The (Le Grand Bleu) (1988)',
 'Stripes (1981)',
 'Guardian Angel (1994)',
 'Brief Encounter (1946)',
 'Duoluo tianshi (1995)',
 'Rumble in the Bronx (1995)',
 'Carrington (1995)',
 'Dog of Flanders, A (1999)',
 'Associate, The (1996)',
 'Superman (1978)',
 'Army of Darkness (1993)',
 "You've Got Mail (1998)",
 'Heathers (1989)']