# ex13 Movielens 영화 추천 실습

In [1]:
!mkdir -p ~/aiffel/recommendata_iu/data/ml-1m
!ln -s ~/data/ml-1m/* ~/aiffel/recommendata_iu/data/ml-1m

In [3]:
import numpy as np
import scipy
import implicit
import pandas as pd

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


## 1. 데이터 준비와 전처리

In [4]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [7]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [26]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].max() + 1
num_movie = ratings['movie_id'].max() + 1

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6041x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Row format>

In [41]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [42]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [43]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6041 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Column format>

## 모델 학습

In [44]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [45]:
user1_vector, toy_story_vector = als_model.user_factors[1], als_model.item_factors[1]

슝=3


In [46]:
user1_vector

array([-0.6139967 ,  1.2662429 ,  0.3964201 ,  0.37509346, -0.48948687,
       -1.5260522 ,  1.6581547 ,  0.46468788, -0.531208  , -0.8403216 ,
       -3.1204972 ,  1.554016  , -0.21398914, -0.6820759 ,  0.8435321 ,
       -2.4634304 , -0.06328645, -2.4341068 ,  1.0405439 , -1.1872238 ,
        0.549266  ,  0.7418129 , -0.6704238 ,  1.1972985 , -0.72698647,
       -0.02144951, -0.72788686,  0.09332278, -0.9894771 ,  2.7587578 ,
       -3.112682  ,  1.516209  ,  0.9464694 , -1.8118715 , -3.1407664 ,
       -0.44259694,  1.3809568 ,  0.37579858,  1.7874764 , -0.01386982,
        0.5660911 , -0.02640108, -1.8046908 ,  1.0638103 ,  2.2211494 ,
       -0.12774628,  2.0575616 ,  0.64689237, -0.05507768, -0.10672973,
        1.0863272 ,  0.719922  ,  0.20921569,  0.6165476 , -0.12590647,
       -1.0960401 , -0.5615411 , -0.43465105,  1.3450342 , -0.7997159 ,
       -1.4195479 ,  1.302099  ,  0.42988026,  2.0024588 , -1.1580858 ,
       -1.4472017 ,  1.6273243 ,  0.6826957 ,  0.5635984 ,  1.74

In [47]:
toy_story_vector

array([-5.65902749e-03, -5.04862331e-03,  2.78998800e-02, -2.60457426e-04,
        3.44134271e-02,  3.92439067e-02, -2.10888386e-02, -4.22189012e-03,
        1.56075554e-02, -5.87280747e-03, -1.77517291e-02,  2.32057329e-02,
        5.06772287e-03,  1.19405491e-02,  3.35058644e-02, -2.85080099e-03,
        9.80758294e-03, -6.75578089e-03,  1.07590500e-02, -7.36257061e-03,
        1.36834709e-02,  2.27084998e-02, -3.71667072e-02, -3.14338598e-03,
       -2.57714326e-03,  1.48251411e-02,  1.66983567e-02,  2.12091269e-04,
       -2.50332849e-03,  3.64856794e-02, -6.22806977e-03,  3.87108363e-02,
       -8.08892399e-03, -2.45942604e-02, -1.22535266e-02,  8.62705614e-03,
       -9.85500123e-03,  1.52726127e-02,  8.17087479e-03,  5.45446936e-04,
        1.84281101e-03, -1.32058822e-02, -6.15353370e-03, -2.35753972e-02,
        4.43009511e-02,  1.56514961e-02,  4.15489115e-02,  2.36965697e-02,
        6.97480422e-03, -1.07568048e-03, -1.56871416e-02,  2.91356947e-02,
       -8.07427056e-03,  

In [48]:
np.dot(user1_vector, toy_story_vector)

1.0426809

## 비슷한 영화 찾기

In [57]:
movie_id = 1
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(1, 0.9999999),
 (3114, 0.7811959),
 (2355, 0.6104324),
 (588, 0.5942017),
 (34, 0.55956805),
 (1265, 0.5363133),
 (364, 0.45662773),
 (595, 0.44064108),
 (1923, 0.4402374),
 (2321, 0.4274344),
 (356, 0.38397446),
 (2396, 0.3737098),
 (1566, 0.35113513),
 (2687, 0.34377003),
 (3253, 0.3354167)]

In [58]:
for id,sim in similar_movie:
    name = movies[movies['movie_id'] == id]['title']
    print(name,sim) 

0    Toy Story (1995)
Name: title, dtype: object 0.9999999
3045    Toy Story 2 (1999)
Name: title, dtype: object 0.7811959
2286    Bug's Life, A (1998)
Name: title, dtype: object 0.6104324
584    Aladdin (1992)
Name: title, dtype: object 0.5942017
33    Babe (1995)
Name: title, dtype: object 0.55956805
1245    Groundhog Day (1993)
Name: title, dtype: object 0.5363133
360    Lion King, The (1994)
Name: title, dtype: object 0.45662773
591    Beauty and the Beast (1991)
Name: title, dtype: object 0.44064108
1854    There's Something About Mary (1998)
Name: title, dtype: object 0.4402374
2252    Pleasantville (1998)
Name: title, dtype: object 0.4274344
352    Forrest Gump (1994)
Name: title, dtype: object 0.38397446
2327    Shakespeare in Love (1998)
Name: title, dtype: object 0.3737098
1526    Hercules (1997)
Name: title, dtype: object 0.35113513
2618    Tarzan (1999)
Name: title, dtype: object 0.34377003
3184    Wayne's World (1992)
Name: title, dtype: object 0.3354167


## 유저에게 아티스트 추천하기

In [60]:
user = 1
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(364, 0.9941819),
 (318, 0.8516393),
 (1282, 0.8361454),
 (1225, 0.7929288),
 (2081, 0.7607373),
 (2078, 0.7534429),
 (2858, 0.72022706),
 (1704, 0.70363724),
 (2096, 0.6897839),
 (34, 0.67700446),
 (596, 0.6700983),
 (110, 0.6424839),
 (2087, 0.6303146),
 (593, 0.61390173),
 (3751, 0.60175186),
 (1198, 0.58576113),
 (1032, 0.5829715),
 (1196, 0.5714934),
 (2137, 0.57134604),
 (1259, 0.5679973)]

In [61]:
for id,sim in movie_recommended:
    name = movies[movies['movie_id'] == id]['title']
    print(name,sim) 

360    Lion King, The (1994)
Name: title, dtype: object 0.9941819
315    Shawshank Redemption, The (1994)
Name: title, dtype: object 0.8516393
1262    Fantasia (1940)
Name: title, dtype: object 0.8361454
1207    Amadeus (1984)
Name: title, dtype: object 0.7929288
2012    Little Mermaid, The (1989)
Name: title, dtype: object 0.7607373
2009    Jungle Book, The (1967)
Name: title, dtype: object 0.7534429
2789    American Beauty (1999)
Name: title, dtype: object 0.72022706
1656    Good Will Hunting (1997)
Name: title, dtype: object 0.70363724
2027    Sleeping Beauty (1959)
Name: title, dtype: object 0.6897839
33    Babe (1995)
Name: title, dtype: object 0.67700446
592    Pinocchio (1940)
Name: title, dtype: object 0.6700983
108    Braveheart (1995)
Name: title, dtype: object 0.6424839
2018    Peter Pan (1953)
Name: title, dtype: object 0.6303146
589    Silence of the Lambs, The (1991)
Name: title, dtype: object 0.61390173
3682    Chicken Run (2000)
Name: title, dtype: object 0.60175186
118