# Movielens 영화 추천
## MF 모델 학습 방법을 토대로, 영화 추천 시스템 제작
별점 데이터는 대표적인 explicit(직접적)데이터 이지만 implicit(간접적)데이터로 간주하고 테스트 할 수 있음  
별점 = 시청횟수로 해석하여 진행  
3점 미만 데이터는 비선호 데이터로 가정, 제외

In [1]:
# mkdir -p ~/aiffel/recommendata_iu/data/ml-1m
# ln -s ~/data/ml-1m/* ~/aiffel/recommendata_iu/data/ml-1m

In [2]:
import pandas as pd
import os
import numpy as np
import re
import scipy
from scipy.sparse import csr_matrix
import implicit
from implicit.als import AlternatingLeastSquares

# Report the installed versions of numpy, scipy and implicit, one per line.
print(np.__version__, scipy.__version__, implicit.__version__, sep='\n')

1.21.4
1.7.1
0.4.8


## 데이터 준비와 전처리
Movielens 데이터는 ratings.dat 안에 이미 인덱싱까지 완료된 사용자-영화-평점 데이터가 정리되어 있음

In [3]:
# Load the ratings table; each record is user_id::movie_id::ratings::timestamp.
# The path is built with expanduser/join so that an unset HOME variable does
# not raise TypeError from `None + str`.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# '::' is a multi-character separator, so the python parser engine is requested.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding='ISO-8859-1')
# Row count before any filtering; the filtering cell reports a ratio against it.
# NOTE: the (misspelled) name is kept because later cells read it.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


3점 미만의 데이터는 제외

In [4]:
# Keep only rows rated 3 or higher, then report how much data survived.
ratings = ratings.loc[ratings['ratings'].ge(3)]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


별점을 시청 횟수로 간주하기로 했으므로 컬럼 이름 바꾸기

In [5]:
# Treat the star rating as a play count from here on.
# `ratings` is the result of a boolean filter, so rebind to the renamed frame
# instead of mutating that slice in place (avoids SettingWithCopy issues).
ratings = ratings.rename(columns={'ratings': 'counts'})

In [6]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

영화 제목을 보기 위해 메타 데이터를 불러옴

In [7]:
# Load the movie metadata; each record is movie_id::title::genre.
# The path is built with expanduser/join so that an unset HOME variable does
# not raise TypeError from `None + str`.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# '::' is a multi-character separator, so the python parser engine is requested.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


두 데이터를 movie_id 기준으로 병합

In [8]:
# Print the column summary of both frames before merging them.
# Each info() call prints its report and returns None, so the value displayed
# for this cell is the tuple (None, None).
ratings.info(), movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   counts     836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


(None, None)

In [9]:
# Attach title/genre to every rating row; the left join keeps all rating rows.
data = ratings.merge(movies, how='left', on='movie_id')
data.sample(10)

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
355592,2599,2143,3,973796493,Legend (1985),Adventure|Fantasy|Romance
413792,3050,674,5,970179340,Barbarella (1968),Adventure|Sci-Fi
267140,1908,1584,5,974698500,Contact (1997),Drama|Sci-Fi
27579,220,3178,4,976836514,"Hurricane, The (1999)",Drama
170033,1260,802,3,992268393,Phenomenon (1996),Drama|Romance
64924,521,1267,5,976198729,"Manchurian Candidate, The (1962)",Film-Noir|Thriller
449266,3313,3418,4,968381726,Thelma & Louise (1991),Action|Drama
270444,1928,349,4,975189024,Clear and Present Danger (1994),Action|Adventure|Thriller
725873,5246,3361,3,961373927,Bull Durham (1988),Comedy
832665,6011,2001,5,956787267,Lethal Weapon 2 (1989),Action|Comedy|Crime|Drama


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 836477
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    836478 non-null  int64 
 1   movie_id   836478 non-null  int64 
 2   counts     836478 non-null  int64 
 3   timestamp  836478 non-null  int64 
 4   title      836478 non-null  object
 5   genre      836478 non-null  object
dtypes: int64(4), object(2)
memory usage: 44.7+ MB


결측치 없음
필요한 컬럼만 가져와서 재지정

In [11]:
# Drop everything except the interaction columns and the movie metadata.
wanted_columns = ['user_id', 'movie_id', 'counts', 'title', 'genre']
data = data[wanted_columns]
data

Unnamed: 0,user_id,movie_id,counts,title,genre
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,My Fair Lady (1964),Musical|Romance
3,1,3408,4,Erin Brockovich (2000),Drama
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...
836473,6040,1090,3,Platoon (1986),Drama|War
836474,6040,1094,5,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562,5,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096,4,Sophie's Choice (1982),Drama


title에 있는 출시년도를 뜻하는 괄호 안 숫자를 제거

In [12]:
# Remove every parenthesised group (the release year, and any alternate title)
# from the movie titles using one vectorised regex replace.
# The space that preceded the parenthesis is deliberately left in place, so a
# cleaned title ends with a trailing space (e.g. 'Toy Story '); title lookups
# elsewhere in this notebook are written with that trailing space.
# assign() returns a new frame instead of writing into a column-subset slice.
data = data.assign(title=data['title'].str.replace(r'\([^)]*\)', '', regex=True))
data

Unnamed: 0,user_id,movie_id,counts,title,genre
0,1,1193,5,One Flew Over the Cuckoo's Nest,Drama
1,1,661,3,James and the Giant Peach,Animation|Children's|Musical
2,1,914,3,My Fair Lady,Musical|Romance
3,1,3408,4,Erin Brockovich,Drama
4,1,2355,5,"Bug's Life, A",Animation|Children's|Comedy
...,...,...,...,...,...
836473,6040,1090,3,Platoon,Drama|War
836474,6040,1094,5,"Crying Game, The",Drama|Romance|War
836475,6040,562,5,Welcome to the Dollhouse,Comedy|Drama
836476,6040,1096,4,Sophie's Choice,Drama


In [13]:
# Headline statistics: distinct users, distinct (year-stripped) titles, and the
# 30 movies with the most rating rows.
user_total = data['user_id'].nunique()
title_total = data['title'].nunique()
print('중복되지 않은 유니크한 사용자 수 : ', user_total)
print('중복되지 않은 유니크한 영화 갯수 : ', title_total)
print()
print('가장 인기있는 영화 30개')
# Number of rating rows per (title, movie_id) pair.
movie_count = data.groupby(['title', 'movie_id'])['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

중복되지 않은 유니크한 사용자 수 :  6039
중복되지 않은 유니크한 영화 갯수 :  3585

가장 인기있는 영화 30개


title                                            movie_id
American Beauty                                  2858        3211
Star Wars: Episode IV - A New Hope               260         2910
Star Wars: Episode V - The Empire Strikes Back   1196        2885
Star Wars: Episode VI - Return of the Jedi       1210        2716
Saving Private Ryan                              2028        2561
Terminator 2: Judgment Day                       589         2509
Silence of the Lambs, The                        593         2498
Raiders of the Lost Ark                          1198        2473
Back to the Future                               1270        2460
Matrix, The                                      2571        2434
Jurassic Park                                    480         2413
Sixth Sense, The                                 2762        2385
Fargo                                            608         2371
Braveheart                                       110         2314
Men in Black      

In [14]:
data['user_id'].nunique(), data['movie_id'].nunique(), len(data.user_id), len(data.movie_id)

(6039, 3628, 836478, 836478)

### 모델 검증을 위한 사용자 초기 세팅
마치 유튜브 뮤직에 처음 가입했을 때 선호하는 아티스트 정보를 입력하는것과 같은 과정

In [15]:
# Add a synthetic user (id 6041) with five hand-picked favourites so that the
# trained model can be checked against known tastes.
# The titles carry the trailing space left behind when the release year was
# stripped from the title column.
my_movie_title = ['Jurassic Park ', 'Toy Story ', 'Men in Black ', 'Matrix, The ', 'Saving Private Ryan ']
my_counts = [3, 5, 4, 5, 3]
my_user_id = 6041  # kept as an int so the user_id column stays numeric

my_movie_id = []
my_genre = []
for title in my_movie_title:
    # One row per distinct movie carrying this title.
    matches = data.loc[data['title'] == title, ['movie_id', 'genre']].drop_duplicates()
    if len(matches) != 1:
        # Zero or several movies share this title: fail loudly rather than
        # building id/genre lists whose lengths disagree with the title list.
        raise ValueError(f'expected exactly one movie for title {title!r}, found {len(matches)}')
    my_movie_id.append(int(matches['movie_id'].iloc[0]))
    my_genre.append(matches['genre'].iloc[0])

my_movielist = pd.DataFrame({
    'user_id': [my_user_id] * len(my_movie_title),
    'movie_id': my_movie_id,
    'counts': my_counts,
    'title': my_movie_title,
    'genre': my_genre,
})

# Guard against adding the user twice when the cell is re-run. Comparing as
# strings also recognises an id that was stored as the text '6041'.
if not (data['user_id'].astype(str) == str(my_user_id)).any():
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    data = pd.concat([data, my_movielist])

data.tail(7)

Unnamed: 0,user_id,movie_id,counts,title,genre
836476,6040,1096,4,Sophie's Choice,Drama
836477,6040,1097,4,E.T. the Extra-Terrestrial,Children's|Drama|Fantasy|Sci-Fi
0,6041,480,3,Jurassic Park,Action|Adventure|Sci-Fi
1,6041,1,5,Toy Story,Animation|Children's|Comedy
2,6041,1580,4,Men in Black,Action|Adventure|Comedy|Sci-Fi
3,6041,2571,5,"Matrix, The",Action|Sci-Fi|Thriller
4,6041,2028,3,Saving Private Ryan,Action|Drama|War


In [16]:
# Only the (user, movie, count) triple is needed to build the interaction matrix.
interaction_columns = ['user_id', 'movie_id', 'counts']
data = data[interaction_columns]
data

Unnamed: 0,user_id,movie_id,counts
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
0,6041,480,3
1,6041,1,5
2,6041,1580,4
3,6041,2571,5


In [17]:
data['user_id'].nunique(), data['movie_id'].nunique(), len(data.user_id), len(data.movie_id)

(6040, 3628, 836483, 836483)

In [18]:
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_movie = data['movie_id'].nunique()

# The raw ids are used directly as row/column indices, so the matrix needs
# (largest id + 1) rows/columns rather than the number of distinct ids; the
# ids are not contiguous, which is why nunique() alone is too small.
# np.int64 replaces the np.int alias (deprecated in NumPy 1.20, later removed).
user_idx = data['user_id'].astype(np.int64)
movie_idx = data['movie_id'].astype(np.int64)
counts = data['counts'].astype(np.int64)

csr_data = csr_matrix(
    (counts, (user_idx, movie_idx)),
    shape=(int(user_idx.max()) + 1, int(movie_idx.max()) + 1),
)
csr_data

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  csr_data = csr_matrix((data['counts'].astype(np.int), (data.user_id, data.movie_id)), shape=(num_user+2, num_movie+325))


<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

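user_id와 movie_id 원본 값을 그대로 행·열 인덱스로 사용하므로, 행렬의 shape은 최소 (최대 user_id + 1, 최대 movie_id + 1) = (6042, 3953)이어야 한다. 이 데이터에서는 그 값이 num_user+2, num_movie+325에 해당하며, 이보다 작게 지정하면 에러가 발생한다.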

### MF 모델 학습하기
Matrix Factorization 모델을 implicit 패키지를 이용해 학습  
위의 과정처럼 암묵적(implicit) 데이터셋을 사용하는 다양한 모델을 빠르게 학습할 수 있는 패키지  
패키지에 구현된 ALS(AlternatingLeastSquares) 모델 사용  
MF에서 쪼개진 두 Feature Matrix를 한쪽은 고정시키고 다른 쪽을 학습하는 방식을 번갈아 수행

In [19]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the material covered here.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [20]:
# Configure the ALS model: 100 factors, regularization 0.01, 15 iterations,
# float32 factors, CPU only.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [21]:
# Flip the (user x movie) matrix to (movie x user) before fitting.
# NOTE(review): implicit 0.4.x `fit` presumably expects item-by-user input;
# re-check this orientation if the library is upgraded.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [22]:
# Train the ALS model on the transposed (movie x user) interaction matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [34]:
# Pull the learned factor vectors for the synthetic user and for one of the
# movies that user was given.
user = 6041
movie = 1      # Toy Story
user_vector = als_model.user_factors[user]
movie_vector = als_model.item_factors[movie]

In [35]:
user_vector

array([ 1.0256681 , -0.00348916, -0.36095428, -0.5286749 , -0.57654184,
        0.02609121, -0.20178549, -0.25296095, -0.2567285 ,  0.28880322,
        0.610086  , -0.6497106 , -0.21046607,  0.05835624, -0.0802822 ,
       -0.38197026,  0.10947122,  0.45736456,  0.296073  , -0.2642908 ,
        0.2088239 , -0.5415798 ,  0.11347053,  0.14451443, -0.06700634,
       -0.17925969,  0.4323083 ,  0.41024047, -0.9691238 ,  0.01975654,
       -0.28790605, -0.28314927, -0.2583255 ,  0.06768577, -0.6274585 ,
        0.6301588 ,  0.20145544,  0.16751915,  0.4358338 , -0.45215982,
        0.48178202, -0.8781431 ,  0.25808343,  0.9931191 , -0.08109269,
       -0.05837012, -0.884421  ,  0.9056777 ,  0.69303894, -0.31462926,
       -0.27359655,  0.21334545, -0.58204556,  0.4593435 , -0.3189723 ,
       -0.22287013, -0.31219733,  0.4350675 , -0.5430344 ,  0.4792688 ,
        0.35904044, -0.3482078 , -0.29912996,  0.11341998,  0.08395538,
        0.129358  ,  0.5940903 , -0.12450498,  0.18765602,  0.81

In [36]:
movie_vector

array([ 0.02064591, -0.00454573,  0.0066245 , -0.01947825, -0.03881786,
        0.01159518, -0.0214951 ,  0.0036694 , -0.03294116,  0.03139752,
        0.02950642, -0.01283004, -0.00158269,  0.02339784,  0.02203074,
       -0.01234919, -0.01056923, -0.01088313,  0.01124414,  0.0002881 ,
        0.03237402,  0.01654196,  0.01836842,  0.01445614, -0.0080814 ,
       -0.00072291,  0.03302842,  0.01459708,  0.02785322, -0.0056983 ,
       -0.01432318, -0.00611616,  0.00344266, -0.01528202,  0.00422884,
       -0.00757982,  0.00521856, -0.03630692, -0.00140209,  0.0014673 ,
        0.01454916, -0.01106599,  0.01561792,  0.02417636, -0.03072781,
        0.01047079, -0.03197246,  0.01534805,  0.0262821 ,  0.00505753,
        0.02519928,  0.02814554, -0.02975091,  0.00382239,  0.01545324,
       -0.00066674,  0.01881661,  0.02845848, -0.01111969,  0.01625575,
       -0.00022475, -0.02436059, -0.01024219, -0.00189158,  0.00172557,
        0.01106865,  0.02301583, -0.01340605,  0.02491821,  0.02

In [37]:
np.dot(user_vector, movie_vector)

0.472652

In [38]:
# Same lookup for a second movie the synthetic user was given.
movie1 = 2571   # Matrix, The
user_vector = als_model.user_factors[user]
movie1_vector = als_model.item_factors[movie1]

In [39]:
user_vector

array([ 1.0256681 , -0.00348916, -0.36095428, -0.5286749 , -0.57654184,
        0.02609121, -0.20178549, -0.25296095, -0.2567285 ,  0.28880322,
        0.610086  , -0.6497106 , -0.21046607,  0.05835624, -0.0802822 ,
       -0.38197026,  0.10947122,  0.45736456,  0.296073  , -0.2642908 ,
        0.2088239 , -0.5415798 ,  0.11347053,  0.14451443, -0.06700634,
       -0.17925969,  0.4323083 ,  0.41024047, -0.9691238 ,  0.01975654,
       -0.28790605, -0.28314927, -0.2583255 ,  0.06768577, -0.6274585 ,
        0.6301588 ,  0.20145544,  0.16751915,  0.4358338 , -0.45215982,
        0.48178202, -0.8781431 ,  0.25808343,  0.9931191 , -0.08109269,
       -0.05837012, -0.884421  ,  0.9056777 ,  0.69303894, -0.31462926,
       -0.27359655,  0.21334545, -0.58204556,  0.4593435 , -0.3189723 ,
       -0.22287013, -0.31219733,  0.4350675 , -0.5430344 ,  0.4792688 ,
        0.35904044, -0.3482078 , -0.29912996,  0.11341998,  0.08395538,
        0.129358  ,  0.5940903 , -0.12450498,  0.18765602,  0.81

In [40]:
movie1_vector

array([ 2.49709077e-02,  1.69807207e-02,  3.34392581e-03, -4.22837958e-03,
       -3.72319780e-02, -7.93248974e-03,  1.44082948e-03,  9.93881561e-03,
       -1.11183105e-02, -3.81588680e-03,  6.68503251e-03, -3.38916259e-04,
       -1.09196780e-02,  1.15152039e-02, -1.19086951e-02, -2.41642520e-02,
        1.47283748e-02,  1.95760168e-02,  1.01522515e-02, -5.11284871e-03,
        1.26832596e-03, -2.19278224e-02,  1.27624925e-02,  1.42680639e-02,
        1.20877332e-04, -1.17670195e-02,  1.26247318e-03,  2.23286022e-02,
       -5.08100027e-03,  1.23929011e-03,  9.78018157e-03,  1.80701166e-02,
        3.98643920e-03,  1.42480033e-02, -1.22574642e-02,  2.45908909e-02,
        1.06587997e-02,  4.02338132e-02,  1.78884920e-02, -2.42181472e-03,
        2.37325467e-02, -2.45931987e-02,  3.83211160e-03,  3.34273912e-02,
       -5.24072489e-03,  3.55626736e-03, -2.41291840e-02,  1.75568350e-02,
        1.47465235e-02, -4.68457164e-03, -1.19906105e-02,  4.82625765e-04,
        3.99727002e-03,  

In [41]:
np.dot(user_vector, movie1_vector)

0.60937756