In [108]:
import os
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from tabulate import tabulate

In [25]:
# 환경설정

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

pd.options.display.float_format = '{:.2f}'.format

---

**데이터 준비**

In [26]:
# rating 데이터 불러오기

file = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(file, names = ['ratings'])
ratings.head()

Unnamed: 0,ratings
0,1::1193::5::978300760
1,1::661::3::978302109
2,1::914::3::978301968
3,1::3408::4::978300275
4,1::2355::5::978824291


**RATINGS FILE DESCRIPTION** <br/>
<br/>
All ratings are contained in the file "ratings.dat" and are in the following format: <br/>
<br/>
UserID::MovieID::Rating::Timestamp <br/>
- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

In [27]:
# rating 데이터 col 분할

ratings['user_id'] = ratings.ratings.str.split('::').str[0].astype(int)
ratings['movie_id'] = ratings.ratings.str.split('::').str[1].astype(int)
ratings['rating'] = ratings.ratings.str.split('::').str[2].astype(int)
ratings['timestamp'] = ratings.ratings.str.split('::').str[3].astype(int)

ratings.drop(['ratings'], axis = 1, inplace = True)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [29]:
ratings.nunique()

user_id        6040
movie_id       3706
rating            5
timestamp    458455
dtype: int64

In [30]:
ratings['rating'].describe()

count   1000209.00
mean          3.58
std           1.12
min           1.00
25%           3.00
50%           4.00
75%           4.00
max           5.00
Name: rating, dtype: float64

* 평균 3.6, 중앙 4.0으로 다소 높음

---

In [31]:
# movie 데이터 불러오기 

file = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(file, names = ['movies'], encoding = 'ISO-8859-1', sep = '\t')
movies.head()

Unnamed: 0,movies
0,1::Toy Story (1995)::Animation|Children's|Comedy
1,2::Jumanji (1995)::Adventure|Children's|Fantasy
2,3::Grumpier Old Men (1995)::Comedy|Romance
3,4::Waiting to Exhale (1995)::Comedy|Drama
4,5::Father of the Bride Part II (1995)::Comedy


**MOVIES FILE DESCRIPTION**

Movie information is in the file "movies.dat" and is in the following format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (includingyear of release)
- Genres are pipe-separated and are selected from the following genres:

    Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western
    
    

- Some MovieIDs do not correspond to a movie due to accidental duplicate entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist

In [32]:
# movie 데이터 col 분할

movies['movie_id'] = movies.movies.str.split('::').str[0].astype(int)
movies['title'] = movies.movies.str.split('::').str[1]
movies['genres'] = movies.movies.str.split('::').str[2].str.split('|')
movies.drop(['movies'], axis = 1, inplace = True)
movies.set_index('movie_id', inplace = True)

movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Animation, Children's, Comedy]"
2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama]"
5,Father of the Bride Part II (1995),[Comedy]


In [33]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3883 entries, 1 to 3952
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   3883 non-null   object
 1   genres  3883 non-null   object
dtypes: object(2)
memory usage: 91.0+ KB


In [34]:
movies['title'].nunique()

3883

---

**인기있는 영화**

In [35]:
# 단순 rating이 많은 영화 Top 5

count = ratings.groupby('movie_id')['user_id'].count()
count = count.sort_values(ascending = False).head(5)

most_view = movies[movies.index.isin(count.index)]
most_view = pd.concat([most_view, count], axis = 1)
most_view.sort_values('user_id', ascending = False)

Unnamed: 0_level_0,title,genres,user_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2858,American Beauty (1999),"[Comedy, Drama]",3428
260,Star Wars: Episode IV - A New Hope (1977),"[Action, Adventure, Fantasy, Sci-Fi]",2991
1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Drama, Sci-Fi, War]",2990
1210,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Romance, Sci-Fi, War]",2883
480,Jurassic Park (1993),"[Action, Adventure, Sci-Fi]",2672


In [36]:
# rating 5점이 많은 영화 Top 5

rating = ratings[ratings['rating'] == 5]
rating = rating.groupby('movie_id')['user_id'].count()
rating = rating.sort_values(ascending = False).head(5)

most_rate = movies[movies.index.isin(rating.index)]
most_rate = pd.concat([most_rate, rating], axis = 1)
most_rate.sort_values('user_id', ascending = False)

Unnamed: 0_level_0,title,genres,user_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2858,American Beauty (1999),"[Comedy, Drama]",1963
260,Star Wars: Episode IV - A New Hope (1977),"[Action, Adventure, Fantasy, Sci-Fi]",1826
1198,Raiders of the Lost Ark (1981),"[Action, Adventure]",1500
1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Drama, Sci-Fi, War]",1483
858,"Godfather, The (1972)","[Action, Crime, Drama]",1475


In [37]:
# rating 합이 가장 큰 영화 Top 5

rating = ratings.groupby('movie_id')['rating'].sum()
rating = rating.sort_values(ascending = False).head(5)

most_rate = movies[movies.index.isin(rating.index)]
most_rate = pd.concat([most_rate, rating], axis = 1)
most_rate.sort_values('rating', ascending = False)

Unnamed: 0_level_0,title,genres,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2858,American Beauty (1999),"[Comedy, Drama]",14800
260,Star Wars: Episode IV - A New Hope (1977),"[Action, Adventure, Fantasy, Sci-Fi]",13321
1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Drama, Sci-Fi, War]",12836
1210,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Romance, Sci-Fi, War]",11598
2028,Saving Private Ryan (1998),"[Action, Drama, War]",11507


---

**데이터 확인**

In [38]:
print('사용자 수:', ratings['user_id'].nunique())
print('작품 수(전체):', movies['title'].nunique())
print('작품 수(별점):', ratings['movie_id'].nunique())

사용자 수: 6040
작품 수(전체): 3883
작품 수(별점): 3706


In [39]:
print('사용자별 평균 작품 수')
count = ratings.groupby('user_id')['movie_id'].count()
count.describe()

사용자별 평균 작품 수


count   6040.00
mean     165.60
std      192.75
min       20.00
25%       44.00
50%       96.00
75%      208.00
max     2314.00
Name: movie_id, dtype: float64

In [40]:
print('사용자별 중앙 별점 수')
median = ratings.groupby('user_id')['rating'].median()
median.describe()

사용자별 중앙 별점 수


count   6040.00
mean       3.84
std        0.58
min        1.00
25%        4.00
50%        4.00
75%        4.00
max        5.00
Name: rating, dtype: float64

---

**데이터 학습 준비**

In [41]:
# rating 3 이상의 데이터만 implicit 하게 사용

implicit = ratings[ratings['rating'] >= 3].copy()
implicit.drop('timestamp', axis = 1, inplace = True)

print(f'size: {len(implicit)} (total {len(ratings)}), ratio: {len(implicit)/len(ratings):.2%}')

size: 836478 (total 1000209), ratio: 83.63%


In [42]:
# 결과 확인을 위한 검증 데이터 추가

test_user_id = 9999
test_movie_id = [339, 597, 2424, 2671, 916]
test_rating = [5, 5, 4, 4, 3]

if not implicit.isin({'user_id':[test_user_id]})['user_id'].any():
    implicit = implicit.append(pd.DataFrame({'user_id': [test_user_id] * 5, 'movie_id': test_movie_id, 'rating': test_rating}))


implicit.reset_index(inplace = True)
implicit.tail(10)

Unnamed: 0,index,user_id,movie_id,rating
836473,1000203,6040,1090,3
836474,1000205,6040,1094,5
836475,1000206,6040,562,5
836476,1000207,6040,1096,4
836477,1000208,6040,1097,4
836478,0,9999,339,5
836479,1,9999,597,5
836480,2,9999,2424,4
836481,3,9999,2671,4
836482,4,9999,916,3


In [43]:
# 추가한 영화 데이터 (Comedy | Romance)

movies[movies.index.isin(test_movie_id)]

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
339,While You Were Sleeping (1995),"[Comedy, Romance]"
597,Pretty Woman (1990),"[Comedy, Romance]"
916,Roman Holiday (1953),"[Comedy, Romance]"
2424,You've Got Mail (1998),"[Comedy, Romance]"
2671,Notting Hill (1999),"[Comedy, Romance]"


---

In [44]:
# 데이터 index를 맞추기 위해 dict으로 재구성

user_list = implicit['user_id'].unique()
movie_list = implicit['movie_id'].unique()

user_list = {v:k for k, v in enumerate(user_list)}
movie_list = {v:k for k, v in enumerate(movie_list)}

temp = implicit['user_id'].map(user_list.get)
if len(temp) == len(implicit):
    implicit['user_id'] = temp
else:
    print('too long/short user_list length')

temp = implicit['movie_id'].map(movie_list.get)
if len(temp) == len(implicit):
    implicit['movie_id'] = temp
else:
    print('too long/short movie_list length')

implicit

Unnamed: 0,index,user_id,movie_id,rating
0,0,0,0,5
1,1,0,1,3
2,2,0,2,3
3,3,0,3,4
4,4,0,4,5
...,...,...,...,...
836478,0,6039,579,5
836479,1,6039,336,5
836480,2,6039,626,4
836481,3,6039,1151,4


In [45]:
# CSR(compressed Sparse Row) matrix 생성

csr = csr_matrix((implicit.rating, (implicit.user_id, implicit.movie_id)), 
                  shape = (implicit['user_id'].nunique(), implicit['movie_id'].nunique()))

csr

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [46]:
max(implicit.user_id), max(implicit.movie_id) # row index, col index

(6039, 3627)

In [47]:
implicit['user_id'].nunique(), implicit['movie_id'].nunique() # M, N

(6040, 3628)

---

**모델 학습**

In [48]:
model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [49]:
model.fit(csr.T)

  0%|          | 0/15 [00:00<?, ?it/s]

In [51]:
# 검증 데이터가 1에 가까운지 확인

test_user, test_movie = user_list[9999], movie_list[test_movie_id[0]]
test_user, test_movie = model.user_factors[test_user], model.item_factors[test_movie]

np.dot(test_user, test_movie)

0.4459605

In [54]:
# 검증 데이터와 유사한 영화가 1에 가까운지 확인 (Comedy | Romance)

test_movie = movie_list[1569]
test_movie = model.item_factors[test_movie]

np.dot(test_user, test_movie)

0.44597003

In [75]:
movies.loc[1569] # 검증 데이터와 유사한 영화 (Comedy | Romance)

title     My Best Friend's Wedding (1997)
genres                  [Comedy, Romance]
Name: 1569, dtype: object

---

**하이퍼파라미터 비교**

In [57]:
for factors in [50, 100, 200, 400, 800]:
    for regularization in [0.5, 0.1, 0.05, 0.01, 0.001]:
        for iterations in [10, 20, 40, 80, 160]:
            model = AlternatingLeastSquares(factors = factors, regularization = regularization, use_gpu = False, 
                                            iterations = iterations, dtype = np.float32)
            model.fit(csr.T)
            
            test_user, test_movie = user_list[9999], movie_list[test_movie_id[0]]
            test_user, test_movie = model.user_factors[test_user], model.item_factors[test_movie]

            score1 = np.dot(test_user, test_movie)
            
            test_movie = movie_list[1569]
            test_movie = model.item_factors[test_movie]

            np.dot(test_user, test_movie)
            
            score2 = np.dot(test_user, test_movie)
            
            print('Param (factors, regularization, iterations) :', [factors, regularization, iterations])
            print('Watched Movie :', score1)
            print('Similar Movie :', score2)

  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.5, 10]
Watched Movie : 0.35979092
Similar Movie : 0.35356736


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.5, 20]
Watched Movie : 0.35023254
Similar Movie : 0.35920498


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.5, 40]
Watched Movie : 0.35422483
Similar Movie : 0.35700962


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.5, 80]
Watched Movie : 0.34573326
Similar Movie : 0.35413682


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.5, 160]
Watched Movie : 0.34612447
Similar Movie : 0.35369283


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.1, 10]
Watched Movie : 0.34339723
Similar Movie : 0.3661665


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.1, 20]
Watched Movie : 0.33559105
Similar Movie : 0.36278424


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.1, 40]
Watched Movie : 0.35496128
Similar Movie : 0.35512075


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.1, 80]
Watched Movie : 0.35041815
Similar Movie : 0.3579586


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.1, 160]
Watched Movie : 0.345754
Similar Movie : 0.3558509


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.05, 10]
Watched Movie : 0.34331316
Similar Movie : 0.35539347


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.05, 20]
Watched Movie : 0.35041687
Similar Movie : 0.35405025


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.05, 40]
Watched Movie : 0.346111
Similar Movie : 0.35288027


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.05, 80]
Watched Movie : 0.3497906
Similar Movie : 0.35539612


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.05, 160]
Watched Movie : 0.34698075
Similar Movie : 0.35611668


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.01, 10]
Watched Movie : 0.35954204
Similar Movie : 0.35546318


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.01, 20]
Watched Movie : 0.3601952
Similar Movie : 0.36638686


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.01, 40]
Watched Movie : 0.3400777
Similar Movie : 0.35291448


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.01, 80]
Watched Movie : 0.34485704
Similar Movie : 0.35422325


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.01, 160]
Watched Movie : 0.3494839
Similar Movie : 0.35597935


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.001, 10]
Watched Movie : 0.34987596
Similar Movie : 0.3517198


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.001, 20]
Watched Movie : 0.35269058
Similar Movie : 0.3711193


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.001, 40]
Watched Movie : 0.34549564
Similar Movie : 0.35757643


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.001, 80]
Watched Movie : 0.35683176
Similar Movie : 0.3571589


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [50, 0.001, 160]
Watched Movie : 0.34977624
Similar Movie : 0.35696834


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.5, 10]
Watched Movie : 0.42511505
Similar Movie : 0.44469318


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.5, 20]
Watched Movie : 0.44460875
Similar Movie : 0.44355842


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.5, 40]
Watched Movie : 0.44188407
Similar Movie : 0.4522418


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.5, 80]
Watched Movie : 0.44815242
Similar Movie : 0.43852782


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.5, 160]
Watched Movie : 0.44006196
Similar Movie : 0.43974134


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.1, 10]
Watched Movie : 0.43577677
Similar Movie : 0.40395275


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.1, 20]
Watched Movie : 0.43086794
Similar Movie : 0.40995502


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.1, 40]
Watched Movie : 0.43274233
Similar Movie : 0.44414577


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.1, 80]
Watched Movie : 0.43961498
Similar Movie : 0.43192282


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.1, 160]
Watched Movie : 0.44454086
Similar Movie : 0.44211966


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.05, 10]
Watched Movie : 0.42728
Similar Movie : 0.42846927


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.05, 20]
Watched Movie : 0.4382084
Similar Movie : 0.44097975


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.05, 40]
Watched Movie : 0.44078964
Similar Movie : 0.43141705


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.05, 80]
Watched Movie : 0.44843453
Similar Movie : 0.44079846


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.05, 160]
Watched Movie : 0.43750635
Similar Movie : 0.4348011


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.01, 10]
Watched Movie : 0.41832596
Similar Movie : 0.44297472


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.01, 20]
Watched Movie : 0.44572434
Similar Movie : 0.43449846


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.01, 40]
Watched Movie : 0.434379
Similar Movie : 0.44095644


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.01, 80]
Watched Movie : 0.43785766
Similar Movie : 0.44434005


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.01, 160]
Watched Movie : 0.44173887
Similar Movie : 0.44242212


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.001, 10]
Watched Movie : 0.41540647
Similar Movie : 0.4305654


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.001, 20]
Watched Movie : 0.43719596
Similar Movie : 0.44798636


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.001, 40]
Watched Movie : 0.44253147
Similar Movie : 0.43891427


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.001, 80]
Watched Movie : 0.44538382
Similar Movie : 0.4373239


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [100, 0.001, 160]
Watched Movie : 0.441675
Similar Movie : 0.4417902


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.5, 10]
Watched Movie : 0.54459167
Similar Movie : 0.49001294


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.5, 20]
Watched Movie : 0.54699767
Similar Movie : 0.5120277


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.5, 40]
Watched Movie : 0.5465216
Similar Movie : 0.45757687


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.5, 80]
Watched Movie : 0.54755354
Similar Movie : 0.49054703


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.5, 160]
Watched Movie : 0.5377681
Similar Movie : 0.48178223


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.1, 10]
Watched Movie : 0.5355636
Similar Movie : 0.47475263


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.1, 20]
Watched Movie : 0.5274301
Similar Movie : 0.4885345


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.1, 40]
Watched Movie : 0.5576816
Similar Movie : 0.45337218


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.1, 80]
Watched Movie : 0.5342103
Similar Movie : 0.47352266


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.1, 160]
Watched Movie : 0.53178626
Similar Movie : 0.46000192


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.05, 10]
Watched Movie : 0.54315615
Similar Movie : 0.45563516


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.05, 20]
Watched Movie : 0.5384383
Similar Movie : 0.4742951


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.05, 40]
Watched Movie : 0.54836845
Similar Movie : 0.42755094


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.05, 80]
Watched Movie : 0.5460563
Similar Movie : 0.4968629


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.05, 160]
Watched Movie : 0.5380746
Similar Movie : 0.44799617


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.01, 10]
Watched Movie : 0.5495789
Similar Movie : 0.44569674


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.01, 20]
Watched Movie : 0.52381873
Similar Movie : 0.46495688


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.01, 40]
Watched Movie : 0.5436852
Similar Movie : 0.4949538


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.01, 80]
Watched Movie : 0.54679215
Similar Movie : 0.47954106


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.01, 160]
Watched Movie : 0.54261
Similar Movie : 0.46739462


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.001, 10]
Watched Movie : 0.5415375
Similar Movie : 0.48175976


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.001, 20]
Watched Movie : 0.54959613
Similar Movie : 0.4903317


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.001, 40]
Watched Movie : 0.54688954
Similar Movie : 0.48445463


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.001, 80]
Watched Movie : 0.5481525
Similar Movie : 0.48715097


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [200, 0.001, 160]
Watched Movie : 0.5361376
Similar Movie : 0.4757027


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.5, 10]
Watched Movie : 0.6877123
Similar Movie : 0.4395159


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.5, 20]
Watched Movie : 0.66813713
Similar Movie : 0.42568865


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.5, 40]
Watched Movie : 0.6794936
Similar Movie : 0.44038722


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.5, 80]
Watched Movie : 0.68049103
Similar Movie : 0.43121442


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.5, 160]
Watched Movie : 0.6749505
Similar Movie : 0.46388653


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.1, 10]
Watched Movie : 0.6707843
Similar Movie : 0.3868341


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.1, 20]
Watched Movie : 0.6714319
Similar Movie : 0.43563926


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.1, 40]
Watched Movie : 0.67736286
Similar Movie : 0.42448193


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.1, 80]
Watched Movie : 0.68326604
Similar Movie : 0.47013152


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.1, 160]
Watched Movie : 0.67302483
Similar Movie : 0.45405725


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.05, 10]
Watched Movie : 0.6552832
Similar Movie : 0.40832862


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.05, 20]
Watched Movie : 0.6721684
Similar Movie : 0.41507027


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.05, 40]
Watched Movie : 0.6837978
Similar Movie : 0.45085838


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.05, 80]
Watched Movie : 0.6818132
Similar Movie : 0.44806498


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.05, 160]
Watched Movie : 0.6812652
Similar Movie : 0.4575988


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.01, 10]
Watched Movie : 0.6722829
Similar Movie : 0.44364074


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.01, 20]
Watched Movie : 0.6763884
Similar Movie : 0.4327399


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.01, 40]
Watched Movie : 0.6673678
Similar Movie : 0.43347174


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.01, 80]
Watched Movie : 0.6784806
Similar Movie : 0.43484557


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.01, 160]
Watched Movie : 0.67772573
Similar Movie : 0.4685341


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.001, 10]
Watched Movie : 0.6912684
Similar Movie : 0.42637944


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.001, 20]
Watched Movie : 0.6693901
Similar Movie : 0.43330908


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.001, 40]
Watched Movie : 0.67762995
Similar Movie : 0.44927844


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.001, 80]
Watched Movie : 0.6801577
Similar Movie : 0.44553152


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [400, 0.001, 160]
Watched Movie : 0.67399794
Similar Movie : 0.43704706


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.5, 10]
Watched Movie : 0.85359573
Similar Movie : 0.23931077


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.5, 20]
Watched Movie : 0.8626086
Similar Movie : 0.23419479


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.5, 40]
Watched Movie : 0.87006414
Similar Movie : 0.25095952


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.5, 80]
Watched Movie : 0.8783004
Similar Movie : 0.23177141


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.5, 160]
Watched Movie : 0.87630826
Similar Movie : 0.23250887


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.1, 10]
Watched Movie : 0.83255494
Similar Movie : 0.27018496


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.1, 20]
Watched Movie : 0.86555874
Similar Movie : 0.23773006


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.1, 40]
Watched Movie : 0.87652886
Similar Movie : 0.21753779


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.1, 80]
Watched Movie : 0.8797228
Similar Movie : 0.23510766


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.1, 160]
Watched Movie : 0.8821753
Similar Movie : 0.23286963


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.05, 10]
Watched Movie : 0.84498686
Similar Movie : 0.26195225


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.05, 20]
Watched Movie : 0.86616683
Similar Movie : 0.2466284


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.05, 40]
Watched Movie : 0.8681395
Similar Movie : 0.24278006


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.05, 80]
Watched Movie : 0.87875736
Similar Movie : 0.23120117


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.05, 160]
Watched Movie : 0.8798429
Similar Movie : 0.22683339


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.01, 10]
Watched Movie : 0.86581576
Similar Movie : 0.24993414


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.01, 20]
Watched Movie : 0.860794
Similar Movie : 0.24175894


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.01, 40]
Watched Movie : 0.8708178
Similar Movie : 0.24865371


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.01, 80]
Watched Movie : 0.8750226
Similar Movie : 0.24151683


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.01, 160]
Watched Movie : 0.8805644
Similar Movie : 0.22688435


  0%|          | 0/10 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.001, 10]
Watched Movie : 0.86182684
Similar Movie : 0.22079948


  0%|          | 0/20 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.001, 20]
Watched Movie : 0.86398447
Similar Movie : 0.22429946


  0%|          | 0/40 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.001, 40]
Watched Movie : 0.87271047
Similar Movie : 0.23689686


  0%|          | 0/80 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.001, 80]
Watched Movie : 0.881379
Similar Movie : 0.23739904


  0%|          | 0/160 [00:00<?, ?it/s]

Param (factors, regularization, iterations) : [800, 0.001, 160]
Watched Movie : 0.8790514
Similar Movie : 0.22865742


**Similar Movie 벡터 내적이 가장 클 때**<br/>
Param (factors, regularization, iterations) : [200, 0.5, 20]<br/>
Watched Movie : 0.54699767<br/>
Similar Movie : 0.5120277<br/>


**Watched Movie 벡터 내적이 가장 클 때**<br/>
Param (factors, regularization, iterations) : [800, 0.001, 80]<br/>
Watched Movie : 0.881379<br/>
Similar Movie : 0.23739904<br/>

* **Similar Movie 벡터 내적이 가장 클 때** 모두 0.5 이상으로 적절한 모델이라 생각됨
* **Watched Movie 벡터 내적이 가장 클 때** Watched movie / Similar movie 차이가 커 과적합된 것으로 생각됨

---

**유사 item 찾기**

In [65]:
model1 = AlternatingLeastSquares(factors = 200, regularization = 0.5, use_gpu = False, 
                                iterations = 20, dtype = np.float32)
model2 = AlternatingLeastSquares(factors = 800, regularization = 0.001, use_gpu = False, 
                                iterations = 80, dtype = np.float32)

model1.fit(csr.T)
model2.fit(csr.T)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

In [113]:
def similar_movie(model, movie_id, movie_list = movie_list):
    movie = movie_list[movie_id]
    movie = model.similar_items(movie)
    
    movie_list = {v:k for k, v in movie_list.items()}
    
    movie = [(movie_list[i[0]], i[1]) for i in movie]
    movie = [(movies.loc[i[0]]['title'], movies.loc[i[0]]['genres'], i[1]) for i in movie]
    
    print(tabulate(movie, headers = ['title', 'genres', 'score']))

In [114]:
movies[movies.index.isin([339])] # 비교할 영화

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
339,While You Were Sleeping (1995),"[Comedy, Romance]"


In [115]:
similar_movie(model1, 339)

title                                genres                    score
-----------------------------------  ---------------------  --------
While You Were Sleeping (1995)       ['Comedy', 'Romance']  1
My Best Friend's Wedding (1997)      ['Comedy', 'Romance']  0.59806
Sleepless in Seattle (1993)          ['Comedy', 'Romance']  0.586599
You've Got Mail (1998)               ['Comedy', 'Romance']  0.553851
Truth About Cats & Dogs, The (1996)  ['Comedy', 'Romance']  0.513513
Pretty Woman (1990)                  ['Comedy', 'Romance']  0.495504
Sabrina (1995)                       ['Comedy', 'Romance']  0.466085
French Kiss (1995)                   ['Comedy', 'Romance']  0.452017
I.Q. (1994)                          ['Comedy', 'Romance']  0.435554
Notting Hill (1999)                  ['Comedy', 'Romance']  0.426551


In [116]:
similar_movie(model2, 339)

title                           genres                       score
------------------------------  ------------------------  --------
While You Were Sleeping (1995)  ['Comedy', 'Romance']     1
Small Wonders (1996)            ['Documentary']           0.625034
For the Moment (1994)           ['Romance', 'War']        0.624525
Dangerous Ground (1997)         ['Drama']                 0.622973
Wings of Courage (1995)         ['Adventure', 'Romance']  0.622866
Run of the Country, The (1995)  ['Drama']                 0.622208
Two Much (1996)                 ['Comedy', 'Romance']     0.621274
Oxygen (1999)                   ['Thriller']              0.62079
Allnighter, The (1987)          ['Comedy', 'Romance']     0.620419
I Got the Hook Up (1998)        ['Comedy']                0.619964


* **Similar Movie 벡터 내적이 가장 클 때** param을 사용한 경우 상당히 유사한 item으로 보임
* **Watched Movie 벡터 내적이 가장 클 때** param을 사용한 경우 유사하지 않은 item으로 보임

---

**사용자에게 추천**

In [117]:
def recommend_movie(model, user_list = user_list, movie_list = movie_list):
    user = user_list[9999]
    
    movie = model.recommend(user, csr, filter_already_liked_items=True)
    
    movie_list = {v:k for k, v in movie_list.items()}
    
    movie = [(movie_list[i[0]], i[1]) for i in movie]
    movie = [(movies.loc[i[0]]['title'], movies.loc[i[0]]['genres'], i[1]) for i in movie]
    print(tabulate(movie, headers = ['title', 'genres', 'score']))

In [119]:
recommend_movie(model1)

title                                genres                                score
-----------------------------------  ---------------------------------  --------
Sleepless in Seattle (1993)          ['Comedy', 'Romance']              0.568672
My Best Friend's Wedding (1997)      ['Comedy', 'Romance']              0.478175
Ghost (1990)                         ['Comedy', 'Romance', 'Thriller']  0.386675
Truth About Cats & Dogs, The (1996)  ['Comedy', 'Romance']              0.268799
Sabrina (1995)                       ['Comedy', 'Romance']              0.267793
Forrest Gump (1994)                  ['Comedy', 'Romance', 'War']       0.255089
American President, The (1995)       ['Comedy', 'Drama', 'Romance']     0.24628
Four Weddings and a Funeral (1994)   ['Comedy', 'Romance']              0.227331
I.Q. (1994)                          ['Comedy', 'Romance']              0.206352
Tin Cup (1996)                       ['Comedy', 'Romance']              0.206255


In [120]:
recommend_movie(model2)

title                            genres                                            score
-------------------------------  --------------------------------------------  ---------
Sleepless in Seattle (1993)      ['Comedy', 'Romance']                         0.263924
My Best Friend's Wedding (1997)  ['Comedy', 'Romance']                         0.221962
Sabrina (1995)                   ['Comedy', 'Romance']                         0.168753
Forces of Nature (1999)          ['Comedy', 'Romance']                         0.125105
French Kiss (1995)               ['Comedy', 'Romance']                         0.123453
Sabrina (1954)                   ['Comedy', 'Romance']                         0.121365
Varsity Blues (1999)             ['Comedy', 'Drama']                           0.0948447
Bringing Up Baby (1938)          ['Comedy']                                    0.0934682
Charade (1963)                   ['Comedy', 'Mystery', 'Romance', 'Thriller']  0.0909141
Wedding Singer, The (1998) 

* **Similar Movie 벡터 내적이 가장 클 때** param을 사용한 경우와 **Watched Movie 벡터 내적이 가장 클 때** param을 사용한 경우 모두 적절한 추천을 하고 있음

---

**추천에 대한 기여도**

In [127]:
# 추천에 기여한 정도

def explain_movie(model, user_list = user_list, movie_list = movie_list):
    user = user_list[9999]
    movie = movie_list[1569]
    movie = model.explain(user, csr, itemid = movie)
    
    movie_list = {v:k for k, v in movie_list.items()}
    movie = [(movie_list[i[0]], i[1]) for i in movie[1]]
    movie = [(movies.loc[i[0]]['title'], movies.loc[i[0]]['genres'], i[1]) for i in movie]
    print(tabulate(movie, headers = ['title', 'genres', 'score']))

In [122]:
movies[movies.index.isin([1569])] # 추천하는 영화

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1569,My Best Friend's Wedding (1997),"[Comedy, Romance]"


In [128]:
explain_movie(model1)

title                           genres                      score
------------------------------  ---------------------  ----------
While You Were Sleeping (1995)  ['Comedy', 'Romance']  0.138776
Pretty Woman (1990)             ['Comedy', 'Romance']  0.129891
You've Got Mail (1998)          ['Comedy', 'Romance']  0.104205
Notting Hill (1999)             ['Comedy', 'Romance']  0.0919574
Roman Holiday (1953)            ['Comedy', 'Romance']  0.00956893


In [129]:
explain_movie(model2)

title                           genres                     score
------------------------------  ---------------------  ---------
Pretty Woman (1990)             ['Comedy', 'Romance']  0.0747074
While You Were Sleeping (1995)  ['Comedy', 'Romance']  0.0505609
You've Got Mail (1998)          ['Comedy', 'Romance']  0.0432872
Roman Holiday (1953)            ['Comedy', 'Romance']  0.0406036
Notting Hill (1999)             ['Comedy', 'Romance']  0.0120491


---

**회고**

**학습 결과 벡터 내적 수치** <br/>


Similar Movie 벡터 내적이 가장 클 때 (model1):<br/>
Watched Movie : 0.54699767<br/>
Similar Movie : 0.5120277<br/>


Watched Movie 벡터 내적이 가장 클 때 (model2):<br/>
Watched Movie : 0.881379<br/>
Similar Movie : 0.23739904<br/>

* 두 모델의 결과가 크게 다르리라 예상했지만 크게 다르지 않았습니다. 모두 제법 적절한 추천 결과를 낼 수 있었습니다. (추천한 항목 자체도 비슷합니다.)
* 추천 Score 및 추천에 대한 기여도 Score는 다소 차이가 있었습니다. 모델 1의 경우 상대적으로 Score 가 높고, 모델 2의 경우 상대적으로 Score가 낮습니다.
* 다만, 모델 2의 경우 비슷한 아이템을 찾는 성능이 떨어지기 때문에 모델 1이 더 우수한 모델인 것으로 생각됩니다.