# 1. 데이터 가져오기

### (1) ratings.dat

In [1]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
print(ratings.head())
print(len(ratings))

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291
1000209


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   count      836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [5]:
# count 값 별 비율 확인

group_by_count = ratings.groupby('count')['user_id'].count()
print(group_by_count.sort_values(ascending=False))
group_by_count = group_by_count.to_dict()
group_by_count

total_num = len(ratings)
print("total : ", total_num)

for key, value in group_by_count.items():
    print(f'{key} proportion : {value/total_num :.2%}')

count
4    348971
3    261197
5    226310
Name: user_id, dtype: int64
total :  836478
3 proportion : 31.23%
4 proportion : 41.72%
5 proportion : 27.06%


### (2) movies.dat

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


# 2. 데이터 살펴보기

- ratings에 있는 유니크한 영화 개수
- ratings에 있는 유니크한 사용자 수
- 가장 인기있는 영화 30개(인기순)


### (1) ratings에 존재하는 유니크한 영화 개수 확인

In [8]:
# ratings에 있는 유니크한 영화 개수

unique_movie_num = len(ratings['movie_id'].unique())
print("ratings file 내 유니크한 영화 개수 : ", unique_movie_num)

# ratings 파일의 unique한 영화 개수와 movies 파일의 영화 개수가 같은지 확인
print(unique_movie_num == len(movies))
print("movies file 내 유니크한 영화 개수 : ", len(movies))

ratings file 내 유니크한 영화 개수 :  3628
False
movies file 내 유니크한 영화 개수 :  3883


### (2) ratins에 존재하는 유니크한 사용자 수 확인

In [9]:
unique_user_num = len(ratings['user_id'].unique())
unique_user_num

6039

### (3) 가장 인기있는 영화 30개 추출

In [10]:
# 1) ratings / movies df 합치기
merged_list = pd.merge(ratings, movies, how='left')
merged_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 836477
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    836478 non-null  int64 
 1   movie_id   836478 non-null  int64 
 2   count      836478 non-null  int64 
 3   timestamp  836478 non-null  int64 
 4   title      836478 non-null  object
 5   genre      836478 non-null  object
dtypes: int64(4), object(2)
memory usage: 44.7+ MB


In [11]:
# 2) 인기있는 상위 30개 영화 추출 (누적 평점 기준)
total_ratings = merged_list.groupby("title")["count"].sum()
total_ratings.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   14449
Star Wars: Episode IV - A New Hope (1977)                13178
Star Wars: Episode V - The Empire Strikes Back (1980)    12648
Saving Private Ryan (1998)                               11348
Star Wars: Episode VI - Return of the Jedi (1983)        11303
Raiders of the Lost Ark (1981)                           11179
Silence of the Lambs, The (1991)                         11096
Matrix, The (1999)                                       10903
Sixth Sense, The (1999)                                  10703
Terminator 2: Judgment Day (1991)                        10513
Fargo (1996)                                             10465
Schindler's List (1993)                                  10317
Braveheart (1995)                                        10125
Shawshank Redemption, The (1994)                         10085
Back to the Future (1985)                                10081
Godfather, The (1972)                            

# 3. 내가 선호하는 영화 5가지를 골라서 ratings에 추가

### (1) 리스트 내 가장 선호하는 영화 5개 선정

In [12]:
# 평점 5점 받은 영화 제목 리스트 추출
len(set(merged_list[merged_list["count"].values==5]['title']))
set(merged_list[merged_list["count"].values==5]['title'])

{'Airport (1970)',
 'Alvarez Kelly (1966)',
 'Snow Falling on Cedars (1999)',
 'Big Trees, The (1952)',
 'Hunt for Red October, The (1990)',
 'Trial and Error (1997)',
 'Live Nude Girls (1995)',
 'Wing Commander (1999)',
 'American Beauty (1999)',
 'Wonder Boys (2000)',
 'Boys from Brazil, The (1978)',
 'Beyond the Mat (2000)',
 'Coneheads (1993)',
 'Confessional, The (Le Confessionnal) (1995)',
 'Yojimbo (1961)',
 'I Confess (1953)',
 'Homeward Bound: The Incredible Journey (1993)',
 'No Looking Back (1998)',
 'Madeline (1998)',
 'Species II (1998)',
 'Private Parts (1997)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Nighthawks (1981)',
 'Fabulous Baker Boys, The (1989)',
 'Red Sonja (1985)',
 'Shine (1996)',
 'Clean Slate (Coup de Torchon) (1981)',
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)',
 'Jungle Fever (1991)',
 'Loser (2000)',
 'Cool Dry Place, A (1998)',
 '42 Up (1998)',
 'Rocketship X-M (1950)',
 'Before and After (1996)',
 'Enemy of the State (1998)',

In [13]:
# 2000년 이후 개봉한 영화 추출
titles = merged_list["title"].tolist()
opening_years  = set([x[-5:-1] for x in titles])
print(sorted(opening_years))
merged_list['opening_years'] = pd.to_numeric([x[-5:-1] for x in titles])
merged_list.head()
merged_list.info()
set(merged_list[merged_list["opening_years"].values==2000]['title'])

['1919', '1920', '1921', '1922', '1923', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 836477
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        836478 non-null  int64 
 1   movie_id       836478 non-null  int64 
 2   count          836478 non-null  int64 
 3   timestamp      8

{'28 Days (2000)',
 '3 Strikes (2000)',
 'About Adam (2000)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Almost Famous (2000)',
 'American Psycho (2000)',
 'Anatomy (Anatomie) (2000)',
 'Art of War, The (2000)',
 'Autumn in New York (2000)',
 'Back Stage (2000)',
 'Bait (2000)',
 "Ballad of Ramblin' Jack, The (2000)",
 'Bamboozled (2000)',
 'Battlefield Earth (2000)',
 'Beach, The (2000)',
 'Beautiful (2000)',
 'Best in Show (2000)',
 'Beyond the Mat (2000)',
 'Big Kahuna, The (2000)',
 "Big Momma's House (2000)",
 'Bittersweet Motel (2000)',
 'Bless the Child (2000)',
 'Boiler Room (2000)',
 'Bootmen (2000)',
 'Boys and Girls (2000)',
 'Bring It On (2000)',
 'Broken Hearts Club, The (2000)',
 'Butterfly (La Lengua de las Mariposas) (2000)',
 'Catfish in Black Bean Sauce (2000)',
 'Cecil B. Demented (2000)',
 'Cell, The (2000)',
 'Center Stage (2000)',
 'Chain of Fools (2000)',
 'Chicken Run (2000)',
 'Chuck & Buck (2000)',
 'Circus (2000)',
 'Closer You Get, The (2000)',
 'C

### (2) 선정한 영화 movie_id 추출

In [14]:
# 내가 좋아하는 영화 5개 movie id 구하기 : man in black / Titanic / Bring it on / Sixth Sense / jumanji

movies['title'] = movies['title'].str.lower()
favorite_list = movies[movies['title'].str.contains("jumanji|bring it on|titanic|sixth sense|men in black")]
print(favorite_list)
favorite_movie_id = [2, 1721, 2762, 1580, 3882]

      movie_id                                   title  \
1            2                          jumanji (1995)   
1539      1580                     men in black (1997)   
1672      1721                          titanic (1997)   
2088      2157  chambermaid on the titanic, the (1998)   
2693      2762                 sixth sense, the (1999)   
3334      3403                raise the titanic (1980)   
3335      3404                          titanic (1953)   
3812      3882                      bring it on (2000)   

                               genre  
1       Adventure|Children's|Fantasy  
1539  Action|Adventure|Comedy|Sci-Fi  
1672                   Drama|Romance  
2088                         Romance  
2693                        Thriller  
3334                  Drama|Thriller  
3335                    Action|Drama  
3812                          Comedy  


In [15]:
# 선정한 영화가 ratings리스트에도 존재하는지 확인 => 해당과정 필요 X because merged_list 자체가 ratins df를 바탕으로 생성한 Df이기 때문

ratings_movie_id = ratings['movie_id'].tolist()
ratings_movie_id

for i in favorite_movie_id:
    if i in ratings_movie_id:
        print(i, "checked")
    else :
        print("no exist", i)

2 checked
1721 checked
2762 checked
1580 checked
3882 checked


### (3) ratings에 리스트 추가

In [16]:
# ratings 에 데이터 추가하기 전, 형태 및 데이터 확인

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   count      836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [17]:
ratings.tail()

Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


In [18]:
unique_user_num

6039

In [19]:
# timestampt 칼럼은 무의미한 칼럼으로 drop

ratings = ratings.drop('timestamp', axis=1)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   836478 non-null  int64
 1   movie_id  836478 non-null  int64
 2   count     836478 non-null  int64
dtypes: int64(3)
memory usage: 25.5 MB


In [20]:
# ratings에 값 추가

my_favorites = pd.DataFrame({"user_id":[10000]*5, "movie_id":favorite_movie_id, "count":[5]*5})

if not ratings.isin({"user_id" : [10000]})["user_id"].any():
    ratings = ratings.append(my_favorites, ignore_index=True)
    
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,10000,2,5
836479,10000,1721,5
836480,10000,2762,5
836481,10000,1580,5
836482,10000,3882,5


# 4. CSR matrix 만들기

In [21]:
# CSR Matrix 생성
from scipy.sparse import csr_matrix

# 유니크한 유저수와 영화수 재할당
unique_user_num = ratings['user_id'].nunique()
unique_movie_num = ratings['movie_id'].nunique()
print("num of users :", unique_user_num)
print("num of movies :",unique_movie_num)
print(type(unique_user_num))
print(type(unique_movie_num))


num of users : 6040
num of movies : 3628
<class 'int'>
<class 'int'>


In [22]:
ratings.count()

user_id     836483
movie_id    836483
count       836483
dtype: int64

In [23]:
csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)))
csr_data

<10001x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

**[ Troubleshooting ]**

1) <font color='red'> TypeError: len() of unsized object</font> 
> csr_data = csr_matrix((ratings.count, (ratings.user_id, ratings.movie_id)), shape = (unique_user_num, unique_movie_num))  

- cause : 컬럼명으로 사용된 count 거 예약어이기 때문에 의도대로 원하는 값을 불러올 수가 없음
- solution : ratings['count']

2) <font color='red'>  ValueError: row index exceeds matrix dimensions </font> 
> csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)), shape = (unique_user_num, unique_movie_num))
csr_data  

- cause : shape 구문에서 규정한 사이즈와 실제 데이터 사이즈가 다름에 따라 발생하는 에러
- solution : shape 이하 구문을 생략해서 본 데이터 사이즈에 맞춰 자동 생성되게 한다.

In [24]:
print(ratings['movie_id'].nunique())
print(ratings['movie_id'].max())
print(movies['movie_id'].nunique())
print(movies['movie_id'].max())

3628
3952
3883
3952


# 5. als_model = AlternatingLeastSquares 모델을 구성하여 Training

In [25]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

**[ AlternatingLeastSquares 클래스의 __init__ 파라미터 ]**

1. factors : 유저와 아이템의 벡터를 몇 차원으로 할 것인지
2. regularization : 과적합을 방지하기 위해 정규화 값을 얼마나 사용할 것인지
3. use_gpu : GPU를 사용할 것인지
4. iterations : epochs와 같은 의미, 데이터를 몇 번 반복해서 학습할 것인지 (iterations 수를 늘릴수록 학습데이터를 잘 학습하게 되지만 과적합의 우려가 있어 좋은 값을 찾아야 함)

In [26]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [27]:
# als 모델의 input 값으로 csr_data matrix transformation
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x10001 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [28]:
# 모델 훈련
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




# 6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도 파악하기

In [29]:
# 추가한 데이터 확인
ratings.tail

<bound method NDFrame.tail of         user_id  movie_id  count
0             1      1193      5
1             1       661      3
2             1       914      3
3             1      3408      4
4             1      2355      5
...         ...       ...    ...
836478    10000         2      5
836479    10000      1721      5
836480    10000      2762      5
836481    10000      1580      5
836482    10000      3882      5

[836483 rows x 3 columns]>

In [30]:
# 츄가한 영화 제목 확인 
for i in favorite_movie_id:
    print(movies[movies['movie_id']==i])

   movie_id           title                         genre
1         2  jumanji (1995)  Adventure|Children's|Fantasy
      movie_id           title          genre
1672      1721  titanic (1997)  Drama|Romance
      movie_id                    title     genre
2693      2762  sixth sense, the (1999)  Thriller
      movie_id                title                           genre
1539      1580  men in black (1997)  Action|Adventure|Comedy|Sci-Fi
      movie_id               title   genre
3812      3882  bring it on (2000)  Comedy


### (1) 내가 선택한 영화 ' 타이타닉'과 나의 선호도 파악

In [31]:
# 추가한 '10000' 유저의 백터와 'titanic'의 백터를 어떻게 만들고 있는지 확인하기

user10000_vector, titanic_vector = als_model.user_factors[10000], als_model.item_factors[1721]
print("완료")

완료


In [32]:
print(user10000_vector.shape)
user10000_vector

(100,)


array([-0.15485366,  0.00587152, -0.527123  , -0.66379166,  0.6365566 ,
       -0.16215523,  0.34416056, -0.35515732,  1.3545563 ,  0.24910602,
       -0.02668373, -0.7115441 , -1.0106754 , -0.00318021, -0.39392155,
       -0.7360834 , -0.3134013 ,  0.30116406,  0.6826009 , -0.20856261,
        0.34526923, -0.36023992,  0.81499195, -0.07856327, -0.2535058 ,
       -0.40413114, -0.07197443,  0.158241  ,  0.2817828 ,  0.27024058,
       -0.21986106, -0.2320144 , -0.9004859 , -0.35671777, -0.04379715,
        0.07591894,  0.43962854,  0.6838561 ,  0.22825454, -0.24114673,
       -0.7585877 , -0.28593627,  0.29900292,  0.5088127 ,  0.30690515,
        0.83659744,  0.4926428 , -0.78909063,  0.1498785 ,  0.05585463,
        0.5789976 ,  0.13129555,  0.2241135 , -0.49097627,  0.5603697 ,
        0.8538803 ,  0.38715824,  0.41527092,  0.18671939, -0.7569087 ,
        0.09296806, -0.6139018 , -0.88182575,  0.20039661,  0.4136079 ,
        0.88456476, -0.58233494,  0.31031987, -0.15983468,  0.46

In [33]:
print(titanic_vector.shape)
titanic_vector

(100,)


array([ 0.03646321,  0.03637603,  0.00936662, -0.01892317,  0.03389367,
        0.00387308,  0.01507   ,  0.01957535,  0.01750313,  0.02221978,
        0.00398397, -0.01316047, -0.04257184,  0.03105672,  0.00168813,
       -0.03272793,  0.00345956, -0.0125855 ,  0.01827706, -0.02290587,
        0.01177529,  0.03004378,  0.02193276, -0.00084914,  0.00450119,
       -0.03652236, -0.01561611,  0.01770765,  0.00975609, -0.01887921,
       -0.00658988, -0.00531659, -0.03057518, -0.01505802, -0.00977424,
        0.01963142,  0.01584573, -0.00047986,  0.03868895,  0.00998276,
       -0.00317447, -0.00460015,  0.01520628,  0.02522903,  0.01897725,
        0.02250272,  0.02865112,  0.00083656, -0.02564187, -0.03423566,
        0.00928839,  0.03631769,  0.02259902, -0.01815248, -0.0027781 ,
        0.0030371 , -0.00705656,  0.02980572,  0.01530501, -0.0162662 ,
        0.00480701,  0.02880544, -0.02809064,  0.00098505,  0.05037006,
        0.01519999, -0.02892681,  0.01639227,  0.0040879 , -0.00

In [34]:
# 10000번 유저의 타이타닉에 대한 선호도 확인
np.dot(user10000_vector, titanic_vector)

0.6052655

### (2) 내가 선택하지 않은 영화 '매트릭스'와 나의 선호도 파악

In [35]:
matrix_movieid = int(movies[movies['title'].str.contains('matrix')]["movie_id"])
matrix_vector = als_model.item_factors[matrix_movieid]
print(matrix_vector.shape) # matrix_movieid를 int 로 감싸줘야만 (100,) 형태 가짐
print(matrix_vector)
np.dot(user10000_vector, matrix_vector)

(100,)
[-0.00011935 -0.00302787 -0.00120674  0.01207919  0.00432333 -0.00044214
  0.01131729  0.02860696  0.05224355  0.02287759  0.03382428 -0.00220617
  0.02036303  0.00892749  0.00900753  0.01866359 -0.00849398  0.02873229
  0.0174605   0.00995183 -0.00762659  0.00900482 -0.00317955  0.0181101
  0.01040156 -0.00364809  0.00287554  0.00593378  0.01670445  0.02817294
  0.00744762  0.03833121 -0.03045356  0.01501226  0.00399758 -0.00809878
  0.01163408  0.02822091 -0.02250703 -0.00806589 -0.00135909 -0.00134379
 -0.00686108  0.00290649 -0.01754591  0.01755019  0.02211455 -0.01802709
  0.00822004 -0.0118272   0.01481293 -0.00019505  0.05074043  0.01231276
  0.031034    0.00461756  0.00881868  0.01128515 -0.00688708 -0.00206497
  0.00066613 -0.00062054  0.03089213  0.02506819 -0.00843355  0.02633294
 -0.01396126 -0.00420864 -0.04542542  0.01481231 -0.01389618 -0.00240127
 -0.01445806 -0.02662243  0.02797407 -0.00577017 -0.0008619   0.00515603
 -0.00254572 -0.00202115  0.01070411  0.00972

0.27675202

# 7. 내가 좋아하는 영화와 비슷한 영화를 추천받기

In [36]:
# 함수화 하기

def get_similar_movie(movie_id):
    
    similar_movieid = als_model.similar_items(movie_id)    
    
    similar_movies = []
    pereference = []
    for i in similar_movieid:
        
        movie_name = str(movies[movies['movie_id'] == i[0]]['title'].values)[2:-2]
        similar_movies.append(movie_name)
        pereference.append(f'{i[1]:.2%}')
       
    return dict(zip(similar_movies, pereference))

In [37]:
for i in favorite_movie_id:
    print(get_similar_movie(i))
    print("------------------------------------")

{'jumanji (1995)': '100.00%', 'hook (1991)': '82.10%', 'indian in the cupboard, the (1995)': '79.84%', 'dragonheart (1996)': '75.80%', 'space jam (1996)': '67.42%', 'flubber (1997)': '66.66%', 'borrowers, the (1997)': '65.96%', 'neverending story ii: the next chapter, the (1990)': '64.99%', 'small soldiers (1998)': '63.42%', 'santa clause, the (1994)': '62.96%'}
------------------------------------
{'titanic (1997)': '100.00%', 'jerry maguire (1996)': '49.29%', "mr. holland's opus (1995)": '45.64%', 'held up (2000)': '42.82%', 'ever after: a cinderella story (1998)': '41.44%', "you've got mail (1998)": '39.87%', 'dirty dancing (1987)': '38.14%', 'truman show, the (1998)': '38.08%', 'city of angels (1998)': '37.71%', 'bridges of madison county, the (1995)': '36.44%'}
------------------------------------
{'sixth sense, the (1999)': '100.00%', 'silence of the lambs, the (1991)': '45.82%', 'usual suspects, the (1995)': '45.28%', 'fight club (1999)': '40.64%', 'being john malkovich (1999)':

# 8. 내가 가장 좋아할 만한 영화들을 추천받기

In [38]:
# AlternatingLeastSquares 클래스에 구현되어 있는 recommend 메서드를 활용하여 아티스트 추천
# recommend에서는 user*item CSR Matrix를 받습니다.

movie_recommended = als_model.recommend(10000, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(2628, 0.52016777),
 (480, 0.47111142),
 (589, 0.31567892),
 (593, 0.29165736),
 (2571, 0.276752),
 (648, 0.26648262),
 (3175, 0.25654557),
 (2858, 0.23456624),
 (356, 0.22791441),
 (1544, 0.21054552),
 (2916, 0.20990393),
 (3948, 0.20312671),
 (780, 0.20115575),
 (1961, 0.19857265),
 (3101, 0.19378042),
 (3543, 0.18637577),
 (1393, 0.1853965),
 (3489, 0.1827113),
 (920, 0.18181887),
 (367, 0.18107828)]

In [39]:
[str(movies[movies['movie_id'] == i[0]]['title'].values) for i in movie_recommended]

["['star wars: episode i - the phantom menace (1999)']",
 "['jurassic park (1993)']",
 "['terminator 2: judgment day (1991)']",
 "['silence of the lambs, the (1991)']",
 "['matrix, the (1999)']",
 "['mission: impossible (1996)']",
 "['galaxy quest (1999)']",
 "['american beauty (1999)']",
 "['forrest gump (1994)']",
 "['lost world: jurassic park, the (1997)']",
 "['total recall (1990)']",
 "['meet the parents (2000)']",
 "['independence day (id4) (1996)']",
 "['rain man (1988)']",
 "['fatal attraction (1987)']",
 "['diner (1982)']",
 "['jerry maguire (1996)']",
 "['hook (1991)']",
 "['gone with the wind (1939)']",
 "['mask, the (1994)']"]