In [56]:
import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.max_rows=150
%matplotlib inline
import os

In [68]:
# Load the MovieLens-1M ratings table ('::'-separated, no header row).
# expanduser('~') is used instead of os.getenv('HOME'): when HOME is unset getenv returns
# None and the string concatenation would raise TypeError before the file is even opened.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# The two-character separator is why the Python parsing engine is requested.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
# Row count before any filtering (the misspelled name is kept because other cells read it).
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [70]:
# Keep only the rows rated 3 or higher.
ratings = ratings.loc[ratings['rating'].ge(3)]
filtered_data_size = ratings.shape[0]

# Report how much of the original data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [71]:
# Rename the 'rating' column to 'play_cnt'.
ratings = ratings.rename(columns={'rating': 'play_cnt'})
ratings.head()

Unnamed: 0,user_id,movie_id,play_cnt,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [72]:
# Load the movie metadata so that titles can be shown next to movie ids.
# expanduser('~') is used instead of os.getenv('HOME'): when HOME is unset getenv returns
# None and the string concatenation would raise TypeError before the file is even opened.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# The two-character separator is why the Python parsing engine is requested.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [73]:
# Attach title and genre to every rating row by joining on movie_id.
ml = ratings.merge(movies, on='movie_id')
ml.head()

Unnamed: 0,user_id,movie_id,play_cnt,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


2) 분석해 봅시다.

In [63]:
# Number of distinct movies present in the merged frame.
ml['movie_id'].nunique(dropna=True)

3628

In [64]:
# Number of distinct users present in the merged frame.
ml['user_id'].nunique(dropna=True)

6039

In [75]:
# The 30 most popular movies, ranked by the sum of play_cnt per (title, movie_id).
top_viewed = (
    ml.groupby(['title', 'movie_id'])[['play_cnt']]
    .sum()
    .sort_values(by='play_cnt', ascending=False)
)
top_viewed.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,play_cnt
title,movie_id,Unnamed: 2_level_1
American Beauty (1999),2858,14449
Star Wars: Episode IV - A New Hope (1977),260,13178
Star Wars: Episode V - The Empire Strikes Back (1980),1196,12648
Saving Private Ryan (1998),2028,11348
Star Wars: Episode VI - Return of the Jedi (1983),1210,11303
Raiders of the Lost Ark (1981),1198,11179
"Silence of the Lambs, The (1991)",593,11096
"Matrix, The (1999)",2571,10903
"Sixth Sense, The (1999)",2762,10703
Terminator 2: Judgment Day (1991),589,10513


3) 내가 선호하는 영화를 5가지 골라서 rating에 추가해 줍시다.

In [80]:
# Movie ids of my five favorites; each is recorded for user 'ilkyu' with play_cnt 5.
my_favorates = [356, 110, 2571, 2028, 858]
my_record = pd.DataFrame(
    {
        'user_id': ['ilkyu'] * len(my_favorates),
        'movie_id': my_favorates,
        'play_cnt': [5] * len(my_favorates),
    }
)
my_record

Unnamed: 0,user_id,movie_id,play_cnt
0,ilkyu,356,5
1,ilkyu,110,5
2,ilkyu,2571,5
3,ilkyu,2028,5
4,ilkyu,858,5


In [82]:
# Add my five ratings exactly once, so re-running this cell does not duplicate them.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. my_record has no timestamp column, so that column is left
# missing for the added rows.
if not ratings['user_id'].isin(['ilkyu']).any():
    ratings = pd.concat([ratings, my_record])

# Show the rows that now belong to 'ilkyu'.
ratings[ratings['user_id'] == 'ilkyu']

Unnamed: 0,user_id,movie_id,play_cnt,timestamp
0,ilkyu,356,5,
1,ilkyu,110,5,
2,ilkyu,2571,5,
3,ilkyu,2028,5,
4,ilkyu,858,5,


In [92]:
# Join the ratings (now including my own rows) with the movie metadata on movie_id.
mls = ratings.merge(movies, on='movie_id')
mls.head(3)

Unnamed: 0,user_id,movie_id,play_cnt,timestamp,title,genre
0,1,1193,5,978300760.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413.0,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179.0,One Flew Over the Cuckoo's Nest (1975),Drama


In [93]:
# The timestamp column is not used from here on.
mls = mls.drop(columns=['timestamp'])

4) CSR matrix를 직접 만들어 봅시다.

In [94]:
# Re-index user_id and movie_id as consecutive integers starting at 0, numbered in order of
# first appearance. Per the original note, this avoids the 'row index exceeds matrix
# dimensions' error when the CSR matrix is created.
# Despite their names, both dicts map a raw id to its consecutive index.

user_idx_to_unique = {raw_id: idx for idx, raw_id in enumerate(mls['user_id'].unique())}
movie_idx_to_unique = {raw_id: idx for idx, raw_id in enumerate(mls['movie_id'].unique())}

In [95]:
# Translate each raw id into its consecutive index, then discard entries whose lookup
# produced a missing value.
temp_user_data = mls['user_id'].map(user_idx_to_unique.get)
temp_user_data = temp_user_data.dropna()
temp_movie_data = mls['movie_id'].map(movie_idx_to_unique.get)
temp_movie_data = temp_movie_data.dropna()

In [96]:
# Overwrite the id columns with the consecutive indices computed above.
mls['user_id'] = temp_user_data
mls['movie_id'] = temp_movie_data

In [98]:
from scipy.sparse import csr_matrix

# Matrix dimensions: one row per distinct user index, one column per distinct movie index.
num_user = mls['user_id'].nunique()
num_movie = mls['movie_id'].nunique()

# Built from (values, (row indices, column indices)) triples.
csr_data = csr_matrix(
    (mls['play_cnt'], (mls['user_id'], mls['movie_id'])),
    shape=(num_user, num_movie),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [99]:
# Create the MF (matrix factorization) model

from implicit.als import AlternatingLeastSquares

# Set the OpenBLAS and MKL thread-count variables to '1' and KMP_DUPLICATE_LIB_OK to 'True'.
# NOTE(review): these are assigned after numpy, scipy and implicit have already been imported;
# such variables are presumably read when the native library loads — confirm that setting
# them at this point still has the intended effect, or move them ahead of the imports.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [100]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

# Per the original note, fit() takes an (item x user) matrix, so the user x item matrix
# is transposed first.
# NOTE(review): the expected orientation may depend on the installed implicit version — confirm.
csr_data_transpose = csr_data.T

# Train the model.
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

In [114]:
# Favorite movie: 'Forrest Gump (1994)'.
# Its item index is resolved from the title instead of being hard-coded as 160: the
# consecutive indices follow the row order of the merged frame, so a literal silently
# points at a different movie as soon as the data or the filtering changes.
favorite_title = 'Forrest Gump (1994)'
favorite_raw_id = movies.loc[movies['title'] == favorite_title, 'movie_id'].iloc[0]
ilkyu, favorates_160 = user_idx_to_unique['ilkyu'], movie_idx_to_unique[favorite_raw_id]
# Latent-factor vectors of the user and of the chosen movie.
ilkyu_vector, favorates_160_vector = als_model.user_factors[ilkyu], als_model.item_factors[favorates_160]

In [116]:
# Display the user-factor vector taken from the trained model for user 'ilkyu'.
ilkyu_vector

array([ 0.20863347,  0.2802035 , -1.0161401 , -0.08541259,  0.81470644,
        0.45199522, -0.33522764,  0.44561952,  0.13589475, -0.3178268 ,
        1.7671733 , -0.76173675, -0.08730287, -0.16226202,  0.27019256,
        0.31555557, -0.62074864, -0.9869566 ,  0.944152  ,  0.53811586,
       -0.3960776 ,  0.09238323,  1.0280362 , -0.613276  , -0.57270044,
       -0.20395637,  0.1683776 , -0.13416748, -0.534179  , -0.47485754,
       -1.0864431 ,  0.4661131 ,  0.19719566,  0.31928596, -0.57173055,
        0.62151146, -0.74985456,  0.27592328, -0.08653997,  0.3052086 ,
       -0.17253044, -0.03486191, -0.23679726,  0.24008466,  0.43358198,
        0.42353204,  0.74977416,  0.0101178 ,  0.83291596, -0.36754814,
        0.14463279,  0.06699761, -0.42036882,  0.37525332,  1.013756  ,
       -0.3729454 , -0.36072636,  0.37348765, -0.11506078, -0.09336243,
       -0.533231  ,  0.20738935, -0.04161833,  0.11858159, -0.3935385 ,
       -0.42351988,  1.2731065 ,  1.5188714 , -0.42300233, -0.66

In [117]:
# Display the item-factor vector taken from the trained model for the chosen favorite movie.
favorates_160_vector

array([ 0.01317676,  0.01230152, -0.01234516,  0.02977359, -0.01722229,
       -0.01419672, -0.01498077,  0.00253268,  0.01781275, -0.01195354,
        0.0580948 ,  0.00563855, -0.01842186,  0.01167453,  0.01438227,
        0.01087544, -0.02496927, -0.0260774 ,  0.02079916,  0.00555574,
       -0.02618838,  0.02139116,  0.05151825,  0.01271356, -0.00734749,
        0.01316839,  0.02776296, -0.00190621,  0.00346233, -0.0144559 ,
        0.00750928,  0.01004755, -0.00156232,  0.01955355,  0.0251833 ,
        0.0087739 , -0.01572334,  0.01332949,  0.02441956,  0.03028074,
        0.03207555,  0.01247574, -0.02328686,  0.00247509,  0.01705403,
        0.01665779,  0.04335482,  0.0136705 ,  0.01582545, -0.01161457,
        0.03336188,  0.04662972, -0.02486606,  0.01969924,  0.04364502,
        0.00213436,  0.03278673,  0.00423788,  0.00464227, -0.02392639,
       -0.04568443,  0.00574954, -0.01256078, -0.0030646 ,  0.04159946,
        0.00715694,  0.0406715 ,  0.01887739,  0.03681822, -0.01

In [118]:
# The model's predicted preference for my movie: inner product of the two factor vectors.
ilkyu_vector.dot(favorates_160_vector)

0.5701579

7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [126]:
favorite_movie = 'Forrest Gump (1994)'
# Look up the movie_id stored in mls for this title (first distinct value).
movie_id = mls.loc[mls['title'] == favorite_movie, 'movie_id'].unique()[0]
# Ask the trained model for the 5 most similar items.
similar_movie = als_model.similar_items(movie_id, N=5)
similar_movie

[(160, 1.0000001),
 (110, 0.5964449),
 (508, 0.48597613),
 (336, 0.47566465),
 (154, 0.47361597)]

In [127]:
# Match each returned movie_id with its title(s).
# The first element of every (movie_id, score) pair is the id.
movie_id_rcmd = [pair[0] for pair in similar_movie]

# One list of distinct titles per id, in the same order as the ids.
movie_titles = [
    list(mls.loc[mls['movie_id'] == rcmd_id, 'title'].unique())
    for rcmd_id in movie_id_rcmd
]
movie_titles

[['Forrest Gump (1994)'],
 ['Groundhog Day (1993)'],
 ['Ghost (1990)'],
 ['Pretty Woman (1990)'],
 ['As Good As It Gets (1997)']]

8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [130]:
# Consecutive index assigned to user 'ilkyu'.
user = user_idx_to_unique['ilkyu']

# recommend() is given the user x item CSR matrix.
movie_picked = als_model.recommend(
    user,
    csr_data,
    N=10,
    filter_already_liked_items=True,
)
movie_picked

[(380, 0.5330708),
 (23, 0.51316655),
 (92, 0.4435858),
 (141, 0.37461305),
 (487, 0.33929136),
 (64, 0.33928803),
 (121, 0.3328874),
 (157, 0.3305613),
 (99, 0.31954587),
 (44, 0.31853843)]

In [132]:
# Match each recommended movie_id with its title(s).
# The first element of every (movie_id, score) pair is the id.
movie_id_rcmd = [pair[0] for pair in movie_picked]

# One list of distinct titles per id, in the same order as the ids.
movie_titles = [
    list(mls.loc[mls['movie_id'] == rcmd_id, 'title'].unique())
    for rcmd_id in movie_id_rcmd
]
movie_titles

[['Godfather: Part II, The (1974)'],
 ["Schindler's List (1993)"],
 ['Terminator 2: Judgment Day (1991)'],
 ['Fugitive, The (1993)'],
 ['Boat, The (Das Boot) (1981)'],
 ['Star Wars: Episode VI - Return of the Jedi (1983)'],
 ['Silence of the Lambs, The (1991)'],
 ['Shawshank Redemption, The (1994)'],
 ['American Beauty (1999)'],
 ['Star Wars: Episode IV - A New Hope (1977)']]