In [1]:
import numpy as np
import scipy
import implicit

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


In [22]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [23]:
#timestamp 제거
ratings=ratings.drop('timestamp',axis=1)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [24]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies['genre'] = movies['genre'].str.lower()
movies['title'] = movies['title'].str.lower()
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy
5,6,heat (1995),action|crime|thriller
6,7,sabrina (1995),comedy|romance
7,8,tom and huck (1995),adventure|children's
8,9,sudden death (1995),action
9,10,goldeneye (1995),action|adventure|thriller


In [7]:
movie_data = {v:k for k,v in zip(movies.movie_id,movies.title)}

# 분석
- ratings에 있는 유니크한 영화 개수
- ratings에 있는 유니크한 사용자 수
- 가장 인기 있는 영화 30개(인기순)

- 내가 선호하는 영화 5가지를 골라 ratings 추가
- CSR matrix
- als_model = AlternatingLeastSquares 모델
- 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악
- 내가 좋아하는 영화와 비슷한 영화를 추천
- 내가 가장 좋아할 만한 영화들을 추천

In [8]:
print("ratings에 있는 유니크한 영화 개수 : {}".format(ratings["movie_id"].nunique()))
print("ratings에 있는 유니크한 사용자 개수 : {}".format(ratings["user_id"].nunique()))

ratings에 있는 유니크한 영화 개수 : 3628
ratings에 있는 유니크한 사용자 개수 : 6039


In [9]:
movie_count = ratings.groupby('movie_id')['user_id'].count()
top_30 = movie_count.sort_values(ascending=False).head(30)

In [10]:
def get_movie_titles(list_of_idx):
    movie_titles = [title for title,idx in movie_data.items() if idx in list_of_idx]
    return movie_titles

In [11]:
def idx_from_movies(list_of_movies):
    idxs = [idx for title,idx in movie_data.items() if title in list_of_movies]
    return idxs

In [80]:
def get_idx(movie_title):
    idx = movie_data[movie_title]
    return idx

In [12]:
get_movie_titles(top_30.index.tolist())

['toy story (1995)',
 'braveheart (1995)',
 'star wars: episode iv - a new hope (1977)',
 'pulp fiction (1994)',
 'shawshank redemption, the (1994)',
 'forrest gump (1994)',
 'fugitive, the (1993)',
 'jurassic park (1993)',
 "schindler's list (1993)",
 'terminator 2: judgment day (1991)',
 'silence of the lambs, the (1991)',
 'fargo (1996)',
 'godfather, the (1972)',
 'e.t. the extra-terrestrial (1982)',
 'star wars: episode v - the empire strikes back (1980)',
 'princess bride, the (1987)',
 'raiders of the lost ark (1981)',
 'star wars: episode vi - return of the jedi (1983)',
 'terminator, the (1984)',
 'groundhog day (1993)',
 'back to the future (1985)',
 'men in black (1997)',
 'l.a. confidential (1997)',
 'saving private ryan (1998)',
 'shakespeare in love (1998)',
 'matrix, the (1999)',
 'ghostbusters (1984)',
 'sixth sense, the (1999)',
 'american beauty (1999)',
 'being john malkovich (1999)']

In [13]:
pd.set_option('display.max_rows', None)

In [14]:
movies[movies['title'].str.contains('(2000)')]

  movies[movies['title'].str.contains('(2000)')]


Unnamed: 0,movie_id,title,genre
1716,1772,blues brothers 2000 (1998),action|comedy|musical
3090,3159,fantasia 2000 (1999),animation|children's|musical
3121,3190,supernova (2000),adventure|sci-fi
3156,3225,down to you (2000),comedy|romance
3170,3239,isn't she great? (2000),comedy
3204,3273,scream 3 (2000),horror|mystery|thriller
3207,3276,gun shy (2000),comedy
3216,3285,"beach, the (2000)",adventure|drama
3217,3286,snow day (2000),comedy
3218,3287,"tigger movie, the (2000)",animation|children's


In [15]:
ratings['user_id'].unique()

array([   1,    2,    3, ..., 6038, 6039, 6040])

In [16]:
ratings.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [34]:
my_favorite = ['road to el dorado, the (2000)','mission: impossible 2 (2000)','me, myself and irene (2000)','x-men (2000)','digimon: the movie (2000)']

In [35]:
my_movie_list = pd.DataFrame({'user_id':[6041]*5, 'movie_id':idx_from_movies(my_favorite), 'ratings':[5]*5})

In [38]:
my_movie_list

Unnamed: 0,user_id,movie_id,ratings
0,6041,3483,5
1,6041,3623,5
2,6041,3752,5
3,6041,3793,5
4,6041,3945,5


In [51]:
ratings_new= pd.concat([ratings,my_movie_list]).reset_index(drop=True)
ratings_new.tail(10)

Unnamed: 0,user_id,movie_id,ratings
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,6041,3483,5
836479,6041,3623,5
836480,6041,3752,5
836481,6041,3793,5
836482,6041,3945,5


In [53]:
ratings_new.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [54]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings_new['user_id'].unique()
movie_unique = ratings_new['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
artist_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [56]:
temp_user_data = ratings_new['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings_new):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings_new['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

user_id column indexing OK!!


In [57]:
temp_movie_data = ratings_new['movie_id'].map(artist_to_idx.get).dropna()
if len(temp_movie_data ) == len(ratings_new):
    print('artist column indexing OK!!')
    ratings_new['movie_id'] = temp_movie_data 
else:
    print('artist column indexing Fail!!')

artist column indexing OK!!


In [58]:
ratings_new.head()

Unnamed: 0,user_id,movie_id,ratings
0,0,0,5
1,0,1,3
2,0,2,3
3,0,3,4
4,0,4,5


In [59]:
from scipy.sparse import csr_matrix

num_user = ratings_new['user_id'].nunique()
num_movies = ratings_new['movie_id'].nunique()

csr_data = csr_matrix((ratings_new['ratings'], (ratings_new['user_id'], ratings_new['movie_id'])), shape=(num_user, num_movies))

csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [60]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [61]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [62]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [63]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [67]:
idx_from_movies(['road to el dorado, the (2000)'])

[3483]

In [82]:
me, eldorado = user_to_idx[6041], artist_to_idx[get_idx('road to el dorado, the (2000)')]
me_vector, eldorado_vector = als_model.user_factors[me], als_model.item_factors[eldorado]

In [83]:
me_vector

array([ 2.3460896e-01,  1.1041111e+00,  8.2746577e-01,  5.2757627e-01,
       -4.6262321e-01, -1.4321464e+00, -2.9302028e-01,  2.2658376e-01,
       -5.5951590e-04,  2.2191159e-01, -4.1577274e-01, -5.1479477e-01,
       -3.9311373e-01, -3.6066848e-01, -4.6597391e-01,  4.4064379e-01,
        4.5344165e-01,  1.0158134e+00, -4.4908836e-01, -7.1654111e-01,
       -4.5298573e-01,  1.3535665e-01,  4.6675292e-01,  3.7312874e-01,
        1.1530358e-01, -1.6507988e-01,  6.4449590e-01,  3.9640898e-01,
        5.0952441e-01, -9.0060838e-02, -1.6129574e-01, -8.3771598e-01,
       -4.6668842e-01, -1.5255754e-01,  1.7974883e-01, -4.0185851e-01,
        6.3540506e-01, -1.7187710e-01,  2.5966504e-01,  3.1459638e-01,
       -4.2970333e-01,  1.7562819e-01,  3.9766082e-01, -1.2235065e-01,
        3.8898516e-01, -3.2925591e-01,  4.8862556e-01,  4.3440458e-01,
        4.0891290e-01, -7.1111143e-01, -6.7927837e-01, -3.6553940e-01,
       -1.5276900e-01, -2.3152727e-01, -3.1850800e-01, -2.6298317e-01,
      

In [84]:
eldorado_vector

array([ 1.53198540e-02,  5.88499382e-03, -9.94628426e-05,  5.92638180e-03,
        5.02775190e-03, -9.65501741e-03,  9.95400827e-04,  1.06523018e-02,
        1.71310063e-02,  1.68839656e-02, -7.95116182e-03,  8.90953187e-03,
        7.49003608e-03,  2.14888109e-03,  1.83281815e-03,  1.76203959e-02,
        5.13926614e-03,  6.52964134e-03, -1.14679364e-02, -5.35393460e-03,
       -6.42577419e-03, -1.69002661e-03,  9.09090601e-03,  8.18113983e-03,
        2.89940182e-03,  1.66552737e-02,  9.10530984e-03,  8.68806895e-03,
        4.20434913e-03, -5.25319343e-03,  8.23072996e-03, -1.41393244e-02,
       -3.96049954e-03,  6.07798155e-03,  2.21268064e-03,  5.95942792e-03,
        1.09668514e-02, -1.65528210e-03,  1.67001486e-02,  5.34983352e-03,
        4.13113413e-03, -1.10714689e-04,  1.01398835e-02,  4.96262079e-03,
        1.05655724e-02,  2.18569580e-03,  5.79496473e-03,  1.85799180e-03,
        2.71318504e-03, -7.66322436e-03, -9.62828565e-03, -1.39925617e-03,
       -3.18664219e-03, -

In [85]:
np.dot(me_vector, eldorado_vector)

0.16739385

In [86]:
queen = artist_to_idx[get_idx('mission: impossible 2 (2000)')]
queen_vector = als_model.item_factors[queen]
np.dot(me_vector, queen_vector)

0.6080707

In [87]:
artist_id = artist_to_idx[get_idx('mission: impossible 2 (2000)')]
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

[(482, 0.9999999),
 (327, 0.6472007),
 (67, 0.5746914),
 (903, 0.5503333),
 (1737, 0.53150445),
 (346, 0.5165638),
 (1537, 0.5047207),
 (633, 0.5027408),
 (463, 0.48598206),
 (792, 0.46748605),
 (372, 0.4673728),
 (746, 0.45676163),
 (782, 0.45395672),
 (1858, 0.43929908),
 (1031, 0.43451598)]

In [89]:
def get_similar_artist(movie_name: str):
    artist_id = artist_to_idx[get_idx(movie_name)]
    similar_artist = als_model.similar_items(artist_id)
    similar_artist = [idx_to_artist[i[0]] for i in similar_artist]
    return similar_artist

In [90]:
get_similar_artist('mission: impossible 2 (2000)')

[3623, 3717, 3578, 3744, 3452, 3753, 3555, 3593, 3510, 3798]

In [91]:
get_movie_titles(get_similar_artist('mission: impossible 2 (2000)'))

['romeo must die (2000)',
 'frequency (2000)',
 'u-571 (2000)',
 'gladiator (2000)',
 'battlefield earth (2000)',
 'mission: impossible 2 (2000)',
 'gone in 60 seconds (2000)',
 'shaft (2000)',
 'patriot, the (2000)',
 'what lies beneath (2000)']

In [92]:
user = user_to_idx[6041]
artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)

In [104]:
artist_recommended
rec = get_movie_titles([idx_to_artist[i[0]] for i in artist_recommended])
rec

['sleepy hollow (1999)',
 'scream 3 (2000)',
 'pitch black (2000)',
 'romeo must die (2000)',
 'frequency (2000)',
 'gladiator (2000)',
 'dinosaur (2000)',
 'road trip (2000)',
 'shanghai noon (2000)',
 'gone in 60 seconds (2000)',
 'shaft (2000)',
 'titan a.e. (2000)',
 'chicken run (2000)',
 'patriot, the (2000)',
 'perfect storm, the (2000)',
 'scary movie (2000)',
 'what lies beneath (2000)',
 'nutty professor ii: the klumps (2000)',
 'space cowboys (2000)',
 'cell, the (2000)']

In [97]:
rihanna = artist_to_idx[get_idx('scream 3 (2000)')]
explain = als_model.explain(user, csr_data, itemid=rihanna)

In [102]:
a = [(idx_to_artist[i[0]], i[1]) for i in explain[1]]
a

[(3752, 0.09008038870950245),
 (3623, 0.0789718829683423),
 (3793, 0.057724011239917654),
 (3483, 0.012569774012179215),
 (3945, 0.0009584823632744399)]

In [103]:
get_movie_titles([i[0] for i in a ])

['road to el dorado, the (2000)',
 'mission: impossible 2 (2000)',
 'me, myself and irene (2000)',
 'x-men (2000)',
 'digimon: the movie (2000)']

In [109]:
for i in rec :
    print(i, movies[movies['title']==i].genre)

sleepy hollow (1999) 3012    horror|romance
Name: genre, dtype: object
scream 3 (2000) 3204    horror|mystery|thriller
Name: genre, dtype: object
pitch black (2000) 3231    action|sci-fi
Name: genre, dtype: object
romeo must die (2000) 3383    action|romance
Name: genre, dtype: object
frequency (2000) 3441    drama|thriller
Name: genre, dtype: object
gladiator (2000) 3509    action|drama
Name: genre, dtype: object
dinosaur (2000) 3546    animation|children's
Name: genre, dtype: object
road trip (2000) 3548    comedy
Name: genre, dtype: object
shanghai noon (2000) 3555    action
Name: genre, dtype: object
gone in 60 seconds (2000) 3648    action|crime
Name: genre, dtype: object
shaft (2000) 3675    action|crime
Name: genre, dtype: object
titan a.e. (2000) 3676    adventure|animation|sci-fi
Name: genre, dtype: object
chicken run (2000) 3682    animation|children's|comedy
Name: genre, dtype: object
patriot, the (2000) 3684    action|drama|war
Name: genre, dtype: object
perfect storm, the 

# 회고
추천해준 것들을 보았을 때, 가장 큰 특징은 전부 2000년도에 분포되어있는 영화라는 것이다. 하지만, 내가 좋아하는 몇몇의 영화가 있는 것으로 보아 어느정도 추천을 잘 해주었다고 생각을 한다.