In [70]:
import numpy as np
import scipy
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.23.5
1.9.1
0.6.2


# 파일 불러오기
* 별점 데이터는 explicit 데이터이지만 implicit으로 간주하고 진행
* 별점을 시청횟수로 해석해서 진행
* 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외 후 진행

In [71]:
rating_file_path = './data/recommendation/movie/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 데이터 준비와 전처리

In [72]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [73]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [74]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path='./data/recommendation/movie/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [75]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_unique = movies['movie_id'].unique()

# ratings에 있는 유니크한 사용자 수
print('유니크한 사용자 수', len(user_unique))
# ratings에 있는 유니크한 영화 개수
print('유니크한 영화 수', len(movie_unique))
# 가장 인기있는 영화 30개(인기순)
print('Top 30 영화 : ')
display(pd.merge(ratings.groupby('movie_id')['user_id'].count(), movies, on='movie_id').\
    rename(columns={'user_id':'count'}).sort_values('count', ascending=False).head(30))

유니크한 사용자 수 6039
유니크한 영화 수 3883
Top 30 영화 : 


Unnamed: 0,movie_id,count,title,genre
2600,2858,3211,American Beauty (1999),Comedy|Drama
249,260,2910,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1080,1196,2885,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1094,1210,2716,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1810,2028,2561,Saving Private Ryan (1998),Action|Drama|War
569,589,2509,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
573,593,2498,"Silence of the Lambs, The (1991)",Drama|Thriller
1082,1198,2473,Raiders of the Lost Ark (1981),Action|Adventure
1152,1270,2460,Back to the Future (1985),Comedy|Sci-Fi
2325,2571,2434,"Matrix, The (1999)",Action|Sci-Fi|Thriller


In [76]:
ratings = pd.merge(ratings, movies, on='movie_id').drop(['timestamp','genre'], axis=1)
ratings

Unnamed: 0,user_id,movie_id,counts,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
836473,5851,3607,5,One Little Indian (1973)
836474,5854,3026,4,Slaughterhouse (1987)
836475,5854,690,3,"Promise, The (Versprechen, Das) (1994)"
836476,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)"


## user, 영화 추가

In [77]:
my_favorite = [3070, 187, 29, 1666, 3921] # 5개 랜덤추출

# 새로운 유저 추가, 이 유저가 위 영화를 counts만큼 봤다고 하겠습니다
my_playlist = pd.DataFrame({'user_id': [6041]*5, 'movie_id': my_favorite,
                            'counts':[3,4,5,4,3], 'title': movies[movies['movie_id'].isin(my_favorite)]['title']})

if not ratings.isin({'user_id': [6041]})['user_id'].any():  # user_id에 'jj'가 없다면
    ratings = pd.concat([ratings, my_playlist])
ratings              

Unnamed: 0,user_id,movie_id,counts,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
28,6041,3070,3,"City of Lost Children, The (1995)"
185,6041,187,4,Party Girl (1995)
1620,6041,29,5,Hugo Pool (1997)
3001,6041,1666,4,Adventures of Buckaroo Bonzai Across the 8th D...


In [78]:
user_unique = ratings['user_id'].unique()
movie_unique = movies['movie_id'].unique()
# 유저, 영화 id indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
idx_to_user = {v:k for k,v in user_to_idx.items()}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}
idx_to_movie = {v:k for k,v in movie_to_idx.items()}

In [79]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(movie_unique) == len(movies):
    print('movie column indexing OK!!')    
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,title
0,0,1176,5,One Flew Over the Cuckoo's Nest (1975)
1,1,1176,5,One Flew Over the Cuckoo's Nest (1975)
2,2,1176,4,One Flew Over the Cuckoo's Nest (1975)
3,3,1176,4,One Flew Over the Cuckoo's Nest (1975)
4,4,1176,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
28,6039,3001,3,"City of Lost Children, The (1995)"
185,6039,185,4,Party Girl (1995)
1620,6039,28,5,Hugo Pool (1997)
3001,6039,1620,4,Adventures of Buckaroo Bonzai Across the 8th D...


## csr matrix

원래의 user-item matrix는 매우 sparse 하므로 csr_matrix화 해준다

In [80]:
# matrix = pd.pivot_table(ratings, index='user_id', columns='movie_id', values='counts').fillna(0)
# matrix

In [81]:
# from scipy.sparse import csr_matrix

# csr_matrix = csr_matrix(matrix.astype(pd.SparseDtype("float64",0)).sparse.to_coo())

csr_matrix 활용

In [82]:
num_user = ratings['user_id'].nunique()
num_movie = movies['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3883 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [83]:
from implicit.als import AlternatingLeastSquares
import os

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [84]:
alpha_val = 40 #count에 대한 중요도를 더 높히기 위해
data_conf = (csr_data * alpha_val).astype('double')
als_model.fit(data_conf)

100%|██████████| 30/30 [00:19<00:00,  1.53it/s]


## 결과

### my_favorite 영화들에 대한 선호도

In [103]:
user1 = user_to_idx[6041]
for i in range(len(my_favorite)):
    item1 = movie_to_idx[my_favorite[i]]
    m = movies[movies['movie_id']==idx_to_movie[item1]]['title'].iloc[0]
    user1_vector, item1_vector = als_model.user_factors[user1], als_model.item_factors[item1]
    print(f'좋아하는 영화{i} {m[:10]}.. 에 대한 선호도: {np.dot(user1_vector, item1_vector)}')

좋아하는 영화0 Adventures.. 에 대한 선호도: 0.8968951106071472
좋아하는 영화1 Party Girl.. 에 대한 선호도: 0.8903246521949768
좋아하는 영화2 City of Lo.. 에 대한 선호도: 0.9550532102584839
좋아하는 영화3 Hugo Pool .. 에 대한 선호도: 0.7792758941650391
좋아하는 영화4 Beach Part.. 에 대한 선호도: 0.7998638153076172


### 무작위 영화에 대한 선호도

In [116]:
user1, item1 = user_to_idx[6041], movie_to_idx[movies.movie_id.sample(1).iloc[0]]
user1_vector, item1_vector = als_model.user_factors[user1], als_model.item_factors[item1]
np.dot(user1_vector, item1_vector)

-0.38053557

### 비슷한 영화 추천받기

In [None]:
#Get similar items
item_id = idx_to_movie[my_favorite[0]]
n_similar = 3
similar = als_model.similar_items(item_id, n_similar)

recommended = movies[movies['movie_id'].isin([idx_to_movie[x] for x in similar[0]])]['title']
for i in range(n_similar):
    print('----'*10)
    print(f'영화id {item_id} 기반 추천 영화 {i+1}:')
    print(f'{recommended.iloc[i]}, 추천정도:{round(similar[1][i]*100,2)}%')

----------------------------------------
영화id 3139 기반 추천 영화 1:
Houseguest (1994), 추천정도:100.0%
----------------------------------------
영화id 3139 기반 추천 영화 2:
Shadow, The (1994), 추천정도:66.03%
----------------------------------------
영화id 3139 기반 추천 영화 3:
Loaded Weapon 1 (1993), 추천정도:63.68%


### 내가 가장 좋아할 만한 영화

In [138]:
#Get Recommendations
user_id = user_to_idx[6041]
recommended = als_model.recommend(user_id, csr_data[user_id])
r_movies = movies[movies['movie_id'].isin([idx_to_movie[x] for x in recommended[0]])]['title']
for i in range(len(r_movies)):
    print('----'*10)
    print(f'{idx_to_user[user_id]}님께 추천드리는 영화 {i+1}:')
    print(f'{r_movies.iloc[0]}, 추천정도:{round(recommended[1][i]*100,2)}%')

----------------------------------------
6041님께 추천드리는 영화 1:
Kicking and Screaming (1995), 추천정도:71.86%
----------------------------------------
6041님께 추천드리는 영화 2:
Kicking and Screaming (1995), 추천정도:69.95%
----------------------------------------
6041님께 추천드리는 영화 3:
Kicking and Screaming (1995), 추천정도:65.61%
----------------------------------------
6041님께 추천드리는 영화 4:
Kicking and Screaming (1995), 추천정도:64.97%
----------------------------------------
6041님께 추천드리는 영화 5:
Kicking and Screaming (1995), 추천정도:61.51%
----------------------------------------
6041님께 추천드리는 영화 6:
Kicking and Screaming (1995), 추천정도:58.24%
----------------------------------------
6041님께 추천드리는 영화 7:
Kicking and Screaming (1995), 추천정도:55.67%
----------------------------------------
6041님께 추천드리는 영화 8:
Kicking and Screaming (1995), 추천정도:54.85%
----------------------------------------
6041님께 추천드리는 영화 9:
Kicking and Screaming (1995), 추천정도:51.73%
----------------------------------------
6041님께 추천드리는 영화 10:
Kicking and Screaming