# 데이터 전처리

In [3]:
import numpy as np
import scipy
import implicit
import os
import pandas as pd

In [4]:
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id',  'movie_id','ratings', 'timesteamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding="ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timesteamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f"orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}")
print(f"Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}")

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [6]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [7]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,counts,timesteamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id','title','genre']
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
movies.tail()

Unnamed: 0,movie_id,title,genre
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


# 유니크한 영화, 사용자 수 / top 30

In [11]:
print("영화 개수:",movies['title'].nunique())
print("영화 개수:",ratings['movie_id'].nunique())
print("유저 수",ratings['user_id'].nunique())

영화 개수: 3883
영화 개수: 3628
유저 수 6039


In [12]:
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

# 내가 선호하는 영화를 추가

In [13]:
favorite_movie_list = [2858, 527, 1, 457, 1265]

my_list = pd.DataFrame({'user_id':['jaemin_no']*5,'movie_id':favorite_movie_list ,'counts':[5]*5})

if not ratings.isin({'user_id':['jaemin_no']})['user_id'].any():
    ratings = ratings.append(my_list)
    
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timesteamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,jaemin_no,2858,5,
1,jaemin_no,527,5,
2,jaemin_no,1,5,
3,jaemin_no,457,5,
4,jaemin_no,1265,5,


# 모델에 활용하기 위한 전처리

In [14]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

In [15]:
user_unique

array([1, 2, 3, ..., 6039, 6040, 'jaemin_no'], dtype=object)

In [16]:
movie_unique

array([1193,  661,  914, ...,  690, 2909, 1360])

In [17]:
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [18]:
print(user_to_idx['jaemin_no'])
print(movie_to_idx[2909])

6039
3626


In [19]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')
    
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print("movie column indexing OK!!")
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timesteamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6039,99,5,
1,6039,23,5,
2,6039,40,5,
3,6039,141,5,


# CSR matrix

In [20]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# MF model 구현

In [21]:
from implicit.als import AlternatingLeastSquares

In [22]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=True, iterations=15)

In [23]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [24]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# 모델 평가

In [38]:
favorite_movie = 52
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(1128, 1.0),
 (308, 0.71926576),
 (294, 0.5642862),
 (1592, 0.530799),
 (225, 0.52723235),
 (1136, 0.5200534),
 (2034, 0.517471),
 (1215, 0.5020266),
 (412, 0.50104785),
 (1583, 0.49502522),
 (2426, 0.45548165),
 (1157, 0.45392242),
 (1757, 0.44667107),
 (1127, 0.43952003),
 (2849, 0.42773947)]

In [39]:
idx_to_movie = {v:k for k, v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

[52,
 348,
 1449,
 492,
 728,
 125,
 417,
 1966,
 1701,
 2356,
 2295,
 1057,
 2750,
 45,
 156]

In [42]:
def get_similar_movie(movie_id: int):
    movie_id = movie_to_idx[movie_id]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [43]:
# 좋아하는 영화 관련 추천
get_similar_movie(23)

[23, 1598, 61, 782, 1427, 2778, 132, 436, 1003, 1661]

In [44]:
# 그외 영화 관련 추천
get_similar_movie(1427)

[1427, 2835, 1626, 544, 2778, 1598, 61, 1671, 2974, 23]

In [46]:
get_similar_movie(1)

[1, 3114, 34, 2355, 588, 1265, 2321, 364, 595, 1923]

In [47]:
# 나에게 영화 추천
user = user_to_idx['jaemin_no']
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(48, 0.5632113),
 (50, 0.5367853),
 (157, 0.4534663),
 (87, 0.43538684),
 (126, 0.43066472),
 (160, 0.42194107),
 (322, 0.4159584),
 (121, 0.3972596),
 (170, 0.3571862),
 (38, 0.34329474),
 (271, 0.29685465),
 (4, 0.26341894),
 (51, 0.2603194),
 (255, 0.25594795),
 (92, 0.25172037),
 (450, 0.25064388),
 (487, 0.24757712),
 (124, 0.23487037),
 (248, 0.23411375),
 (22, 0.23099527)]

무비 이름명을 무비 아이디명으로 일치화시키지 못 했다.