## 1. 데이터 준비와 전처리

In [1]:
import os

from implicit.als import AlternatingLeastSquares
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Set the BLAS/OpenMP-related environment variables for this process.
# NOTE(review): numpy and implicit are already imported by the time this cell
# runs; native libraries typically read these variables when they are loaded,
# so confirm the settings actually take effect in this position.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [3]:
# Load the ml-1m ratings file: '::'-separated fields, no header row.
rating_file_path = 'recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(
    rating_file_path,
    sep='::',          # multi-character separator, hence engine='python'
    names=ratings_cols,
    engine='python',
    encoding="ISO-8859-1",
)
# Shift every movie id down by one (the movies table gets the same shift).
ratings['movie_id'] = ratings['movie_id'].sub(1)
# Row count before any filtering; read again by the filtering cell.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1192,5,978300760
1,1,660,3,978302109
2,1,913,3,978301968
3,1,3407,4,978300275
4,1,2354,5,978824291


In [4]:
# Keep only rows rated 3 or higher, then expose the rating value under the
# column name 'counts' and renumber the rows from 0.
ratings = (
    ratings.loc[ratings['ratings'] >= 3]
    .rename(columns={'ratings': 'counts'})
    .reset_index(drop=True)
)
filtered_data_size = ratings.shape[0]

# Report the original size, the filtered size and the retained fraction.
print(f'기존 데이터셋 크기: {orginal_data_size}')
print(f'ratings가 3이상인 데이터셋 크기: {filtered_data_size}')
print(f'비율: {filtered_data_size / orginal_data_size:.2%}')

기존 데이터셋 크기: 1000209
ratings가 3이상인 데이터셋 크기: 836478
비율: 83.63%


In [5]:
# Load the ml-1m movies file: '::'-separated fields, no header row.
movie_file_path = 'recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    movie_file_path,
    sep='::',          # multi-character separator, hence engine='python'
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Apply the same one-step id shift used for the ratings table so both
# tables share the same movie_id values.
movies['movie_id'] = movies['movie_id'].sub(1)
movies.head()

Unnamed: 0,movie_id,title,genre
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [6]:
# Title <-> id lookup tables, keyed on movie_id.
# The row position produced by enumerate() is NOT the movie_id: the ids have
# gaps (in the recorded previews row 171 holds movie_id 172), while the rating
# matrix and the model's item factors are indexed by movie_id. Building the
# tables from the movie_id column keeps them consistent with those indices.
movie_to_idx = dict(zip(movies['title'].to_list(), movies['movie_id'].to_list()))
idx_to_movie = dict(zip(movies['movie_id'].to_list(), movies['title'].to_list()))

## 2. 데이터 확인

In [7]:
# Number of distinct users left after filtering.
ratings['user_id'].nunique()

6039

In [8]:
# Number of distinct movies left after filtering.
ratings['movie_id'].nunique()

3628

In [9]:
# Mean of 'counts' per movie, keeping the 50 movies with the highest mean.
mean_ratings = (
    ratings.groupby('movie_id')['counts']
    .mean()
    .reset_index()
    .sort_values(by='counts', ascending=False)
    .head(50)
)
# Attach title/genre; movie_id is the only column the two frames share.
mean_ratings.merge(movies, how='left', on='movie_id')

<img src="https://i.ibb.co/7Wgk1xC/2021-11-02-13-55-46.png" width="500">

## 3. 본인의 선호 영화 5개 추가

In [10]:
# Five hand-picked favourites, entered as user 0 with the top score of 5.
# Scalars are broadcast across the five movie_id rows.
my_rating = pd.DataFrame({
    'user_id': 0,
    'movie_id': [1830, 1553, 3233, 3866, 1148],
    'counts': 5,
    'timestamp': 978300019,
})
my_rating

Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,1830,5,978300019
1,0,1553,5,978300019
2,0,3233,5,978300019
3,0,3866,5,978300019
4,0,1148,5,978300019


In [11]:
# Append the user-0 rows to the ratings, order by user_id and renumber rows.
# NOTE(review): re-running this cell appends the same five rows again.
ratings = (
    pd.concat([ratings, my_rating])
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

In [12]:
# Preview the first ten rows; after sorting by user_id the user-0 rows added
# above come first.
ratings.head(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,1148,5,978300019
1,0,3233,5,978300019
2,0,1553,5,978300019
3,0,1830,5,978300019
4,0,3866,5,978300019
5,1,3185,4,978300019
6,1,1565,4,978824330
7,1,587,4,978824268
8,1,1906,4,978824330
9,1,782,4,978824291


## 4. CSR Matrix 제작

In [13]:
# Matrix dimensions must cover the largest id, not the number of distinct ids:
# user_id / movie_id are used directly as row / column indices, and the movie
# ids have gaps (the recorded outputs show 3628 distinct rated movies but a
# 3952-column matrix). Previously these two values were nunique() counts that
# were never used and did not match the matrix shape.
user_size = int(ratings['user_id'].max()) + 1
movie_size = int(ratings['movie_id'].max()) + 1

# Rows are users, columns are movies, values are the 'counts' column.
# Passing the shape explicitly yields the same dimensions scipy would infer
# from the largest indices, but makes them visible and checkable.
csr_data = csr_matrix(
    (ratings['counts'], (ratings['user_id'], ratings['movie_id'])),
    shape=(user_size, movie_size),
)
csr_data

<6041x3952 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5. AlternatingLeastSquares 모델 제작

In [14]:
# CPU-only ALS model with 300 latent factors, trained for 15 iterations.
als_model = AlternatingLeastSquares(
    factors=300,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [15]:
# Item-by-user view of the rating matrix, which is what gets passed to fit().
# NOTE(review): presumably required because this implicit version's fit()
# expects items as rows; verify against the installed implicit version.
csr_data_T = csr_data.transpose()
csr_data_T

<3952x6041 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [16]:
# Train the model on the transposed (movie x user) matrix.
als_model.fit(csr_data_T)

  0%|          | 0/15 [00:00<?, ?it/s]

## 6. 모델 결과 확인

In [17]:
# Own user_id: 0
# Movie ids given a score of 5: 1830, 1553, 3233, 3866, 1148

# Learned latent vectors for user 0 and for movie 1830.
user_vector = als_model.user_factors[0]
movie_vector = als_model.item_factors[1830]

In [18]:
# Dot product of user 0's factor vector with movie 1830's factor vector
# (presumably the model's preference score for that pair).
np.dot(user_vector, movie_vector)

0.35905445

## 7. 본인 선호 영화와 비슷한 영화 확인

In [19]:
# The 15 items closest to movie 1830; per the recorded output this is a list
# of (movie_id, similarity) pairs whose first entry is movie 1830 itself.
# NOTE(review): the "artist" naming holds movies here; the names are kept
# because a later cell reads similar_artist_ids.
similar_artist = als_model.similar_items(1830, N=15)
similar_artist_ids = [item_id for item_id, _score in similar_artist]
similar_artist

[(1830, 1.0000001),
 (172, 0.57680625),
 (2548, 0.5687094),
 (2447, 0.5615835),
 (518, 0.5547224),
 (1881, 0.5481378),
 (2497, 0.546938),
 (378, 0.5436844),
 (1761, 0.54175556),
 (201, 0.5393514),
 (65, 0.53839),
 (545, 0.53838646),
 (1749, 0.53719586),
 (2806, 0.5371352),
 (1861, 0.5361654)]

In [20]:
# Look up title/genre for the similar ids. Rows come back in the movies
# table's own order, not in similarity order.
movies.loc[movies['movie_id'].isin(similar_artist_ids)]

Unnamed: 0,movie_id,title,genre
65,65,Lawnmower Man 2: Beyond Cyberspace (1996),Sci-Fi|Thriller
171,172,Judge Dredd (1995),Action|Adventure|Sci-Fi
200,201,Total Eclipse (1995),Drama|Romance
375,378,Timecop (1994),Action|Sci-Fi
515,518,Robocop 3 (1993),Sci-Fi|Thriller
542,545,Super Mario Bros. (1993),Action|Adventure|Children's|Sci-Fi
1698,1749,Star Kid (1997),Adventure|Children's|Fantasy|Sci-Fi
1708,1761,Deep Rising (1998),Action|Horror|Sci-Fi
1763,1830,Lost in Space (1998),Action|Sci-Fi|Thriller
1793,1861,Species II (1998),Horror|Sci-Fi


## 8. 추천 영화 결과 확인

In [21]:
# Top-10 recommendations for user 0, computed against the user-by-movie
# matrix and excluding movies the user already has entries for.
movie_recommended = als_model.recommend(
    0,
    csr_data,
    N=10,
    filter_already_liked_items=True,
)
# Keep only the ids from the (movie_id, score) pairs for the title lookup.
movie_recommended_ids = [item_id for item_id, _score in movie_recommended]

In [22]:
# Display the recommendations as (movie_id, score) pairs.
movie_recommended

[(1543, 0.12709127),
 (2011, 0.11605163),
 (2733, 0.10832901),
 (1390, 0.10739152),
 (1357, 0.102593586),
 (159, 0.09697036),
 (306, 0.09637075),
 (1916, 0.0951787),
 (1249, 0.09472855),
 (3696, 0.09435858)]

In [23]:
# Look up title/genre for the recommended ids. Rows come back in the movies
# table's own order, not in score order.
movies.loc[movies['movie_id'].isin(movie_recommended_ids)]

Unnamed: 0,movie_id,title,genre
158,159,Congo (1995),Action|Adventure|Mystery|Sci-Fi
304,306,Three Colors: Blue (1993),Drama
1230,1249,"Bridge on the River Kwai, The (1957)",Drama|War
1337,1357,Sling Blade (1996),Drama|Thriller
1370,1390,Mars Attacks! (1996),Action|Comedy|Sci-Fi|War
1505,1543,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller
1848,1916,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller
1943,2011,Back to the Future Part III (1990),Comedy|Sci-Fi|Western
2665,2733,"Mosquito Coast, The (1986)",Drama
3628,3696,Predator 2 (1990),Action|Sci-Fi|Thriller


In [24]:
# Ask the model how user 0's rated movies contribute to the score of item 2700.
# NOTE(review): 2700 is not among the ten recommended ids displayed for
# user 0; confirm this is the item that was meant to be explained.
explain = als_model.explain(
    0,
    csr_data,
    itemid=2700,
)
# Element 1 holds the per-movie contributions; in the recorded output these
# are (movie_id, contribution) pairs for the five movies entered for user 0.
explain[1]

[(1830, 0.0830016605862105),
 (1148, 0.011603697250216833),
 (3866, 0.003661105268089537),
 (3233, 0.00017358405459113322),
 (1553, -0.022825420784722314)]