In [1]:
import numpy as np
import scipy
import implicit
import pandas as pd
import os

from implicit.als import AlternatingLeastSquares



In [2]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [70]:
data = pd.merge(left = ratings , right = movies, how = "inner", on = "movie_id")

In [22]:
data

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
836473,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
836474,5854,3026,4,958346883,Slaughterhouse (1987),Horror
836475,5854,690,3,957744257,"Promise, The (Versprechen, Das) (1994)",Romance
836476,5938,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


In [33]:
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_movie = data['movie_id'].nunique()


In [34]:
num_user

6039

In [42]:

csr_data = csr_matrix((data.counts, (data.user_id, data.movie_id)), shape= (836478, 836478))
csr_data

<836478x836478 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Row format>

In [43]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [44]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<836478x836478 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Column format>

In [45]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [55]:
user_unique = data['user_id'].unique()
movie_unique = data['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

first, promise = user_to_idx[2], movie_to_idx['Promise, The (Versprechen, Das) (1994)']
first_vector, promise_vector = als_model.user_factors[first], als_model.item_factors[promise]


In [56]:
first_vector

array([ 0.00749544, -0.0073043 ,  0.00271611, -0.00961702,  0.00429526,
       -0.00941897, -0.03482876, -0.01910864,  0.01590506, -0.02429768,
       -0.01291146, -0.03106001,  0.01305896, -0.00571092,  0.03817327,
       -0.01558849,  0.02371497,  0.03004901,  0.00136437,  0.01416543,
        0.00167444, -0.00268177, -0.00138688,  0.01536061, -0.00338025,
       -0.00539524,  0.01512046, -0.02485852, -0.00459305,  0.01192592,
       -0.03701433, -0.00446929, -0.01939252,  0.03573187, -0.05567131,
        0.00585857, -0.00992691, -0.00839981, -0.03778696,  0.01008663,
       -0.00135515,  0.03322461, -0.00102064,  0.04930082,  0.0127031 ,
       -0.03046551,  0.00819258, -0.00887783, -0.01302473, -0.00058755,
       -0.01783507, -0.00073733, -0.02012542,  0.01236388, -0.00910644,
       -0.0003722 , -0.02798404, -0.01173415,  0.02018308,  0.01095363,
       -0.01774547,  0.01676306,  0.00020189,  0.02175684,  0.01238215,
        0.01210279,  0.00247703,  0.02219425, -0.02018783, -0.03

In [57]:
promise_vector

array([-0.14761305,  0.06105141, -0.31572515,  0.19173674,  0.06992842,
       -0.10291643,  0.16941357, -0.0972916 ,  0.11641043,  0.25921333,
        0.12627266, -0.08224919,  0.36415827,  0.0697158 , -0.03717011,
        0.09469551,  0.2549956 ,  0.10928232, -0.09844949,  0.16512083,
        0.03796704,  0.05284231, -0.12068702, -0.25811923,  0.21507415,
       -0.04774633,  0.16733183,  0.22872682, -0.10048043, -0.13837568,
       -0.08798311, -0.0048612 ,  0.02019797,  0.04693135, -0.03360322,
        0.08514511, -0.04107286, -0.0545406 , -0.1513733 ,  0.07627416,
       -0.04087064, -0.10333479,  0.25707516,  0.12586953, -0.01550535,
        0.01427577, -0.29947546, -0.36756596,  0.09589989,  0.0532654 ,
       -0.10307921,  0.22590144,  0.12432267, -0.28948945, -0.14331287,
        0.11923577,  0.12424033, -0.26730892, -0.01288827, -0.22923443,
       -0.1451984 , -0.36685935, -0.23384228,  0.0543437 , -0.21012776,
       -0.21415369,  0.10446949,  0.13376743,  0.27623883, -0.16

In [58]:
np.dot(first_vector, promise_vector)

0.015967205

## 1번사용자의 five wives 영화 선호도 예측

In [59]:
movie = movie_to_idx['Five Wives, Three Secretaries and Me (1998)']
movie_vector = als_model.item_factors[movie]
np.dot(first_vector, movie_vector)

0.014447

## 2) 분석해봅시다
1. ratings 에 있는 유니크한 영화 개수
2. ratings 에 있는 유니크한 사용자 수
3. 가장인기있는 영화 30개 (인기순)

In [64]:
num_movie = data['movie_id'].nunique()
num_user = data['user_id'].nunique()


movie_count = data.groupby('title')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

print(num_movie)
print(num_user)
print(movie_count)


3628
6039
title
$1,000,000 Duck (1971)                         26
'Night Mother (1986)                           56
'Til There Was You (1997)                      27
'burbs, The (1989)                            198
...And Justice for All (1979)                 185
                                             ... 
Zed & Two Noughts, A (1985)                    24
Zero Effect (1998)                            262
Zero Kelvin (Kjærlighetens kjøtere) (1995)      2
Zeus and Roxanne (1997)                        12
eXistenZ (1999)                               306
Name: user_id, Length: 3628, dtype: int64


## 내가좋아하는 영화 5개골라서 추가 후 재훈련 시켜보기

In [71]:
my_favorite = ['Toy Story (1995)' , 'Jumanji (1995)' ,'Grumpier Old Men (1995)' ,'Waiting to Exhale (1995)' ,'Father of the Bride Part II']
my_favorite_id = [1 , 2 ,3 ,4,5]
my_playlist = pd.DataFrame({'user_id': [6900]*5, 'title': my_favorite, 'movie_id':my_favorite_id, 'counts':[30]*5})

if not data.isin({'user_id':[6900]})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    data = data.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

data.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,5851,3607,5,957756600.0,One Little Indian (1973),Comedy|Drama|Western
836474,5854,3026,4,958346900.0,Slaughterhouse (1987),Horror
836475,5854,690,3,957744300.0,"Promise, The (Versprechen, Das) (1994)",Romance
836476,5938,2909,4,957273400.0,"Five Wives, Three Secretaries and Me (1998)",Documentary
836477,5948,1360,5,1016564000.0,Identification of a Woman (Identificazione di ...,Drama
0,6900,1,30,,Toy Story (1995),
1,6900,2,30,,Jumanji (1995),
2,6900,3,30,,Grumpier Old Men (1995),
3,6900,4,30,,Waiting to Exhale (1995),
4,6900,5,30,,Father of the Bride Part II,


In [72]:
num_user = data['user_id'].nunique()
num_movie = data['movie_id'].nunique()
csr_data = csr_matrix((data.counts, (data.user_id, data.movie_id)), shape= (836483, 836483))
csr_data

<836483x836483 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [73]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<836483x836483 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [74]:
user_unique = data['user_id'].unique()
movie_unique = data['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

first, promise = user_to_idx[6900], movie_to_idx['Promise, The (Versprechen, Das) (1994)']
first_vector, promise_vector = als_model.user_factors[first], als_model.item_factors[promise]


In [75]:
np.dot(first_vector, promise_vector)

-0.012112925

## 영화추천받기


In [76]:
user = user_to_idx[6900]
# recommend에서는 user*item CSR Matrix를 받습니다.
artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
artist_recommended

[(910, 1.1952224),
 (3061, 1.0619671),
 (3675, 0.9131417),
 (898, 0.8763859),
 (594, 0.8742547),
 (1035, 0.8550602),
 (1247, 0.83302784),
 (1196, 0.8170548),
 (1267, 0.8091713),
 (3671, 0.7840781),
 (905, 0.7749871),
 (3730, 0.766903),
 (2078, 0.75072086),
 (928, 0.7331087),
 (1256, 0.7325462),
 (908, 0.73200005),
 (2941, 0.73132133),
 (1, 0.71435237),
 (3606, 0.713339),
 (541, 0.70922226)]

In [96]:
favorite_movie = 'Toy Story (1995)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

rec_result = (movies['movie_id']== artist_recommended[0][0])
movies.loc[rec_result]


Unnamed: 0,movie_id,title,genre
898,910,Some Like It Hot (1959),Comedy|Crime
