In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.io import mmwrite
from scipy.io import mmread
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [2]:
train_df = pd.read_pickle('../data/train.pkl')
test_df = pd.read_pickle('../data/test.pkl')
movies = pd.read_csv('../data/movie.csv')

In [3]:
train_df.shape

(18058339, 4)

In [4]:
test_df.shape

(1941924, 4)

In [5]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
37,1,1217,7,2005-04-02 23:33:30
149,1,6754,8,2005-04-02 23:53:04
135,1,5146,7,2004-09-10 03:15:32
98,1,3000,7,2005-04-02 23:29:29
68,1,2021,8,2005-04-02 23:52:09


In [6]:
train_df.userId.nunique() * train_df.movieId.nunique()

3642919872

In [7]:
print(f"Density: {len(train_df) / (train_df.userId.nunique() * train_df.movieId.nunique()):.4f}")

Density: 0.0050


In [8]:
# pivot matrix example
train_df.sample(20).pivot(index='userId', columns='movieId', values='rating').fillna(0).astype(int)

movieId,242,375,480,1080,1086,1225,1348,1378,1453,1580,1909,2600,3994,4061,6238,8198,54286,91485,109731
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4302,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8258,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10121,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11784,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0
12763,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0
24189,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31404,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0
34419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0
76631,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76967,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0


In [9]:
# id to index and name mapper
uid_to_idx = {uid: idx for (idx, uid) in enumerate(train_df.userId.unique().tolist())}
iid_to_idx = {iid: idx for (idx, iid) in enumerate(train_df.movieId.unique().tolist())}

idx_to_iid = {idx:iid for iid, idx in iid_to_idx.items()}
idx_to_uid = {idx:uid for uid, idx in uid_to_idx.items()}

iid_to_movie_name = dict(zip(movies.movieId.tolist(), movies.title.tolist()))
iid_to_movie_name = {iid: movie_name for (iid, movie_name) in iid_to_movie_name.items()}
idx_to_movie_name = {idx:iid_to_movie_name[iid] for (iid, idx) in iid_to_idx.items()}

In [10]:
# generate sparse matrix
row, col, dat = train_df.userId.tolist(), train_df.movieId.tolist(), train_df.rating.tolist()
row = [uid_to_idx[r] for r in row]
col = [iid_to_idx[c] for c in col]

train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))
print(train_matrix.shape)

(138493, 26304)


### ALS implicit

아이템 p번 -> p_1, p_2, p_3 -> p (vector)  
유저 q번 -> q_1, q_2, q_3 -> q (vector)  
p * q -> rating (scalar)

In [11]:
import implicit

In [12]:
model_als = implicit.als.AlternatingLeastSquares(factors=20, use_gpu=False)
model_als.fit(train_matrix.T.tocsr())



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [33]:
import pickle
pickle.dump(model_als, open('../data/als_20.pkl', 'wb'))

In [24]:
user_factor_df = pd.DataFrame(model_als.user_factors)
user_factor_df.index = user_factor_df.index.map(idx_to_uid)
user_factor_df.to_pickle("../data/user_factor.pkl")

In [31]:
item_factor_df = pd.DataFrame(model_als.item_factors)
item_factor_df.index = item_factor_df.index.map(idx_to_iid)
item_factor_df.to_pickle("../data/item_factor.pkl")

### Candidate generation

In [41]:
res = {}
for userId in tqdm(train_df.userId.unique()):
    res[userId] = [idx_to_iid[i[0]] for i in model_als.recommend(uid_to_idx[userId], train_matrix, 150)]

100%|██████████| 138493/138493 [01:50<00:00, 1257.67it/s]


In [43]:
pd.Series(res).to_pickle('../data/als_candidate_150.pkl')

### 정성적  평가

In [13]:
target_item = 102125 # ironman3

In [14]:
# 평가하기 쉽게 20개 미만의 영화를 본 유저만 찾기
for _ in range(1000):
    target_user = train_df[train_df.movieId==target_item].sample().userId.values[0]
    target_df = train_df[train_df.userId == target_user]
    if len(target_df) < 20:
        break

In [15]:
target_df.merge(movies, on='movieId')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,71495,91325,8,2013-05-29 21:47:46,Extremely Loud and Incredibly Close (2011),Drama
1,71495,77561,10,2013-05-29 21:48:31,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX
2,71495,97923,9,2013-05-29 21:42:36,Flight (2012),Drama
3,71495,87430,6,2013-05-29 21:38:25,Green Lantern (2011),Action|Adventure|Sci-Fi
4,71495,51925,7,2013-05-29 21:39:35,Premonition (2007),Drama|Fantasy|Mystery|Thriller
5,71495,94780,6,2013-05-29 21:42:06,Snow White and the Huntsman (2012),Action|Adventure|Drama
6,71495,2571,10,2013-05-29 21:48:55,"Matrix, The (1999)",Action|Sci-Fi|Thriller
7,71495,3510,8,2013-05-29 19:02:05,Frequency (2000),Drama|Thriller
8,71495,4155,9,2013-05-29 19:08:39,Sweet November (2001),Drama|Romance
9,71495,51935,4,2013-05-29 19:07:40,Shooter (2007),Action|Drama|Thriller


In [16]:
# recommend by user id
movies.set_index('movieId').loc[[idx_to_iid[i[0]] for i in model_als.recommend(uid_to_idx[target_user], train_matrix, 10)]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
3578,Gladiator (2000),Action|Adventure|Drama
89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
3793,X-Men (2000),Action|Adventure|Sci-Fi
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX
5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
68358,Star Trek (2009),Action|Adventure|Sci-Fi|IMAX


In [17]:
# recommend by item id
movies.set_index('movieId').loc[[idx_to_iid[i[0]] for i in model_als.similar_items(iid_to_idx[target_item], 20)]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
95510,"Amazing Spider-Man, The (2012)",Action|Adventure|Sci-Fi|IMAX
110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
102445,Star Trek Into Darkness (2013),Action|Adventure|Sci-Fi|IMAX
106489,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi
91542,Sherlock Holmes: A Game of Shadows (2011),Action|Adventure|Comedy|Crime|Mystery|Thriller
88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
106487,"Hunger Games: Catching Fire, The (2013)",Action|Adventure|Sci-Fi|IMAX
91630,Mission: Impossible - Ghost Protocol (2011),Action|Adventure|Thriller|IMAX


### API 구성해서 속도 재보기

In [18]:
def recommendAPI(rec_model, rec_type, rec_count, target_id):
    assert rec_type in ('user', 'item')
    if rec_type == 'user':
        return [idx_to_iid[i[0]] for i in rec_model.recommend(uid_to_idx[target_id], train_matrix, rec_count)]
    else:
        return [idx_to_iid[i[0]] for i in rec_model.similar_items(iid_to_idx[target_item], 20)]

In [19]:
sampled_movie_ids = train_df.movieId.sample(100000)

In [20]:
reco_result = {}
for movie_id in tqdm(sampled_movie_ids):
    reco_result[movie_id] = recommendAPI(model_als, 'item', 10, movie_id)

100%|██████████| 100000/100000 [00:27<00:00, 3609.95it/s]


**LRU cache 사용**

In [21]:
from functools import lru_cache

In [22]:
@lru_cache(1000) # decorator
def recommendAPI(rec_model, rec_type, rec_count, target_id):
    assert rec_type in ('user', 'item')
    if rec_type == 'user':
        return [idx_to_iid[i[0]] for i in rec_model.recommend(uid_to_idx[target_id], train_matrix, rec_count)]
    else:
        return [idx_to_iid[i[0]] for i in rec_model.similar_items(iid_to_idx[target_id], 20)]

In [24]:
reco_result = {}
for movie_id in tqdm(sampled_movie_ids):
    reco_result[movie_id] = recommendAPI(model_als, 'item', 10, movie_id)

100%|██████████| 100000/100000 [00:12<00:00, 7973.12it/s]


In [28]:
recommended_ids = set()
for _, v in reco_result.items():
    recommended_ids |= set(v)

print(f"{len(reco_result)}의 아이템을 기준으로 추천했을 때 Coverage: {len(recommended_ids) / len(movies):.4f} ")

8491의 아이템을 기준으로 추천했을 때 Coverage: 0.6639 


### Inference

In [29]:
def inference(model, ids):
    reco_result = {}
    for userId in tqdm(ids):
        reco_result[userId] = recommendAPI(model, 'user', 20, userId)
    return pd.Series(reco_result)

In [30]:
als_result = inference(model_als, train_df.userId.unique())
als_result.to_pickle('../data/submit_als.pkl')

100%|██████████| 138493/138493 [01:05<00:00, 2101.02it/s]


### BPR

In [31]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=20, use_gpu=False)
model_bpr.fit(train_matrix.T.tocsr())

HBox(children=(IntProgress(value=0), HTML(value='')))




In [32]:
bpr_result = inference(model_bpr, train_df.userId.unique())
bpr_result.to_pickle('../data/submit_bpr.pkl')

100%|██████████| 138493/138493 [01:09<00:00, 1998.20it/s]


### 어바웃타임 vs 아이언맨3

In [34]:
iron_man = 102125
about_time = 104374

display(movies.set_index('movieId').loc[[idx_to_iid[i[0]] for i in model_als.similar_items(iid_to_idx[iron_man], 20)]])
display(movies.set_index('movieId').loc[[idx_to_iid[i[0]] for i in model_als.similar_items(iid_to_idx[about_time], 20)]])

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
95510,"Amazing Spider-Man, The (2012)",Action|Adventure|Sci-Fi|IMAX
110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
102445,Star Trek Into Darkness (2013),Action|Adventure|Sci-Fi|IMAX
106489,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi
91542,Sherlock Holmes: A Game of Shadows (2011),Action|Adventure|Comedy|Crime|Mystery|Thriller
88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
106487,"Hunger Games: Catching Fire, The (2013)",Action|Adventure|Sci-Fi|IMAX
91630,Mission: Impossible - Ghost Protocol (2011),Action|Adventure|Thriller|IMAX


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
104374,About Time (2013),Drama|Fantasy|Romance
106918,"Secret Life of Walter Mitty, The (2013)",Adventure|Comedy|Drama
111921,The Fault in Our Stars (2014),Drama|Romance
102407,"Great Gatsby, The (2013)",Drama
102903,Now You See Me (2013),Crime|Mystery|Thriller
99149,"Misérables, Les (2012)",Drama|Musical|Romance|IMAX
116823,The Hunger Games: Mockingjay - Part 1 (2014),Adventure|Sci-Fi|Thriller
116797,The Imitation Game (2014),Drama|Thriller
108190,Divergent (2014),Adventure|Romance|Sci-Fi|IMAX
117176,"Theory of Everything, The (2014)",Drama|Romance
