# 協調フィルタリング(Collaborative filtering)
- kNN Item-based Collaborative filtering
- Dataset
  - MovieLens data sets
  - https://grouplens.org/datasets/movielens/100k/
  - MovieLens 100K movie ratings. Stable benchmark dataset. 100,000 ratings from 1000 users on 1700 movies.

In [34]:
from pathlib import Path
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [71]:
 !ls '/path/to/dataset'

README.txt  links.csv   movies.csv  ratings.csv tags.csv


In [35]:
data_dir = Path('/path/to/dataset')

movies_fname = 'movies.csv'
ratings_fname = 'ratings.csv'

movies = pd.read_csv(data_dir / movies_fname, dtype={'title':str, 'genres':str})
ratings = pd.read_csv(data_dir / ratings_fname, parse_dates=['timestamp'])

display(movies.head())
display(ratings.head())

fav_movie = 'Grumpier Old Men (1995)' # Newly bought movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [27]:
# Create Item-User matrix
recommend_table = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0) # FIXME fill nan to zero. OK?
recommend_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Create Sparse matrix
recommend_table_sparse = csr_matrix(recommend_table.values)
print(recommend_table_sparse)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [68]:
### Inference
# define model and fit
knn_k = 10
n_rec = 5 # the number of recommend
model = NearestNeighbors(n_neighbors=knn_k)
model.fit(recommend_table_sparse)

# get movieId of fav_movie(favorite movie)
movie_id = movies.query(f'title == "{fav_movie}"').movieId.values # FIXME: fav_movieが既存のリストにない場合を想定し、Fuzzyを使った類似度判定を用いてIdxを取得した方が良い。

# get Nearest Neighbors
knn_dist, knn_idx = model.kneighbors(recommend_table_sparse[movie_id], n_rec)
print(knn_idx)

# get recommended movies list
rec_movie_idxs = knn_idx.squeeze().tolist()
rec_movies = [movies.query(f'movieId == {idx}').title.values[0] for idx in rec_movie_idxs]
print(rec_movies)

[[  3 612 445 317 160]]
['Grumpier Old Men (1995)', 'Pallbearer, The (1996)', 'Fatal Instinct (1993)', 'Santa Clause, The (1994)', 'Congo (1995)']


In [69]:
movies.query(f'movieId == {rec_movie_idxs}')

Unnamed: 0,movieId,title,genres
2,3,Grumpier Old Men (1995),Comedy|Romance
133,160,Congo (1995),Action|Adventure|Mystery|Sci-Fi
276,317,"Santa Clause, The (1994)",Comedy|Drama|Fantasy
387,445,Fatal Instinct (1993),Comedy
524,612,"Pallbearer, The (1996)",Comedy
