# Item-Item Collaborative Filtering

- recommend similar movies using nothing but movie ratings to compute similarity
- Top K most similar items are identified with kNN and Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load Data

In [12]:
data_dir = '/Users/jujohnson/movielens/ml-latest-small'
movies_path = os.path.join(data_dir, 'movies.csv')
ratings_path = os.path.join(data_dir, 'ratings.csv')

In [14]:
movies_df = pd.read_csv(movies_path, usecols=['movieId', 'title'], dtype={ 'movieId': 'int32', 'title': 'str' })
ratings_df = pd.read_csv(ratings_path, usecols=['userId', 'movieId', 'rating'],
    dtype={ 'userId': 'int32', 'movieId': 'int32', 'rating': 'float32' })

In [16]:
print(f'movies shape {movies_df.shape}')
movies_df.head(2)

movies shape (9742, 2)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)


In [17]:
print(f'ratings shape {ratings_df.shape}')
ratings_df.head(2)

ratings shape (100836, 3)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0


In [19]:
movie_ratings = pd.merge(ratings_df, movies_df, on='movieId')
movie_ratings.shape

(100836, 4)

In [20]:
movie_ratings.dropna(axis=0, subset=['title'], inplace=True)

In [23]:
movie_rating_count = movie_ratings.groupby(by='title')['rating'].count() \
    .reset_index() \
    .rename(columns={ 'rating': 'totalRatingCount' })

movie_rating_count.head(5)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [24]:
movie_ratings = movie_ratings.merge(movie_rating_count, left_on='title', right_on='title', how='left')
movie_ratings.head(5)


Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [25]:
movie_ratings.describe()

Unnamed: 0,userId,movieId,rating,totalRatingCount
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,58.758777
std,182.618491,35530.987199,1.042529,61.965384
min,1.0,1.0,0.5,1.0
25%,177.0,1199.0,3.0,13.0
50%,325.0,2991.0,3.5,39.0
75%,477.0,8122.0,4.0,84.0
max,610.0,193609.0,5.0,329.0


In [28]:
popularity_threshold = 50
popular_ratings = movie_ratings.loc[movie_ratings.totalRatingCount >= popularity_threshold]

In [29]:
popular_ratings.head(5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


## Create Movie-User Matrix

In [32]:
movie_features = popular_ratings \
    .pivot_table(index='title', columns='userId', values='rating') \
    .fillna(0)
movie_features.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


## Convert to Sparse Row Matrix

In [34]:
from scipy.sparse import csr_matrix

movie_features_matrix = csr_matrix(movie_features.values)

movie_features_matrix

<450x606 sparse matrix of type '<class 'numpy.float32'>'
	with 41360 stored elements in Compressed Sparse Row format>

## Fit kNN Model

In [36]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(movie_features_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

## Generate Recommendations

In [79]:
# selecting a random movie as query for new movies
query_index = np.random.choice(movie_features.shape[0])
print(f'Selected a movie to generate recommendations for: {movie_features.index[query_index]}')

Selected a movie to generate recommendations for: Zoolander (2001)


In [80]:
# get 10 nearest neighbors
distance, indices = knn_model.kneighbors(
    movie_features.iloc[query_index].values.reshape(1, -1),
    n_neighbors=10)

In [81]:
for d, i in zip(distance.flatten(), indices.flatten()):
    recommended_movie = movie_features.index[i]
    print(f'Recommendation with distance {d:.2f}: {recommended_movie}')

Recommendation with distance 0.00: Zoolander (2001)
Recommendation with distance 0.36: Anchorman: The Legend of Ron Burgundy (2004)
Recommendation with distance 0.45: Wedding Crashers (2005)
Recommendation with distance 0.45: Superbad (2007)
Recommendation with distance 0.46: Bruce Almighty (2003)
Recommendation with distance 0.46: Knocked Up (2007)
Recommendation with distance 0.47: School of Rock (2003)
Recommendation with distance 0.48: Shaun of the Dead (2004)
Recommendation with distance 0.48: Miss Congeniality (2000)
Recommendation with distance 0.48: Kill Bill: Vol. 2 (2004)


In [85]:
# try again
query_index = np.random.choice(movie_features.shape[0])
print(f'Selected a movie to generate recommendations for: {movie_features.index[query_index]}')

distance, indices = knn_model.kneighbors(
    movie_features.iloc[query_index].values.reshape(1, -1),
    n_neighbors=10)

for d, i in zip(distance.flatten(), indices.flatten()):
    recommended_movie = movie_features.index[i]
    print(f'Recommendation with distance {d:.2f}: {recommended_movie}')

Selected a movie to generate recommendations for: Dark Knight, The (2008)
Recommendation with distance 0.00: Dark Knight, The (2008)
Recommendation with distance 0.27: Inception (2010)
Recommendation with distance 0.33: Iron Man (2008)
Recommendation with distance 0.33: Dark Knight Rises, The (2012)
Recommendation with distance 0.35: Batman Begins (2005)
Recommendation with distance 0.38: Lord of the Rings: The Return of the King, The (2003)
Recommendation with distance 0.40: Departed, The (2006)
Recommendation with distance 0.41: Lord of the Rings: The Fellowship of the Ring, The (2001)
Recommendation with distance 0.41: Lord of the Rings: The Two Towers, The (2002)
Recommendation with distance 0.42: Up (2009)
