# Collaborative Filtering based Recommendation System

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

In [2]:
# Load data
movies_filename = 'MovieLens/movies.csv'
ratings_filename = 'MovieLens/ratings.csv'
# read data
df_movies = pd.read_csv(movies_filename,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

df_ratings = pd.read_csv(ratings_filename,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
df_movies.shape

(9742, 2)

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
df_ratings.shape

(100836, 3)

In [7]:
from scipy.sparse import csr_matrix
# pivot ratings into movie features
df_movie_features = df_ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
# convert dataframe of movie features to scipy sparse matrix
mat_movie_features = csr_matrix(df_movie_features.values)

In [8]:
df_movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_movie_features.shape

(9724, 610)

In [10]:
mat_movie_features

<9724x610 sparse matrix of type '<class 'numpy.float32'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [11]:
#Cosine similarity matrix
cosine_sim = 1-pairwise_distances(mat_movie_features, metric="cosine")
cosine_sim = pd.DataFrame(cosine_sim)
cosine_sim.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.27749,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
cosine_sim.shape

(9724, 9724)

In [13]:
# Similar movies like first movie
movie1_sim = cosine_sim[0]
index = np.argsort(np.array(movie1_sim))[::-1]

In [14]:
index

array([   0, 2353,  418, ..., 7714, 7709, 9723])

In [15]:
movie1_sim[index][0:10]

0       1.000000
2353    0.572601
418     0.565637
615     0.564262
224     0.557388
314     0.547096
322     0.541145
910     0.541089
546     0.538913
963     0.534169
Name: 0, dtype: float32

In [16]:
# Top 10 similar movies
top_movieId = df_movie_features.index[index][0:10]
top_movieId

Int64Index([1, 3114, 480, 780, 260, 356, 364, 1210, 648, 1265], dtype='int64', name='movieId')

In [17]:
for i in range(10):
    print(df_movies.loc[df_movies.movieId == top_movieId[i]])

   movieId             title
0        1  Toy Story (1995)
      movieId               title
2355     3114  Toy Story 2 (1999)
     movieId                 title
418      480  Jurassic Park (1993)
     movieId                                 title
615      780  Independence Day (a.k.a. ID4) (1996)
     movieId                                      title
224      260  Star Wars: Episode IV - A New Hope (1977)
     movieId                title
314      356  Forrest Gump (1994)
     movieId                  title
322      364  Lion King, The (1994)
     movieId                                              title
911     1210  Star Wars: Episode VI - Return of the Jedi (1983)
     movieId                       title
546      648  Mission: Impossible (1996)
     movieId                 title
964     1265  Groundhog Day (1993)
