In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read only the columns needed later, with explicit compact dtypes.
movies_df = pd.read_csv(
    "ml-latest-small/movies.csv",
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
rating_df = pd.read_csv(
    "ml-latest-small/ratings.csv",
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [3]:
# Preview the first five movie rows.
movies_df.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first five rating rows.
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Join every rating to its movie title through the shared movieId key.
df = movies_df.merge(rating_df, on='movieId')
df.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [6]:
# Shape (rows, columns) of the merged movie/rating frame.
df.shape

(100836, 4)

In [7]:
# Drop rows whose title is missing, then show the resulting shape.
combine_movie_rating = df.dropna(subset=['title'])
combine_movie_rating.shape

(100836, 4)

In [8]:
# Count how many ratings each title received; one row per title.
movie_rating_count = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movie_rating_count.head(n=5)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [9]:
# Attach each title's rating count to every individual rating row.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head(n=5)

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [10]:
# Shape of the ratings frame after attaching the totalRatingCount column.
rating_with_totalRatingCount.shape

(100836, 5)

In [11]:
# Summary statistics for the per-title rating counts.
movie_rating_count.describe()

Unnamed: 0,totalRatingCount
count,9719.0
mean,10.375141
std,22.40622
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,329.0


In [12]:
# Minimum number of ratings a title needs to be kept.
popularity_threshold = 50

# Keep only the rating rows whose title meets the threshold.
rating_popular_movie = rating_with_totalRatingCount.loc[
    lambda frame: frame['totalRatingCount'] >= popularity_threshold
]
rating_popular_movie.head(n=5)

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [13]:
# Shape of the ratings subset restricted to titles meeting popularity_threshold.
rating_popular_movie.shape

(41362, 5)

In [14]:
# Build a title x userId table of ratings; user/title pairs with no rating become 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
# Show rows 30-39 (by position) as a sample.
movie_features_df.iloc[30:40]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Armageddon (1998),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,3.0,0.0,5.0,3.5,0.0,0.0
Army of Darkness (1993),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,4.5
As Good as It Gets (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.5,0.0,4.0
Austin Powers in Goldmember (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.5
Austin Powers: International Man of Mystery (1997),5.0,0.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0
Austin Powers: The Spy Who Shagged Me (1999),0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0
Avatar (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,...,4.0,0.0,0.0,0.0,3.5,3.0,0.0,0.0,0.0,4.5
"Avengers, The (2012)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Babe (1995),0.0,0.0,0.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,...,0.0,1.0,4.0,4.0,0.0,0.0,3.0,3.5,0.0,0.0
Back to the Future (1985),5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,...,0.0,0.0,2.0,0.0,4.0,3.5,3.0,2.0,0.0,5.0


In [15]:
# Shape of the title-by-userId feature table.
movie_features_df.shape

(450, 606)

In [16]:
# Underlying NumPy array of the feature table.
movie_features_df.to_numpy()

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 3. , 0. , 4.5],
       ...,
       [5. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 3. , 0. , ..., 0. , 0. , 3.5],
       [0. , 0. , 0. , ..., 3. , 0. , 4. ]], dtype=float32)

In [17]:
# Store the feature table in compressed sparse row form.
movie_features_df_matrix = csr_matrix(movie_features_df.to_numpy())
movie_features_df_matrix

<450x606 sparse matrix of type '<class 'numpy.float32'>'
	with 41360 stored elements in Compressed Sparse Row format>

In [18]:
# Brute-force neighbour search using the cosine metric, fitted on the sparse feature matrix.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(X=movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [19]:
# Draw the query row from a locally seeded generator so that Restart & Run All
# selects the same movie every time, without touching NumPy's global random state.
# NOTE(review): the kneighbors cell that follows queries the hard-coded row
# position 37 and does not read query_index - confirm which one is intended.
QUERY_SEED = 42
query_index = np.random.RandomState(QUERY_SEED).choice(movie_features_df.shape[0])
query_index

218

In [20]:
# Query the fitted model with row position 37, reshaped to a single-sample 2-D array.
query_vector = movie_features_df.iloc[37].to_numpy().reshape(1, -1)
distances, indices = knn_model.kneighbors(query_vector, n_neighbors=6)

In [21]:
# Distances returned by kneighbors for the single query row.
distances

array([[2.3841858e-07, 2.7639878e-01, 3.4132457e-01, 3.8764435e-01,
        4.1349632e-01, 4.2860407e-01]], dtype=float32)

In [22]:
# Row positions (into movie_features_df) of the neighbours returned by kneighbors.
indices

array([[ 37, 189, 220, 115, 212, 350]], dtype=int64)

In [23]:
# Flatten once instead of on every loop iteration; there is a single query row.
neighbor_distances = distances.flatten()
neighbor_indices = indices.flatten()

if neighbor_indices.size:
    # Position 0 is treated as the query movie itself (it is never listed as a
    # recommendation), so the header is taken from the result rather than from a
    # hard-coded row number that could drift out of sync with the query.
    print(f"Recommendations for {movie_features_df.index[neighbor_indices[0]]}")
    ranked_neighbors = zip(neighbor_indices[1:], neighbor_distances[1:])
    for rank, (movie_pos, distance) in enumerate(ranked_neighbors, start=1):
        print(f"{rank}. {movie_features_df.index[movie_pos]}, with a distance of {distance}")

Recommendations for Avengers, The (2012)
1. Guardians of the Galaxy (2014), with a distance of 0.27639877796173096
2. Iron Man (2008), with a distance of 0.3413245677947998
3. Dark Knight Rises, The (2012), with a distance of 0.38764435052871704
4. Inception (2010), with a distance of 0.41349631547927856
5. Sherlock Holmes (2009), with a distance of 0.4286040663719177
