In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the movie catalogue, reading only the identifier and title columns.
df_movie = pd.read_csv(
    "movies.csv",
    usecols=['movieId', 'title'],
)
df_movie.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Load the ratings, reading only who rated which movie and the score given.
df_rating = pd.read_csv(
    "ratings.csv",
    usecols=['userId', 'movieId', 'rating'],
)
df_rating.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [4]:
# Join each rating to its movie title on the shared movieId key
# (default inner join: only movieIds present in both frames are kept).
df_movie_rating = df_movie.merge(df_rating, on='movieId')
df_movie_rating.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),15,4.0
1,1,Toy Story (1995),17,5.0
2,1,Toy Story (1995),23,5.0
3,1,Toy Story (1995),28,4.0
4,1,Toy Story (1995),34,3.0


In [5]:
# Count how many rating rows were recorded under each movie title.
rating_count = (
    df_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('rating_count')
    .reset_index()
)
rating_count.head()

Unnamed: 0,title,rating_count
0,"""Great Performances"" Cats (1998)",207
1,#1 Cheerleader Camp (2010),5
2,#chicagoGirl: The Social Network Takes on a Di...,3
3,$ (Dollars) (1971),27
4,$5 a Day (2008),50


In [6]:
# Attach the per-title rating count to every rating row; the left join keeps
# all rows of df_movie_rating in their original order.
df_movie_rating_count = df_movie_rating.merge(rating_count, on='title', how='left')
df_movie_rating_count.head()

Unnamed: 0,movieId,title,userId,rating,rating_count
0,1,Toy Story (1995),15,4.0,60424
1,1,Toy Story (1995),17,5.0,60424
2,1,Toy Story (1995),23,5.0,60424
3,1,Toy Story (1995),28,4.0,60424
4,1,Toy Story (1995),34,3.0,60424


In [7]:
# Keep only the rating rows of movies whose title has more than 10,000 ratings.
df_movie_rating_count = df_movie_rating_count.query('rating_count > 10000')
df_movie_rating_count.head()

Unnamed: 0,movieId,title,userId,rating,rating_count
0,1,Toy Story (1995),15,4.0,60424
1,1,Toy Story (1995),17,5.0,60424
2,1,Toy Story (1995),23,5.0,60424
3,1,Toy Story (1995),28,4.0,60424
4,1,Toy Story (1995),34,3.0,60424


In [8]:
# Build a title x userId table of ratings: one row per movie title, one column
# per user. If a (title, user) pair occurs more than once the highest rating
# is kept; pairs with no rating are filled with 0.
df_movie_table = (
    df_movie_rating_count
    .groupby(['title', 'userId'])['rating']
    .max()
    .unstack(level='userId')
    .fillna(0)
)
df_movie_table

userId,1,2,3,4,5,6,7,8,9,10,...,247744,247745,247746,247747,247748,247749,247750,247751,247752,247753
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"X-Files: Fight the Future, The (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
X-Men (2000),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
You've Got Mail (1998),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Convert the pivot table (one row per movie title) to a sparse matrix and fit
# a brute-force cosine nearest-neighbours model on it, so that the neighbours
# of a row are the movies with the most similar rating vectors.
df_movie_mat = csr_matrix(df_movie_table.to_numpy())
movie_near = NearestNeighbors(algorithm='brute', metric='cosine').fit(df_movie_mat)
movie_near

NearestNeighbors(algorithm='brute', metric='cosine')

In [15]:
# Pick the row position of the movie that will be used as the query.
# The generator is seeded so the same movie is selected on every
# "Restart Kernel -> Run All", which keeps the recommendations below
# reproducible. Change RANDOM_SEED to explore a different movie.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
# integers(high) draws uniformly from [0, high), i.e. a valid row position.
random_index = int(rng.integers(df_movie_table.shape[0]))
random_index

187

In [16]:
# Look up the 6 nearest rows to the selected movie's rating vector.
# kneighbors expects a 2-D input, so the movie is sliced out as a one-row table.
query_vector = df_movie_table.iloc[[random_index]].to_numpy()
distance, index = movie_near.kneighbors(query_vector, n_neighbors=6)

In [17]:
# Print the top recommendations for the query movie using Nearest Neighbors.
#
# The query movie is part of the fitted data, so it normally comes back as one
# of its own neighbours. It is removed by row position rather than by assuming
# it is always the first result: when several movies are at the same distance
# the query may not be returned first, and skipping position 0 would then drop
# a real recommendation and list the query movie itself.
neighbor_positions = index.flatten()
# One of the requested neighbours is reserved for the query movie itself.
top_n = len(neighbor_positions) - 1
recommended_positions = [pos for pos in neighbor_positions if pos != random_index][:top_n]

print("Top {} recommendations for the movie: {}\n".format(top_n, df_movie_table.index[random_index]))
for rank, pos in enumerate(recommended_positions, start=1):
    print("{}:{}".format(rank, df_movie_table.index[pos]))

Top 5 recommendations for the movie: Fifth Element, The (1997)

1:Men in Black (a.k.a. MIB) (1997)
2:Matrix, The (1999)
3:Terminator, The (1984)
4:Total Recall (1990)
5:X-Men (2000)
