In [1]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors



In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import boto3

In [3]:
# Load the movie metadata (movieId, title, genres) from the project's S3
# bucket; pandas reads the CSV straight from the HTTPS URL, so this cell
# needs network access.
movies_csv_url = 'https://gt-parrothunters-finalproject.s3.us-east-2.amazonaws.com/movies.csv'
df_movies = pd.read_csv(movies_csv_url)
# Preview the first rows to confirm the file parsed as expected.
df_movies.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,5,Father of the Bride Part II (1995),Comedy
4,5,6,Heat (1995),Action|Crime|Thriller


In [5]:
# Load the user ratings. Despite the variable name this is a relative local
# path, not a URL, so 'ratings.csv' must be present in the kernel's working
# directory.
ratings_csv_url = 'ratings.csv'
df_ratings = pd.read_csv(ratings_csv_url)
# Preview the first rows to confirm the file parsed as expected.
df_ratings.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,296,5.0,1147880044
1,1,1,306,3.5,1147868817
2,2,1,307,5.0,1147868828
3,4,1,899,3.5,1147868510
4,5,1,1088,4.0,1147868495


In [6]:
# Sanity-check the movies table: row count, column dtypes and non-null counts.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1223 entries, 0 to 1222
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1223 non-null   int64 
 1   movieId     1223 non-null   int64 
 2   title       1223 non-null   object
 3   genres      1223 non-null   object
dtypes: int64(2), object(2)
memory usage: 38.3+ KB


In [7]:
# Sanity-check the ratings table: row count, column dtypes and memory footprint.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16358020 entries, 0 to 16358019
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   userId      int64  
 2   movieId     int64  
 3   rating      float64
 4   timestamp   int64  
dtypes: float64(1), int64(4)
memory usage: 624.0 MB


In [8]:
# Number of ratings each movie received, indexed by movieId.
ratings_per_movie = df_ratings.groupby('movieId').size()
df_movies_cnt = pd.DataFrame(ratings_per_movie, columns=['count'])

# Movies with at least 50 ratings are kept; less-rated movies are filtered out.
has_enough_ratings = df_movies_cnt['count'] >= 50
popular_movies = list(set(df_movies_cnt[has_enough_ratings].index))

# Boolean mask over the rows of df_ratings: True where the rated movie is popular.
movies_filter = df_ratings['movieId'].isin(popular_movies).values
movies_filter

array([ True,  True,  True, ...,  True,  True,  True])

In [9]:
# Minimum number of ratings a user must have given to be kept.
# NOTE(review): an earlier comment on this cell said 10, but the executed
# threshold has always been 5 -- confirm which value is intended.
MIN_RATINGS_PER_USER = 5

# Number of ratings each user gave, indexed by userId.
df_users_cnt = pd.DataFrame(
            df_ratings.groupby('userId').size(),
            columns=['count'])
# Keep users with at least MIN_RATINGS_PER_USER ratings; less active users
# are filtered out.
active_users = list(set(
    df_users_cnt[df_users_cnt['count'] >= MIN_RATINGS_PER_USER].index))
# Boolean mask over the rows of df_ratings: True where the rating's user is active.
users_filter = df_ratings.userId.isin(active_users).values
users_filter

array([ True,  True,  True, ...,  True,  True,  True])

In [10]:
# A rating row survives only if it passes both the movie and the user mask.
keep_rows = movies_filter & users_filter
df_ratings_filtered = df_ratings.loc[keep_rows]
df_ratings_filtered

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,296,5.0,1147880044
1,1,1,306,3.5,1147868817
2,2,1,307,5.0,1147868828
3,4,1,899,3.5,1147868510
4,5,1,1088,4.0,1147868495
...,...,...,...,...,...
16358015,25000087,162541,33794,4.0,1240951792
16358016,25000088,162541,41566,4.0,1240952749
16358017,25000089,162541,45517,4.5,1240953353
16358018,25000090,162541,50872,4.5,1240953372


In [11]:
# Pivot the filtered ratings into a dense movie x user table: one row per
# movieId, one column per userId, rating as the cell value, and 0 wherever
# a user has not rated a movie.
movie_user_mat = (
    df_ratings_filtered
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Titles listed in the same order as the rows of movie_user_mat, so a title's
# position in this sequence is also its row number in the matrix.
titles_in_row_order = (
    df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
)
# Mapper from movie title to matrix row number. If two movies share a title,
# the later row overwrites the earlier one in this dict.
hashmap = {title: row for row, title in enumerate(titles_in_row_order)}

# Compressed-sparse-row copy of the same values, used for the neighbour search.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
movie_user_mat_sparse

<1223x162329 sparse matrix of type '<class 'numpy.float64'>'
	with 16357450 stored elements in Compressed Sparse Row format>

In [12]:
# Unsupervised nearest-neighbour searcher. No arguments are passed, so the
# distance metric and search algorithm are scikit-learn's defaults.
model = NearestNeighbors()

In [13]:
# Title of the movie to get recommendations for; it must match a key of
# `hashmap` exactly, since it is used for a plain dict lookup later on.
movie_name = 'Shrek (2001)'        #test movie

In [14]:
# Index the movie rows of the sparse movie x user matrix so that neighbours
# of any movie's rating vector can be queried.
model.fit(movie_user_mat_sparse)

NearestNeighbors()

In [15]:
# Row number of the query movie in movie_user_mat_sparse (hashmap maps each
# title to its row position). Raises KeyError if the title is not in hashmap.
idx = hashmap[movie_name]                      #getting the index of the movie name
idx

807

In [16]:
# Query the fitted model with the selected movie's own rating row.
# 11 neighbours are requested (and 11 are returned): because the query row
# is part of the fitted data, the movie itself is expected to come back as
# the closest match, and it is discarded in a later cell, leaving 10.
distances, indices = model.kneighbors(
    movie_user_mat_sparse[idx],
    n_neighbors=11)                      # 11 = the query movie itself + 10 recommendations

In [17]:
# Flatten the (1, k) result arrays into plain Python lists.
neighbor_rows = indices.squeeze().tolist()
neighbor_dists = distances.squeeze().tolist()

# Pair every neighbour row with its distance and order the pairs from the
# closest to the farthest.
closest_first = sorted(
    zip(neighbor_rows, neighbor_dists),
    key=lambda pair: pair[1],
)

# The [:0:-1] slice walks the list backwards and stops before position 0, so
# the closest entry (expected to be the query movie itself) is dropped and
# the remaining neighbours end up ordered farthest first.
# NOTE(review): this lists the least similar movie first -- confirm that the
# ordering is intentional.
raw_recommends = closest_first[:0:-1]

raw_recommends

[(839, 691.0088639084162),
 (837, 687.8604509637111),
 (759, 686.0654123332556),
 (850, 679.9209512877214),
 (966, 677.1908888932278),
 (907, 667.2939756958698),
 (856, 656.8460626356833),
 (900, 635.4872146628916),
 (942, 624.4689744094577),
 (835, 624.120180734448)]

In [18]:
# Invert the title -> row mapping so that neighbour row numbers can be
# translated back into movie titles.
reverse_hashmap = {v: k for k, v in hashmap.items()}
print('Recommendations for {}:'.format(movie_name))
# A dedicated loop variable is used on purpose: naming it `idx` would
# overwrite the notebook-level `idx` (the query movie's row), so re-running
# the kneighbors cell afterwards would silently query a different movie.
# Only the title is printed, one per line, in the order of raw_recommends.
for neighbor_row, _dist in raw_recommends:
    print(reverse_hashmap[neighbor_row])


Recommendations for Shrek (2001):
Ocean's Eleven (2001)
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
X-Men (2000)
Ice Age (2002)
Incredibles, The (2004)
Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Spider-Man (2002)
Finding Nemo (2003)
Shrek 2 (2004)
Monsters, Inc. (2001)
