In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
# Load the raw per-user ratings and preview the first rows.
ratings = pd.read_csv('ml-latest/ratings.csv')
print(ratings.head())

# Load the movie table (used further down for movieId -> title lookups)
# and preview the first rows.
movies = pd.read_csv("movies_FINAL.csv")
print(movies.head())

# Size summary of the ratings table: number of rating rows, and number of
# distinct movies / users that appear in it. dropna=False keeps a missing ID
# counted as one distinct value.
n_ratings = ratings.shape[0]
n_movies = ratings['movieId'].nunique(dropna=False)
n_users = ratings['userId'].nunique(dropna=False)

   userId  movieId  rating   timestamp
0       1      307     3.5  1256677221
1       1      481     3.5  1256677456
2       1     1091     1.5  1256677471
3       1     1257     4.5  1256677460
4       1     1449     4.5  1256677264
   movieId                        title  year    rating  Western  Film-Noir  \
0        1                    Toy Story  1995  3.886649        0          0   
1        2                      Jumanji  1995  3.246583        0          0   
2        3             Grumpier Old Men  1995  3.173981        0          0   
3        4            Waiting to Exhale  1995  2.874540        0          0   
4        5  Father of the Bride Part II  1995  3.077291        0          0   

   IMAX  Musical  Sci-Fi  Adventure  ...  Action  War  Mystery  Animation  \
0     0        0       0          1  ...       0    0        0          1   
1     0        0       0          1  ...       0    0        0          0   
2     0        0       0          0  ...       0    0       

In [3]:
def create_matrix(df):
    """Build a sparse movie-by-user rating matrix plus ID/index lookup tables.

    Args:
        df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns,
            one row per rating.

    Returns:
        A tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``scipy.sparse.csr_matrix`` of shape
          ``(n_movies, n_users)``; row = movie index, column = user index,
          value = rating.
        * ``user_mapper`` / ``movie_mapper`` map an ID to its row/column index.
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map an index back to the ID.

        Indices follow the sorted order of the distinct IDs.
    """
    # A single np.unique call per column yields both the sorted distinct IDs
    # and, through return_inverse, the position of every row's ID within that
    # sorted array. This replaces repeated unique() passes and a per-row
    # Python dict lookup, which is slow on a large ratings table.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map IDs to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Movies are rows so that each row is one movie's rating vector.
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [4]:
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the ``k`` movies whose rating vectors are closest to ``movie_id``.

    Relies on the notebook-level ``movie_mapper`` and ``movie_inv_mapper``
    dictionaries (as returned by ``create_matrix``) to translate between
    movie IDs and row indices of ``X``.

    Args:
        movie_id: ID of the query movie; must be a key of ``movie_mapper``.
        X: Matrix with one row per movie, indexed consistently with
            ``movie_mapper``.
        k: Number of neighbours to return.
        metric: Distance metric passed to ``NearestNeighbors``.
        show_distance: If False (default), return a list of movie IDs.
            If True, return a list of ``(movie_id, distance)`` tuples.

    Returns:
        Up to ``k`` neighbours ordered from nearest to farthest; fewer when
        ``X`` has fewer than ``k + 1`` rows.

    Raises:
        KeyError: If ``movie_id`` is not present in ``movie_mapper``.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Ask for one extra neighbour because the query movie is part of the
    # fitted data; clamp so a small matrix does not make scikit-learn raise.
    n_neighbors = min(k + 1, X.shape[0])

    # The model is refitted on every call; the function signature carries no
    # place to cache it.
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so both return shapes share one code path.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: on a distance tie
    # (e.g. another movie with an identical rating vector) the query is not
    # guaranteed to come back first.
    neighbours = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]

In [5]:
# Build the movie-by-user matrix and the ID <-> index lookup tables from the
# ratings table; find_similar_movies reads movie_mapper / movie_inv_mapper
# from this notebook scope.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_matrix(ratings)

In [6]:
# Lookup table: movieId -> title, taken from the movies table.
movie_titles = {
    movie_id_key: title
    for movie_id_key, title in zip(movies['movieId'], movies['title'])
}

In [8]:
# Print ten recommendations for the movie with ID 1.
# NOTE(review): "obejżałeś" in the message below is misspelled (correct Polish
# is "obejrzałeś"); left unchanged so the recorded cell output still matches.
movie_id = 1
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Ponieważ obejżałeś {movie_title}, może polubisz:\n")
# Titles are looked up lazily, one per printed line.
for title in (movie_titles[i] for i in similar_ids):
    print(title)

Ponieważ obejżałeś Toy Story, może polubisz:

Star Wars: Episode IV - A New Hope
Independence Day (a.k.a. ID4)
Toy Story 2
Back to the Future
Jurassic Park
Forrest Gump
Lion King, The
Mission: Impossible
Star Wars: Episode VI - Return of the Jedi
Aladdin


In [7]:
# Print ten recommendations for the movie with ID 306.
# NOTE(review): "obejżałeś" in the message below is misspelled (correct Polish
# is "obejrzałeś"); left unchanged so the recorded cell output still matches.
movie_id = 306
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Ponieważ obejżałeś {movie_title}, może polubisz:\n")
# Titles are looked up lazily, one per printed line.
for title in (movie_titles[i] for i in similar_ids):
    print(title)

Ponieważ obejżałeś Three Colors: Red (Trois couleurs: Rouge), może polubisz:

Three Colors: Blue (Trois couleurs: Bleu)
Three Colors: White (Trzy kolory: Bialy)
Double Life of Veronique, The (Double Vie de Véronique, La)
Eat Drink Man Woman (Yin shi nan nu)
Piano, The
Short Cuts
Beauty of the Day (Belle de jour)
Smoke
Like Water for Chocolate (Como agua para chocolate)
Heavenly Creatures
