In [62]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [63]:
ratings = pd.read_csv('ml-latest-small/ratings.csv', sep=',', engine='python')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [64]:
movieCols = ['MovieID','Title','Genres']
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',', engine='python')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [65]:
Add_names_to_ratings_table = pd.merge(ratings,movies,on='movieId')
drop_col = ['timestamp', 'genres']
# Add_names_to_users_table.head()
# drop
Add_names_to_ratings_table = Add_names_to_ratings_table.drop(drop_col,axis=1)
Add_names_to_ratings_table.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [66]:
Add_names_to_ratings_table = Add_names_to_ratings_table.dropna(axis = 0, subset=['title'])

In [67]:
movie_ratingCount = (Add_names_to_ratings_table.groupby(by = ['title'])['rating'].count().reset_index())
movie_ratingCount.columns = ['title','totalRating']
movie_ratingCount.head()

Unnamed: 0,title,totalRating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [68]:
rating_with_totalRating = Add_names_to_ratings_table.merge(movie_ratingCount,left_on='title',right_on='title',how='left')
rating_with_totalRating.head()

Unnamed: 0,userId,movieId,rating,title,totalRating
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [69]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movie_ratingCount['totalRating'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRating, dtype: float64


In [70]:
threshold = 25
rating_popular_movie = movie_ratingCount.query('totalRating >= @threshold').sort_values('title')
print(rating_popular_movie)

                                                  title  totalRating
8                           (500) Days of Summer (2009)           42
18                    10 Things I Hate About You (1999)           54
23                                101 Dalmatians (1996)           47
24    101 Dalmatians (One Hundred and One Dalmatians...           44
34                                  12 Angry Men (1957)           57
48                             13th Warrior, The (1999)           26
49                                          1408 (2007)           25
74                         2001: A Space Odyssey (1968)          109
80                                      21 Grams (2003)           25
81                                21 Jump Street (2012)           26
88                                       28 Days (2000)           25
89                                 28 Days Later (2002)           58
104                                          300 (2007)           80
111                               

In [111]:
rating_with_totalRating_pivot = rating_with_totalRating.pivot_table(index='title',columns='userId',values='rating'
                                                                    ,aggfunc=np.mean).fillna(0)
rating_with_totalRating_matrix = csr_matrix(rating_with_totalRating_pivot.values)
print(rating_with_totalRating_pivot)

userId                                               1     2     3     4    \
title                                                                        
'71 (2014)                                         0.000 0.000 0.000 0.000   
'Hellboy': The Seeds of Creation (2004)            0.000 0.000 0.000 0.000   
'Round Midnight (1986)                             0.000 0.000 0.000 0.000   
'Salem's Lot (2004)                                0.000 0.000 0.000 0.000   
'Til There Was You (1997)                          0.000 0.000 0.000 0.000   
'Tis the Season for Love (2015)                    0.000 0.000 0.000 0.000   
'burbs, The (1989)                                 0.000 0.000 0.000 0.000   
'night Mother (1986)                               0.000 0.000 0.000 0.000   
(500) Days of Summer (2009)                        0.000 0.000 0.000 0.000   
*batteries not included (1987)                     0.000 0.000 0.000 0.000   
...All the Marbles (1981)                          0.000 0.000 0

In [77]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine',algorithm = 'brute')
model_knn.fit(rating_with_totalRating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [131]:
query_index = np.random.choice(rating_with_totalRating_pivot.shape[0])
distances, indices = model_knn.kneighbors(rating_with_totalRating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(rating_with_totalRating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, rating_with_totalRating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Taxi to the Dark Side (2007):

1: Who Killed Chea Vichea? (2010), with distance of 0.0:
2: Burma VJ: Reporting from a Closed Country (Burma VJ: Reporter i et lukket land) (2008), with distance of 0.0:
3: Army of Shadows (L'armÃ©e des ombres) (1969), with distance of 0.0:
4: Devil Dog: The Hound of Hell (1978), with distance of 0.0:
5: First Reformed (2017), with distance of 0.0:
