In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
ratingCols = ['UserID','MovieID','Rating','Timestamp']
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python',header=None,names=ratingCols)
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
userCols = ['UserID','Gender','Age','Occupation','Zip-code']
users = pd.read_csv('ml-1m/users.dat', sep='::', engine='python',header=None,names=userCols)
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
movieCols = ['MovieID','Title','Genres']
movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python',header=None,names=movieCols)
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#Starting the process to remove the lesser known movies to remove noise
merge_movie_ratings = pd.merge(ratings,movies,on='MovieID')
drop_extra_columns = ['Timestamp','Genres']
merge_movie_ratings = merge_movie_ratings.drop(drop_extra_columns,axis=1)
merge_movie_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)


In [6]:
merge_movie_ratings = merge_movie_ratings.dropna(axis = 0, subset=['Title'])

In [7]:
movie_ratingCount = (merge_movie_ratings.groupby(by = ['Title'])['Rating'].count().reset_index())
movie_ratingCount.head()

Unnamed: 0,Title,Rating
0,"$1,000,000 Duck (1971)",37
1,'Night Mother (1986),70
2,'Til There Was You (1997),52
3,"'burbs, The (1989)",303
4,...And Justice for All (1979),199


In [8]:
#Now Renaming the columns in movie_ratingCount
#movie_ratingCount = movie_ratingCount.rename(columns={'Title':'Rating'})[['movieTitle','totalRating']]
movie_ratingCount.columns = ['Title','totalRating']
movie_ratingCount.head()

Unnamed: 0,Title,totalRating
0,"$1,000,000 Duck (1971)",37
1,'Night Mother (1986),70
2,'Til There Was You (1997),52
3,"'burbs, The (1989)",303
4,...And Justice for All (1979),199


In [9]:
rating_with_totalRating = merge_movie_ratings.merge(movie_ratingCount,left_on='Title',right_on='Title',how='left')
rating_with_totalRating.head()

Unnamed: 0,UserID,MovieID,Rating,Title,totalRating
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),1725
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975),1725
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975),1725
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975),1725
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975),1725


In [10]:
print(rating_with_totalRating)

         UserID  MovieID  Rating  \
0             1     1193       5   
1             2     1193       5   
2            12     1193       4   
3            15     1193       4   
4            17     1193       5   
5            18     1193       4   
6            19     1193       5   
7            24     1193       5   
8            28     1193       3   
9            33     1193       5   
10           39     1193       5   
11           42     1193       3   
12           44     1193       4   
13           47     1193       4   
14           48     1193       4   
15           49     1193       4   
16           53     1193       5   
17           54     1193       5   
18           58     1193       5   
19           59     1193       4   
20           62     1193       4   
21           80     1193       4   
22           81     1193       5   
23           88     1193       5   
24           89     1193       5   
25           95     1193       5   
26           96     1193    

In [11]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movie_ratingCount['totalRating'].describe())

count   3706.000
mean     269.889
std      384.048
min        1.000
25%       33.000
50%      123.500
75%      350.000
max     3428.000
Name: totalRating, dtype: float64


In [12]:
print(movie_ratingCount['totalRating'].quantile(np.arange(.9,1,.01)))

0.900    729.500
0.910    773.550
0.920    825.000
0.930    887.300
0.940    971.400
0.950   1051.500
0.960   1133.800
0.970   1268.100
0.980   1446.600
0.990   1784.900
Name: totalRating, dtype: float64


In [30]:
threshold = 200
rating_popular_movie = movie_ratingCount.query('totalRating >= @threshold')
print(rating_popular_movie)

                                                  Title  totalRating
3                                    'burbs, The (1989)          303
6                     10 Things I Hate About You (1999)          700
7                                 101 Dalmatians (1961)          565
8                                 101 Dalmatians (1996)          364
9                                   12 Angry Men (1957)          616
10                             13th Warrior, The (1999)          750
12                          2 Days in the Valley (1996)          286
14                  20,000 Leagues Under the Sea (1954)          575
16                         2001: A Space Odyssey (1968)         1716
17                                          2010 (1984)          470
20                                       28 Days (2000)          505
24                                 39 Steps, The (1935)          253
28                                            54 (1998)          259
29                     7th Voyage 

In [31]:
rating_with_totalRating_pivot = rating_with_totalRating.pivot(index='Title',columns='UserID',values='Rating').fillna(0)
rating_with_totalRating_matrix = csr_matrix(rating_with_totalRating_pivot.values)

In [33]:
#print(rating_with_totalRating_matrix)

In [34]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine',algorithm = 'brute')
model_knn.fit(rating_with_totalRating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [37]:
query_index = np.random.choice(rating_with_totalRating_pivot.shape[0])
distances, indices = model_knn.kneighbors(rating_with_totalRating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 10)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(rating_with_totalRating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, rating_with_totalRating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Fire Down Below (1997):

1: Glimmer Man, The (1996), with distance of 0.4485907943367682:
2: Under Siege 2: Dark Territory (1995), with distance of 0.5583336424138674:
3: Double Team (1997), with distance of 0.6101772279100419:
4: Marked for Death (1990), with distance of 0.6602491614712881:
5: Maximum Risk (1996), with distance of 0.6647367695500201:
6: Death Wish 3 (1985), with distance of 0.6757022648654121:
7: Hard Rain (1998), with distance of 0.6789478144470118:
8: Striking Distance (1993), with distance of 0.6823395539008346:
9: Under Siege (1992), with distance of 0.6846066181967783:
