In [3]:
import pandas as pd
import numpy as np
#from fuzzywuzzy import process
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import time

import joblib

In [4]:
# Read the two input tables from the ml-latest-small directory:
# per-user ratings and the movie catalogue (movieId, title, genres).
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [5]:
# Preview the movie catalogue that was just loaded (movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
# The rating timestamp is not used anywhere below, so drop it.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [7]:
# Rows of `movies` whose title is listed under more than one movieId
# (keep=False marks every member of a duplicated group, not just the extras).
duplicate_movies = movies[movies.duplicated(subset='title', keep=False)]
duplic_ids = duplicate_movies['movieId'].values
# Only the id and title are needed to compare the duplicated entries.
duplicate_movies = duplicate_movies[['movieId', 'title']]
# Number of ratings each duplicated movieId received.
review_count = (
    ratings.loc[ratings['movieId'].isin(duplic_ids), 'movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='count')
)
# Left merge: a duplicate that nobody rated must still take part in the
# comparison (with count 0). With an inner merge it would vanish and its
# rated twin would be the only row left for the title, i.e. the one removed.
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId', how='left')
duplicated_df['count'] = duplicated_df['count'].fillna(0).astype(int)
display(duplicated_df)
## Getting duplicates with low review count
# sort_values returns a new frame, so the result has to be assigned; otherwise
# the selection below depends on the incidental row order instead of the counts.
# movieId is the final key so that ties on count are resolved deterministically.
duplicated_df = duplicated_df.sort_values(
    by=['title', 'count', 'movieId'],
    ascending=[True, False, True],
)
# Within each title the most-reviewed id now comes first; every later row is a
# lower-count duplicate that should be removed.
duplicated_ids = duplicated_df.loc[
    duplicated_df.duplicated(subset='title', keep='first'), 'movieId'
]

Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [8]:
# Removing duplicated ids with low review count from movie database
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
# Removing duplicated ids with low review count from rating database
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [9]:
# Show the catalogue again after the rows in `duplicated_ids` were filtered out.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [10]:
# Attach title/genres to every rating by joining on movieId
# (default inner join: movies without ratings do not appear).
combined = movies.merge(ratings, on='movieId')
combined

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100825,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100826,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100827,193585,Flint (2017),Drama,184,3.5
100828,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [11]:
# Title x user matrix of ratings; a (title, user) pair without a rating
# is filled with 0.
pivot = (
    combined
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos! (1986),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the title x user rating matrix to disk with joblib.
filename = 'pivot.sav'
joblib.dump(pivot, filename)

['pivot.sav']

In [13]:
# One-column frame of titles in the same row order as `pivot`, so that a
# row label in `movie_titles` is also the row position in `pivot`.
movie_titles = pivot.reset_index()['title'].to_frame()
movie_titles

Unnamed: 0,title
0,'71 (2014)
1,'Hellboy': The Seeds of Creation (2004)
2,'Round Midnight (1986)
3,'Salem's Lot (2004)
4,'Til There Was You (1997)
...,...
9714,eXistenZ (1999)
9715,xXx (2002)
9716,xXx: State of the Union (2005)
9717,¡Three Amigos! (1986)


In [14]:
# Save the title list as CSV, without writing the row index column.
movie_titles.to_csv('movie_titles.csv', index=False)

In [15]:
def get_index(movie_name):
    """Return the row label in ``movie_titles`` of the first exact title match.

    Raises IndexError when no title equals `movie_name`.
    """
    is_match = movie_titles['title'] == movie_name
    matching_labels = movie_titles.loc[is_match].index.tolist()
    return matching_labels[0]

In [16]:
# Quick check: look up the row label of one known title.
yo = get_index('Karate Kid, Part II, The (1986)')
yo

4695

In [17]:
# Store the pivot values as a SciPy CSR sparse matrix (unrated entries were
# filled with 0 when the pivot was built).
movie_matrix = csr_matrix(pivot.values)

In [18]:
# Persist the sparse rating matrix to disk with joblib.
filename = 'movie_matrix.sav'
joblib.dump(movie_matrix, filename)

['movie_matrix.sav']

In [19]:
# Brute-force nearest-neighbour index over the movie rows, compared by
# cosine distance; n_jobs=-1 is passed through to scikit-learn.
model_knn = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

# Time the fit and print the elapsed seconds.
start = time.time()
model_knn.fit(movie_matrix)
stop = time.time()
print(stop - start)

0.008259296417236328


In [20]:
# Persist the fitted nearest-neighbour model to disk with joblib.
filename = 'model_knn.sav'
joblib.dump(model_knn, filename)

['model_knn.sav']

In [24]:
def _match_title(name, titles):
    """Return ``(title, score, key)`` for the entry of `titles` closest to `name`.

    Uses fuzzywuzzy's ``process.extractOne`` when a global ``process`` exists
    (its import is commented out at the top of the notebook). Otherwise falls
    back to the standard library's ``difflib`` so the notebook still runs on a
    fresh kernel. ``score`` is on a 0-100 scale in both cases.
    """
    try:
        fuzzy = process  # only defined if the fuzzywuzzy import was enabled
    except NameError:
        fuzzy = None
    if fuzzy is not None:
        return fuzzy.extractOne(name, titles)

    from difflib import SequenceMatcher

    # SequenceMatcher caches its analysis of the *second* sequence, so the
    # query is set once as seq2 and every candidate title is swapped in as seq1.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(str(name).casefold())
    best = None
    for key, title in titles.items():
        matcher.set_seq1(str(title).casefold())
        ratio = matcher.ratio()
        if best is None or ratio > best[0]:
            best = (ratio, title, key)
    if best is None:
        raise ValueError("no titles available to match against")
    return best[1], int(round(100 * best[0])), best[2]


def recommend(name=None, n_neighbors=20):
    """Recommend movies whose rating vectors are closest to a given movie.

    Parameters
    ----------
    name : str, optional
        Free-text movie name. When omitted the user is prompted with
        ``input`` (the original interactive behaviour).
    n_neighbors : int, default 20
        Number of neighbours requested from ``model_knn``. The query movie
        itself is one of them, so ``n_neighbors - 1`` rows are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Similar Movies'`` (titles) and ``'distances'`` (distances
        reported by ``model_knn``), in the order returned by the model.

    Notes
    -----
    Reads the notebook globals ``movie_titles``, ``pivot`` and ``model_knn``.
    Prints the matched ``(title, score, key)`` tuple and a header line.
    """
    if name is None:
        name = input("Enter a movie")

    idx = _match_title(name, movie_titles['title'])
    # The key of the matched title is used as a row position in `pivot`;
    # `movie_titles` was built from `pivot` in the same row order.
    query_pos = idx[2]

    query_vector = pivot.iloc[query_pos, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_neighbors)
    distances = distances.flatten()
    indices = indices.flatten()

    print(idx)
    print('Recommendations for {0}:\n'.format(pivot.index[query_pos]))

    # Remove the query movie wherever it was ranked. It is usually first, but a
    # movie with an identical rating vector ties at distance 0 and may precede it.
    keep = indices != query_pos
    if keep.all() and len(keep) > 0:
        # Ties pushed the query out of the result entirely: drop the farthest
        # neighbour instead so the number of rows stays n_neighbors - 1.
        keep[-1] = False

    return pd.DataFrame({
        'Similar Movies': pivot.index[indices[keep]].tolist(),
        'distances': distances[keep],
    })

In [25]:
# Interactive run: prompts for a movie name and shows the recommendation table.
yo = recommend()
yo

Enter a movieAladdin (1992)
('Aladdin (1992)', 100, 298)
Recommendations for Aladdin (1992):



Unnamed: 0,Similar Movies,distances
0,Beauty and the Beast (1991),0.252944
1,"Lion King, The (1994)",0.282091
2,Jurassic Park (1993),0.386515
3,True Lies (1994),0.400094
4,Batman (1989),0.403279
5,Ace Ventura: Pet Detective (1994),0.416186
6,Mrs. Doubtfire (1993),0.424577
7,Die Hard: With a Vengeance (1995),0.431504
8,Batman Forever (1995),0.433616
9,Apollo 13 (1995),0.43385


In [17]:
def get_title_from_index(movie_id):
    """Return the title of the first row of ``movies`` whose movieId is `movie_id`.

    Raises IndexError when no row has that movieId.
    """
    matching_titles = movies.loc[movies['movieId'] == movie_id, 'title']
    return matching_titles.tolist()[0]


def get_index_from_title(title):
    """Return the movieId of the first row of ``movies`` whose title equals `title`.

    Raises IndexError when no row has that title.
    """
    matching_ids = movies.loc[movies['title'] == title, 'movieId']
    return matching_ids.tolist()[0]