In [1]:
import os,sys
import pandas as pd
# Make the current working directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
working_dir = os.path.normpath(os.getcwd())
if working_dir not in sys.path:
    sys.path.append(working_dir)

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Location of the prepared metadata CSV. The original path stays the default,
# but it can be redirected with the MOVIE_BOT_METADATA_CSV environment variable
# so the notebook also runs on machines with a different directory layout.
metadata_csv = os.environ.get(
    'MOVIE_BOT_METADATA_CSV',
    '~/Downloads/Movie_Bot/data/metadata_prep.csv',
)
metadata = pd.read_csv(metadata_csv)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only ever sees text
metadata['overview'] = metadata['overview'].fillna('')

# TF-IDF vectorizer that drops common English stop words ('the', 'a', ...)
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term matrix in a single pass
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Show (number of movies, vocabulary size)
tfidf_matrix.shape

(45068, 75551)

In [5]:
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity. With a single
# argument the matrix is compared against itself.
# NOTE(review): the result is a dense (n_movies x n_movies) array; with the
# 45,068 rows reported by tfidf_matrix.shape that is roughly 16 GB as float64.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
def get_recommendations(title, df, indices, cosine_sim=None, top_n=3):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    df : pandas.DataFrame
        Frame with a ``'title'`` column. Rows are addressed by position, so
        row ``i`` must correspond to row ``i`` of ``cosine_sim``.
    indices : pandas.Series or mapping
        Maps a title to its row position. When a title maps to several
        positions (duplicate titles), the first one is used.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. When omitted, the module-level
        ``cosine_sim`` is looked up at call time, so a recomputed matrix is
        picked up without having to redefine this function.
    top_n : int, default 3
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``df['title']`` for the ``top_n`` most similar rows, best match
        first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    NameError
        If ``cosine_sim`` is omitted and no module-level matrix exists.
    """
    # Local import: numpy is not imported at notebook level.
    import numpy as np

    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    if cosine_sim is None:
        try:
            cosine_sim = globals()["cosine_sim"]
        except KeyError:
            raise NameError(
                "cosine_sim was not passed and no module-level 'cosine_sim' exists"
            ) from None

    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"title not found in indices: {title!r}") from None

    # A duplicated title makes a pandas lookup return several positions, which
    # would turn the similarity row into a 2-D block; use the first match.
    if np.ndim(idx) > 0:
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    # Similarity of every movie to the queried one
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Stable sort on the negated scores: descending similarity, ties kept in
    # row order (the same ordering a stable reverse sort of pairs produces).
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query by position instead of assuming it sorted first; with
    # tied scores (e.g. an all-zero row) it is not guaranteed to be on top.
    ranked = ranked[ranked != idx][:top_n]

    return df['title'].iloc[ranked]

In [7]:
# Reverse lookup: movie title -> row label in `metadata`.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct), so it never removed repeated titles and a duplicated
# title looked up several rows at once. De-duplicate on the index instead,
# keeping the first row for each title.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [8]:
%%timeit
get_recommendations('Toy Story 2', metadata, indices, cosine_sim)

10 loops, best of 3: 25.3 ms per loop


In [9]:
# Keep the recommendations for 'Toy Story 2' so their URLs can be looked up
a = get_recommendations(
    'Toy Story 2',
    df=metadata,
    indices=indices,
    cosine_sim=cosine_sim,
)

In [10]:
# IMDb page of each recommended movie (one .loc call selects rows and column)
metadata.loc[a.index, 'imdbURL']

0        https://www.imdb.com/title/tt0114709/
15300    https://www.imdb.com/title/tt0435761/
25551    https://www.imdb.com/title/tt0105410/
Name: imdbURL, dtype: object

In [11]:
# Poster image URL of each recommended movie (one .loc call selects rows and column)
metadata.loc[a.index, 'ImageURL']

0        https://image.tmdb.org/t/p/w92/rhIRbceoE9lR4ve...
15300    https://image.tmdb.org/t/p/w92/mMltbSxwEdNE4Cv...
25551    https://image.tmdb.org/t/p/w92/6UfPS7KbcBury0T...
Name: ImageURL, dtype: object