In [60]:
# Import Pandas
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from ast import literal_eval
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load Movies Metadata
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

# Print the first three rows
print(metadata.head(3))
# just for test

metadata = metadata.head(20000)
# ///////////////////////////////
# Calculate mean of vote average column
C = metadata['vote_average'].mean()
print(C)

# Calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m)

# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
print(q_movies.shape)


# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)


# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


# Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

# Print the top 15 movies
print(q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20))

# Print plot overviews of the first 5 movies.
print(metadata['overview'].head())


# Import TfIdfVectorizer from scikit-learn

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Output the shape of tfidf_matrix
print(tfidf_matrix.shape)


# Array mapping from feature integer indices to feature name.
print(tfidf.get_feature_names_out()[5000:5010])


# Import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)


# Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

print(indices[:10])


# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


print(get_recommendations('Toy Story', cosine_sim))


# Load keywords and credits
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Remove rows with bad IDs.
metadata = metadata.drop([19730])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Print the first two movies of your newly merged metadata
print(metadata.head(2))
# Parse the stringified features into their corresponding python objects

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

# Import Numpy


def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    # Return empty list in case of missing/malformed data
    return []


# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

# Print the new features of the first 3 films
print(metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3))
# Function to convert all strings to lower case and strip names of spaces


def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)


def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

print(metadata[['soup']].head(2))

# Import CountVectorizer and create the count matrix

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

print(get_recommendations('The Dark Knight Rises', cosine_sim2))


   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497                en   
2                                   NaN  15602  tt0113228                en   

     original_title                                           overview  ...  \
0         Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   
1      

In [40]:
import pandas as pd

# Load Movies Metadata
# metadata = pd.read_csv('./movies_metadata.min.csv', low_memory=False,nrows=2500)
metadata = pd.read_csv('./movies_metadata.csv', low_memory=False)
metadata=metadata.head(2000)
#Print plot overviews of the first 5 movies.
print(metadata.head())

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [41]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
# From the above output, you observe that 75,827 different vocabularies or words in your dataset have 45,000 movies.
print(tfidf_matrix.shape)

#Array mapping from feature integer indices to feature name. /  it s a biig array !!
tfidf.get_feature_names()[930:1010]

(2000, 13918)




['assist',
 'assistance',
 'assistant',
 'assistants',
 'assisted',
 'associate',
 'associated',
 'associates',
 'association',
 'assorted',
 'assumed',
 'assumes',
 'assuming',
 'assured',
 'asta',
 'asteroid',
 'astonishing',
 'astoria',
 'astounding',
 'astra',
 'astray',
 'astrid',
 'astro',
 'astronaut',
 'astronauts',
 'astronomer',
 'astronomy',
 'astrophysicist',
 'asylum',
 'atf',
 'ath',
 'athlete',
 'athletes',
 'athletic',
 'athos',
 'atlanta',
 'atlantic',
 'atlas',
 'atmosphere',
 'atmospheric',
 'atoll',
 'atom',
 'atomic',
 'atop',
 'atrocities',
 'atrocity',
 'attachment',
 'attack',
 'attacked',
 'attacking',
 'attacks',
 'attained',
 'attempt',
 'attempted',
 'attempting',
 'attempts',
 'attenborough',
 'attend',
 'attendant',
 'attendants',
 'attended',
 'attendees',
 'attending',
 'attends',
 'attention',
 'attentions',
 'attentive',
 'attic',
 'atticus',
 'attired',
 'attitude',
 'attorney',
 'attorneys',
 'attract',
 'attracted',
 'attracting',
 'attraction',
 'a

In [42]:

# Since you have used the TF-IDF vectorizer, calculating the dot product between 
# each vector will directly give you the cosine similarity score. Therefore, 
# you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# This would return a matrix of shape 45466x45466, which means each movie overview cosine similarity 
# score with every other movie overview. Hence, 
# each movie will be a 1x45466 column vector where each column will be a similarity score with each movie.
try:
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    print(cosine_sim.shape)
    print(cosine_sim[1])
except:
    print("Error Given : MemoryError: Unable to allocate 4.07 GiB for an array with shape (546860044,) and data type float64")
finally:
    pass


(2000, 2000)
[0.01729311 1.         0.04843596 ... 0.         0.         0.01222168]


In [43]:
# function that takes in a movie title as an input and outputs a list of the 10 most similar movies
# wee need first : a mechanism to identify the index of a movie in your metadata DataFrame, given its title.
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()
print(indices[:10])
# Exactly the number of movies we have
print(indices.shape)

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64
(2000,)


In [59]:
# Recommender Function !!!!!!
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the given title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie cad
    # Get the list of cosine similarity scores for that particular movie with
    # all movies. Convert it into a list of tuples where the first element is
    #  its position, and the second is the similarity score.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores cad
    # Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies cad
    # Get the top 10 elements of this list. Ignore the first element as 
    # it refers to self (the movie most similar to a particular movie is the movie itself).
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies cad
    # Return the titles corresponding to the indices of the top elements.
    return metadata['title'].iloc[movie_indices]

try:
    print(get_recommendations('The Godfather'))
except:
    print("Godfather Does Not Exist")
try:
    print(get_recommendations('The Dark Knight Rises'))
except:
    print(" Dark Knight Rises Does Not Exist")

183                                          Nine Months
1379                              The Portrait of a Lady
1649                                 The Sweet Hereafter
1415                                      Absolute Power
1725                                 Two Girls and a Guy
1703    A Paralyzing Fear: The Story of Polio in America
445                                          With Honors
1078                          E.T. the Extra-Terrestrial
391                                            Fall Time
206                                   White Man's Burden
Name: title, dtype: object
183                                          Nine Months
1379                              The Portrait of a Lady
1649                                 The Sweet Hereafter
1415                                      Absolute Power
1725                                 Two Girls and a Guy
1703    A Paralyzing Fear: The Story of Polio in America
445                                          With Honors
1078

In [45]:
# To have a great Precision , we will take on consideration more features :
# Credits, Genres, and Keywords Based Recommender
credits = pd.read_csv('./credits.csv')
keywords = pd.read_csv('./keywords.csv')

# Remove rows with bad IDs.
try:
    metadata = metadata.drop([19730, 29503, 35587])
except:
    print("Ids Already removed : metadata.drop([19730, 29503, 35587])")
# metadata = metadata.drop([19730])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Print the first two movies of your newly merged metadata
metadata.head(2)


Ids Already removed : metadata.drop([19730, 29503, 35587])


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [46]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
# features = ['cast_y', 'crew_y', 'keywords_y', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)


In [47]:
# Import Numpy
import numpy as np
# Get the director's name from the crew feature
# f the director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [48]:
# function that will return the top 3 elements or the entire list, whichever is more. 
# Here the list refers to the cast, keywords, and genres.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [49]:
# Define new director, cast, genres and keywords features that are in a suitable form.
# metadata['director'] = metadata['crew_y'].apply(get_director)

# features = ['cast_y', 'keywords_y', 'genres']
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [50]:
# Print the new features of the first 3 films
# metadata[['title', 'cast_y', 'director', 'keywords_y', 'genres']].head(3)
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [51]:
# It is done so that your vectorizer doesn't count the Johnny of "Johnny Depp"
#  and "Johnny Galecki" as the same. After this processing step, 
# the aforementioned actors will be represented as "johnnydepp" and "johnnygalecki" 
# and will be distinct to your vectorizer.
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [52]:
# Apply clean_data function to your features.
# features = ['cast_y', 'keyword_y', 'director', 'genres']
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [53]:
#"metadata soup" is a string that contains all the metadata that you want to feed 
# to your vectorizer (namely actors, director and keywords).

# The create_soup function will simply join all the required columns by a space.
#  This is the final preprocessing step, and the output of this function will 
# be fed into the word vector model.
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [54]:
# plot description based recommender. One key difference is that you use the CountVectorizer()
#  instead of TF-IDF. This is because you do not want to down-weight the actor/director's 
# presence if he or she has acted or directed in relatively more movies. It doesn't make
#  much intuitive sense to down-weight them in this context.

# The major difference between CountVectorizer() and TF-IDF is the inverse document frequency (IDF)
#  component which is present in later and not in the former.

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])
# 73,881 vocabularies in the metadata that you fed to it.
count_matrix.shape

(2012, 6343)

In [55]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [56]:
# get_recommendations() function by passing in the new cosine_sim2 matrix as your second argument.
get_recommendations('The Dark Knight Rises', cosine_sim2)

KeyError: 'The Dark Knight Rises'