In [2]:
import pandas as pd
import numpy as np

# Read data from the CSV file into the variable df
df = pd.read_csv('..\data\metadata_clean.csv')

# Display the data in tabular format
df.head()


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995


# Plot description-based recommender

In [4]:
#Read data from the original CSV file into the variable orig_df
orig_df = pd.read_csv('..\data\movies_metadata.csv',low_memory=False)

#Select the 'overview' and 'id' columns
df['overview'], df['id'] = orig_df['overview'], orig_df['id']
df.head()


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [5]:
#Import TfIdfVectorizer from scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer
#Create a TF-IDF Vectorizer Object and remove English Stopwords
tfidf = TfidfVectorizer(stop_words='english')
#Convert NaN to Empty String
df['overview'] = df['overview'].fillna('')
#Create a TF-IDF matrix using the fit_transform method with data from the 'overview' column
tfidf_matrix = tfidf.fit_transform(df['overview'])
#Show Shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [6]:
#Import linear_kernel to perform calculations
from sklearn.metrics.pairwise import linear_kernel
#Create the Cosine Similarity Matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
#Create Reverse Index for Movie
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [10]:
def content_recommender(title, cosine_sim=cosine_sim, df=df,indices=indices):
    #Read the index of the movie based on the movie name provided in the function
    idx = indices[title]
    
    #Read the similarity scores between the movie provided and other movies, 
    #then convert them into a list to store the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #Sort the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Read the scores of the top 10 highest similarity scores, excluding the first entry
    sim_scores = sim_scores[1:11]
    
    #Read the index values of the top 10 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    
    #Send the names of the top 10 movies
    return df['title'].iloc[movie_indices]

In [12]:
content_recommender('Toy Story')

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object

# Metadata-based recommender

In [13]:
#Load keywords and credits files
cred_df = pd.read_csv('..\data\credits.csv')
key_df = pd.read_csv('..\data\keywords.csv')

In [14]:
#Display cred_df in tabular format
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [15]:
#Display key_df in tabular format
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [16]:
#Function that will convert non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan
    
#Clean ids from df
df['id'] = df['id'].apply(clean_ids)

#Filter all rows that have null ID
df = df[df['id'].notnull()]

In [17]:
# Convert IDs to integer
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')
#Merge keywords and credits into the main dataframe
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')
#Display the merged dataframe df in tabular format
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].astype('int')


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [19]:
#Convert stringified objects to Python objects
from ast import literal_eval
#Create an array of features to store data and insert it into the DataFrame
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)
#Print the information of the first cast member of the first movie in the DataFrame
df.iloc[0]['crew'][0]


{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [22]:
#Extract the name of the director. If not available, return NaN
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

#Display the information of directors for the first 5 movies
df['director'] = df['crew'].apply(get_director)

#Assign the 'director' feature to the DataFrame
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [23]:
#A function that returns a list of 3 elements or all values
def generate_list(x):
    if isinstance(x, list):
        names = [ele['name'] for ele in x]
        
        #Check if there are more than 3 elements. If true, take only the first 3 elements
        #if false, take all elements
        if len(names) > 3:
            names = names[:3]
        return names
    
    #Return an empty list in case of missing or malformed data
    return []


In [24]:
#Use the generate_list function with the cast and keywords data
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

#Use only the first 3 genres data
df['genres'] = df['genres'].apply(lambda x: x[:3])

#Display the feature data of 5 movies along with their movie names
df[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [25]:
#A function that removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        #Remove spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if there is director data. If not, display it as an empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [26]:
#Invoke the sanitize function with cast, keywords, director, and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [29]:
#A function that combines all features together
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
    
#Create soup in DataFrame
df['soup'] = df.apply(create_soup, axis=1)

#Test by displaying the soup data of the first movie
df.iloc[0]['soup']


'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [30]:
#Import CountVectorizer from scikit-learn library
from sklearn.feature_extraction.text import CountVectorizer

#Crate CountVectorizer object and vectors for soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

#Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

#Calculate cosine similarity score
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

#Reset index of df and create index again
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

#Call content_recommender function
content_recommender('The Lion King', cosine_sim2, df, indices2)

29607                                          Cheburashka
40904                   VeggieTales: Josh and the Big Wall
40913    VeggieTales: Minnesota Cuke and the Search for...
27768                                 The Little Matchgirl
15209             Spiderman: The Ultimate Villain Showdown
16613                            Cirque du Soleil: Varekai
24654                                  The Seventh Brother
29198                                      Superstar Goofy
30244                                              My Love
31179                Pokémon: Arceus and the Jewel of Life
Name: title, dtype: object