# Content Based Recommender

In [3]:
#Followed this tutorial: https://www.datacamp.com/community/tutorials/recommender-systems-python
#build a system that recommends movies that are similar to a particular movie

In [4]:
import pandas as pd

#load movies metadata
metadata = pd.read_csv("../datasets/contentbasedrecdata/movies_metadata.csv", low_memory = False)

#print first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [5]:
metadata = metadata.truncate(after=30000)

In [6]:
#plot description is available as the 'overview' feature

#print plot overviews of the first 5 movies
metadata['overview'].head()

#NLP problem. Need to etract features from the overview tets before computing similarity. It is not possible to compute
#similarity between any two overviews in their raw forms. Need to compute word vectors 

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

### term frequency - inverse document frequency (TF-IDF)

In [7]:
#word vectors should carry some kind of semantic meaning
#TF-IDF will return matrix where each column reprets a word in the overview vocabulary
#and each row(?) represents a movie


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define a TF-IDF vectorizer object. remove all english stop wrods such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#replace NaN values with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

tfidf_matrix.shape

(30001, 58563)

In [9]:
#from above output, there are 75,827 different vocabularies or words in dataset of 45,466 movies

### now compute cosine similarity score

In [10]:
#since we have already vectorized the matrix, we can directly take the dot product to find the cosine similarity
#thus, we can use sklearn's linear_kernel() instead of cosine_similarities() since it is faster


In [11]:
from sklearn.metrics.pairwise import linear_kernel

#compute the cosine similarity matrix, this is computationally expensive
#returns each movie's cosine similarity score with every other movie based on the overview
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
cosine_sim.shape

(30001, 30001)

In [13]:
cosine_sim[0]

array([1.        , 0.01561361, 0.        , ..., 0.        , 0.        ,
       0.        ])

### recommendation function

In [14]:
#define function that takes in a movie title as an input and outputs a list of 10 most similar movies
#first need to reverse map movie titles and dataframe indices, mechanism to identify the index of a movie in metadata dataframe

#construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [15]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [16]:
#function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim = cosine_sim):
    #get index of movie that matches the title
    idx = indices[title]
    
    #get pairwise similarity scores of all movies with that title
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort the scores of the 10 most similar movies
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    #get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    #return the top 10 most similar movies 
    return metadata['title'].iloc[movie_indices]

In [17]:
get_recommendations('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

### credits, genres, keywords based recommender

In [18]:
#quality of recommender would be increased with usage of better metadata
#load keywords and credits

credits = pd.read_csv('../datasets/contentbasedrecdata/credits.csv')
keywords = pd.read_csv('../datasets/contentbasedrecdata/keywords.csv')

In [19]:
#remove rows with bad IDs
metadata = metadata.drop([19730, 29503])

In [20]:
#convert IDs to int, which is required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')


In [21]:
#merge keywords and creits into main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [22]:
#data is present in the form of 'stringified' lists, need to convert them into a way that is usable

#parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature]=metadata[feature].apply(literal_eval)

### get director

In [23]:
import numpy as np

#get the director's name from the crew feature. if the director is not listed return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

### return the top 3 elements of cast, keywords, and genres

In [44]:
metadata.loc[0:5, 'cast' ]

0    [T, T, D]
1    [R, J, K]
2    [W, J, A]
3    [W, A, L]
4    [S, D, M]
5    [A, R, V]
Name: cast, dtype: object

In [78]:
def get_list(x):
    if isinstance(x, list):
        names = [x[1] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [79]:
# Define new director, cast, genres and keywords features that are in a suitable form.
#metadata['director'] = metadata['crew'].apply(get_director)

# features = ['cast', 'keywords', 'genres']
# for feature in features:
#     metadata[feature] = metadata[feature].apply(get_list)



metadata['keywords'] = metadata['keywords'].apply(get_list)

metadata['genres'] = metadata['genres'].apply(get_list)


IndexError: list index out of range

In [80]:
metadata['cast'].head()

0    [T, T, T]
1    [R, R, R]
2    [W, W, W]
3    [W, W, W]
4    [S, S, S]
Name: cast, dtype: object

In [77]:
metadata['title'].head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [97]:
metadata['keywords'].head()

0                      [jealousy, jealousy, jealousy]
1                [board game, board game, board game]
2                         [fishing, fishing, fishing]
3    [based on novel, based on novel, based on novel]
4                                  [baby, baby, baby]
Name: keywords, dtype: object

In [96]:
metadata['genres'].head()
metadata['keywords'][0]

['jealousy', 'jealousy', 'jealousy']

In [59]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"[T, T, T]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 931, ...",John Lasseter
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[R, R, R]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",Joe Johnston
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[W, W, W]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 1495,...",Howard Deutch
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[W, W, W]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",Forest Whitaker
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[S, S, S]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1009, 'n...",Charles Shyer


In [94]:
for i,d in metadata.iterrows():
    keywordArr=[]
    for x in d['keywords']:
        keywordArr.append(x.get('name',None))
    metadata.at[i,'keywords']= keywordArr

['jealousy', 'jealousy', 'jealousy']


AttributeError: 'str' object has no attribute 'get'