In [95]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from ast import literal_eval

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
metadata = pd.read_csv('../Datasets/the-movies-dataset/movies_metadata.csv', nrows=10000)

In [3]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


In [23]:
metadata.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
adult                    10000 non-null bool
belongs_to_collection    1421 non-null object
budget                   10000 non-null int64
genres                   10000 non-null object
homepage                 662 non-null object
id                       10000 non-null int64
imdb_id                  9999 non-null object
original_language        10000 non-null object
original_title           10000 non-null object
overview                 9971 non-null object
popularity               10000 non-null float64
poster_path              9969 non-null object
production_companies     10000 non-null object
production_countries     10000 non-null object
release_date             9995 non-null object
revenue                  10000 non-null int64
runtime                  9994 non-null float64
spoken_languages         10000 non-null object
status                   9992 non-null object
tagline    

In [5]:
metadata.loc[1]

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                         17.0155
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     

In [6]:
metadata.loc[1][3]

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [7]:
metadata.iloc[1][3]

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [12]:
#Function to convert to float manually
def to_float(x):
    try:
        x = float(x)
    except: 
        x = np.nan
    return x

In [24]:
metadata.budget = metadata.budget.apply(to_float)

In [25]:
metadata.release_date = pd.to_datetime(metadata.release_date, errors='coerce')

In [26]:
#Extract year from the datetime
metadata['year'] = metadata['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [31]:
metadata[['title','release_date', 'year']].sort_values(by= 'year', ascending=True).head(10)

Unnamed: 0,title,release_date,year
9952,A Trip to the Moon,1902-09-01,1902
6913,The Birth of a Nation,1915-02-08,1915
8293,The Cheat,1915-12-13,1915
7090,Intolerance: Love's Struggle Throughout the Ages,1916-09-04,1916
7817,The Immigrant,1917-06-17,1917
3191,A Dog's Life,1918-04-14,1918
2708,The Spiders - The Golden Sea,1919-10-03,1919
3015,Daddy-Long-Legs,1919-05-11,1919
6836,Broken Blossoms,1919-05-13,1919
2706,Male and Female,1919-11-23,1919


In [33]:
metadata[['title','revenue','year']].sort_values(by= 'revenue', ascending=False).head(10)

Unnamed: 0,title,revenue,year
1639,Titanic,1845034188,1997
7000,The Lord of the Rings: The Return of the King,1118888979,2003
4766,Harry Potter and the Philosopher's Stone,976475550,2001
6232,Finding Nemo,940335536,2003
5814,The Lord of the Rings: The Two Towers,926287400,2002
2514,Star Wars: Episode I - The Phantom Menace,924317558,1999
475,Jurassic Park,920100000,1993
7717,Shrek 2,919838758,2004
5678,Harry Potter and the Chamber of Secrets,876688482,2002
4863,The Lord of the Rings: The Fellowship of the Ring,871368364,2001


In [36]:
metadata.year.value_counts(ascending=False).head()

2002    492
2001    448
2000    433
2003    422
1998    401
Name: year, dtype: int64

In [41]:
metadata.year.dtype

dtype('O')

In [43]:
def to_int(x):
    try:
        x = int(x)
    except: 
        x = np.nan
    return x

metadata.year = metadata.year.apply(to_int)

In [45]:
#Convert all NaN into stringified empty lists
metadata['genres'] = metadata['genres'].fillna('[]')

#Apply literal_eval to convert stringified empty lists to the list object
metadata['genres'] = metadata['genres'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
metadata['genres'] = metadata['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])

In [46]:
metadata.genres.head()

0     [animation, comedy, family]
1    [adventure, fantasy, family]
2               [romance, comedy]
3        [comedy, drama, romance]
4                        [comedy]
Name: genres, dtype: object

#### Knowledge based recommender

In [54]:
def build_chart(gen_df, percentile=0.8):
    #Ask for preferred genres
    print("Input preferred genre")
    genre = input()
    
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    
    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    
    #Create a new feature by exploding genres
    s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)

    #Name the new feature as 'genre'
    s.name = 'genre'

    #Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
    movies = movies.drop('genres', axis=1).join(s)
    
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) & 
                    (movies['runtime'] >= low_time) & 
                    (movies['runtime'] <= high_time) & 
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= high_year)]
    
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    
    #Calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                       + (m/(m+x['vote_count']) * C)
                                       ,axis=1)

    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    
    return q_movies

In [55]:
#Generate the chart for top animation movies and display top 5.
build_chart(metadata).head()

Input preferred genre
animation
Input shortest duration
60
Input longest duration
300
Input earliest year
1990
Input latest year
2000


Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre,score
359,False,"{'id': 94032, 'name': 'The Lion King Collectio...",45000000.0,http://movies.disney.com/the-lion-king,8587,tt0110357,en,The Lion King,A young lion cub named Simba can't wait to be ...,21.605761,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Life's greatest adventure is finding your plac...,The Lion King,False,8.0,5520,1994.0,animation,7.761959
2884,False,,26500000.0,,128,tt0119698,ja,もののけ姫,"Ashitaka, a prince of the disappearing Ainu tr...",17.166725,...,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,The Fate Of The World Rests On The Courage Of ...,Princess Mononoke,False,8.2,2041,1997.0,animation,7.630454
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415,1995.0,animation,7.510795
588,False,"{'id': 153010, 'name': 'Beauty and the Beast C...",25000000.0,http://disney.go.com/disneyvideos/animatedfilm...,10020,tt0101414,en,Beauty and the Beast,"Follow the adventures of Belle, a bright young...",23.433511,...,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,The most beautiful love story ever told.,Beauty and the Beast,False,7.5,3029,1991.0,animation,7.25795
546,False,,18000000.0,,9479,tt0107688,en,The Nightmare Before Christmas,Tired of scaring humans every October 31 with ...,17.730913,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A ghoulish tale with wicked humor & stunning a...,The Nightmare Before Christmas,False,7.6,2135,1993.0,animation,7.25707


#### Content based recommender

In [57]:
df = metadata[['title' ,'genres','runtime','vote_average','vote_count','year','overview','id']]

In [59]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


(10000, 32350)

In [61]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [62]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [63]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [64]:
#Get recommendations for The Lion King
content_recommender('The Lion King')

9353                  The Lion King 1½
9115    The Lion King 2: Simba's Pride
3203                  The Waiting Game
6094                         Born Free
6574     Once Upon a Time in China III
2779             Napoleon and Samantha
892                   The Wizard of Oz
3293                          The Bear
2094                 Shadow of a Doubt
5863                The King of Comedy
Name: title, dtype: object

In [65]:
# Load the keywords and credits files
credits = pd.read_csv('../Datasets/the-movies-dataset/credits.csv' ,nrows=10000)
keywords = pd.read_csv('../Datasets/the-movies-dataset/keywords.csv')

In [66]:
display(credits.head())
display(keywords.head())

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [67]:
df['id'] = df['id'].apply(to_int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [69]:
#Filter all rows that have a null ID
#df = df[df['id'].notnull()]

df.dropna(subset=['id'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [70]:
df.shape

(10000, 8)

In [72]:
# Convert IDs into integer
df['id'] = df['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

#Display the head of df
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415,1995.0,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413,1995.0,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92,1995.0,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34,1995.0,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,[comedy],106.0,5.7,173,1995.0,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [81]:
#features = ['cast', 'crew', 'keywords']
#for feature in features:
#    df[feature] = df[feature].apply(literal_eval)

#df.crew = df.crew.apply(literal_eval)

In [78]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [79]:
# Extract the director's name. If director is not listed, return NaN
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [80]:
#Define the new director feature
df['director'] = df['crew'].apply(get_director)

#Print the directors of the first five movies
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [82]:
# Returns the list top 3 elements or entire list; whichever is more.
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [83]:
#Apply the generate_list function to cast and keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

In [84]:
#Only consider a maximum of 3 genres
df['genres'] = df['genres'].apply(lambda x: x[:3])

In [85]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords,director
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415,1995.0,"Led by Woody, Andy's toys live happily in his ...",862,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",John Lasseter
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413,1995.0,When siblings Judy and Peter discover an encha...,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",Joe Johnston
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92,1995.0,A family wedding reignites the ancient feud be...,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger]",Howard Deutch
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34,1995.0,"Cheated on, mistreated and stepped on, the wom...",31357,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",Forest Whitaker
4,Father of the Bride Part II,[comedy],106.0,5.7,173,1995.0,Just when George Banks has recovered from his ...,11862,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence]",Charles Shyer


In [86]:
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [87]:
#Apply the generate_list function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [88]:
#Function that creates a soup out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [89]:
# Create the new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [91]:
df.soup.loc[1]

"boardgame disappearance basedonchildren'sbook robinwilliams jonathanhyde kirstendunst joejohnston adventure fantasy family"

In [93]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [94]:
count_matrix

<10028x20420 sparse matrix of type '<class 'numpy.int64'>'
	with 85606 stored elements in Compressed Sparse Row format>

In [96]:
#Compute the cosine similarity score (equivalent to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [97]:
# Reset index of your df and construct reverse mapping again
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

In [98]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

3318                Creature Comforts
3479                     Time Masters
3706    Thomas and the Magic Railroad
7036                    Teacher's Pet
1006              So Dear to My Heart
2769                       Thumbelina
4919            The Flight of Dragons
1636                 Ill Gotten Gains
3469       Jails, Hospitals & Hip-Hop
651         James and the Giant Peach
Name: title, dtype: object