In [2]:
# import libraries
import pandas as pd

In [3]:
# Load the movie metadata CSV into a DataFrame
df = pd.read_csv('movies_metadata.csv', low_memory=False)

# Preview the first three rows
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [6]:
# (rows, columns) of the loaded metadata frame
df.shape

(45466, 24)

In [29]:
#***************************************************** Simple Recommender System *************************************

# weighted rating
# formula: WeightedRating(WR) = ((v/(v+m)) * R) + ((m/(v+m)) * C)
    # v is the number of votes for the movie;
    # m is the minimum votes required to be listed in the chart;
    # R is the average rating of the movie;
    # C is the mean vote across the whole report.

In [30]:
# Mean of the vote_average column (C in the weighted-rating formula)
C = df['vote_average'].mean()
print(C)

5.618207215133889


In [31]:
# Minimum number of votes required: the 90th percentile of vote_count
m = df['vote_count'].quantile(q=0.90)
m

160.0

In [12]:
# Keep only the movies whose vote count exceeds the threshold m.
# Filter first and copy the (smaller) result, so df_movies is an independent
# frame that can take new columns without first duplicating all of df.
df_movies = df.loc[df['vote_count'] > m].copy()
df_movies.shape

(4538, 24)

In [13]:
# Weighted rating (IMDB formula) for a movie row or a whole frame of movies
def weightedRating(df, m=m, C=C):
    """Return the weighted rating ``v/(v+m) * R + m/(v+m) * C``.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame row or a DataFrame)
        Must provide ``'vote_count'`` (v) and ``'vote_average'`` (R).
    m : number
        Minimum votes required; defaults to the value of ``m`` bound when this
        function was defined.
    C : number
        Mean vote; defaults to the value of ``C`` bound when this function was
        defined.

    Returns
    -------
    The weighted rating, with the same shape as ``df['vote_count']``.
    """
    votes = df['vote_count']
    rating = df['vote_average']
    total = votes + m

    # Blend the movie's own rating with the mean, weighted by its vote share
    return (votes / total * rating) + (m / total * C)

In [16]:
# Add the weighted rating as a new 'score' column.
# weightedRating only performs element-wise arithmetic on the 'vote_count' and
# 'vote_average' entries, so it can be called once on the whole frame instead of
# once per row via apply(axis=1); this avoids a Python-level call for every row.
df_movies['score'] = weightedRating(df_movies)
df_movies['score']

0        7.640253
1        6.820293
4        5.660700
5        7.537201
8        5.556626
           ...   
45174    6.476267
45204    6.671272
45258    6.590372
45265    6.344369
45343    4.791783
Name: score, Length: 4538, dtype: float64

In [24]:
# Rank the qualifying movies from highest to lowest weighted score
df_movies = df_movies.sort_values(by='score', ascending=False)

# Show the ten best-scoring movies
df_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [32]:
#***************************************************** Content-Based Recommender System *************************************
# 1. Based on movie Description
# 2. Based on cast, crew, director, genre

In [4]:
# Show the first five movie descriptions (the 'overview' column)
df.overview.head(5)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [5]:
# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Define a TF-IDF vectorizer that removes English stop words
tfidf = TfidfVectorizer(stop_words = 'english')

In [7]:
# Replace NaN overviews with an empty string so every row is a valid document
df['overview'] = df['overview'].fillna('')

In [8]:
# Fit the vectorizer on the overviews and build the TF-IDF matrix
# (one row per movie, one column per vocabulary term)
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(45466, 75827)

In [9]:
# Print the stored entries of the matrix as (row, column) -> weight
print(tfidf_matrix)

  (0, 17764)	0.13483149538639247
  (0, 4388)	0.1474882034218405
  (0, 38030)	0.10142919482788751
  (0, 21887)	0.10438761058719498
  (0, 19641)	0.13281884272823927
  (0, 48558)	0.10339358185033234
  (0, 59519)	0.13008016104455086
  (0, 12490)	0.12544427954397822
  (0, 51108)	0.13434817283119177
  (0, 29238)	0.10093917370354445
  (0, 50914)	0.09190797940163035
  (0, 39423)	0.11907123344715953
  (0, 1847)	0.140911774178889
  (0, 58571)	0.1135591886873686
  (0, 38693)	0.20627924682810617
  (0, 9874)	0.5028038686135609
  (0, 9087)	0.10635375129287977
  (0, 7491)	0.12380553184830104
  (0, 56872)	0.111248510865236
  (0, 28729)	0.13311522181618415
  (0, 39012)	0.08718689178959059
  (0, 67874)	0.14878284660693247
  (0, 3159)	0.41178365711725945
  (0, 73468)	0.4809827114790237
  (0, 38088)	0.10739705953465473
  :	:
  (45464, 26957)	0.07350962631701621
  (45464, 18919)	0.09271509240923419
  (45464, 18119)	0.07466631763708827
  (45464, 39012)	0.06829617779135382
  (45465, 16520)	0.3237330788694511

In [10]:
# Peek at ten vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so the
# slice is converted back to a plain list of strings for display.
tfidf.get_feature_names_out()[5000:5010].tolist()

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

In [11]:
# cosine similarity score
# import linear kernel
from sklearn.metrics.pairwise import linear_kernel

# compute cosine similarity matrix
# NOTE(review): linear_kernel is a plain dot product; it equals cosine similarity
# only because TfidfVectorizer L2-normalises each row by default — confirm if the
# vectorizer's `norm` setting is ever changed.
# NOTE(review): the result is a dense (n_movies x n_movies) array, presumably
# float64, i.e. many GB for tens of thousands of movies — check available memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
# One row and one column per movie
cosine_sim.shape

(45466, 45466)

In [15]:
# Similarity of the movie at row 1 to every movie
cosine_sim[1]

array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411])

In [16]:
# Construct a reverse mapping from movie title to its row label in df.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single scalar rather than a Series.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [17]:
# First ten entries of the title -> row label mapping
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [20]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix bound when this function
        was defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first,
        taken from the module-level ``df``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series
    # instead of a scalar; fall back to the first matching movie.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every *other* movie's position with its similarity score. The query
    # movie is excluded explicitly rather than by assuming it sorts first:
    # ties (or an all-zero TF-IDF row, whose self-similarity is 0) would
    # otherwise drop a genuine match and could return the query movie itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order),
    # and keep the requested number of matches
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Get the movie indices
    movie_indices = [position for position, _ in sim_scores]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

In [21]:
# Example query: movies whose overviews are most similar to this title
get_recommendations('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [None]:
# 2. Based on cast, crew, director and Keywords
