In [1]:
# The objective is to show the top 250 movies on IMDb based on various factors

import pandas as pd
data=pd.read_csv('C://Users/Harvy/Downloads/tmdb_5000_movies.csv')

In [2]:
data.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
# Simply sorting movies on the basis of rating is not wise because ratings are prone to 
# fanboys and haters alike. So we devise a weighted rating based on the following observations
# m: Cut-off for the chart
# v: number of votes for the movie
# R: Average rating of the movie
# C: Mean vote across the whole report
# WR=(v*R+m*C)/(v+m)

C= data['vote_average'].mean()
print(C)

6.092171559442011


m= data['vote_count'].quantile(0.90
print(m)

In [4]:
m=data['vote_count'].quantile(0.75)

In [5]:
print(m)

737.0


In [6]:
movies= data.copy().loc[data['vote_count']>=m]
movies.shape

(1203, 20)

In [7]:
# Function to compute weighted rating
def weighted_rating(x,m=m,C=C):
    v=x['vote_count']
    R=x['vote_average']
    return (v/(v+m)*R)+(m/(v+m)*C)

In [8]:
movies['score']=movies.apply(weighted_rating,axis=1)
movies=movies.sort_values('score',ascending=False)

In [9]:
movies[['title','vote_count','vote_average','score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.301547
3337,The Godfather,5893,8.4,8.143459
662,Fight Club,9413,8.3,8.139688
3232,Pulp Fiction,8428,8.3,8.122458
65,The Dark Knight,12002,8.2,8.078054
809,Forrest Gump,7927,8.2,8.020698
96,Inception,13752,8.1,7.997869
1818,Schindler's List,4329,8.3,7.978806
3865,Whiplash,4254,8.3,7.973979
95,Interstellar,10867,8.1,7.972478


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove all English stop words like the,a,..
tfidf=TfidfVectorizer(stop_words="english")
movies['overview']=movies['overview'].fillna(' ')
tfidf_matrix=tfidf.fit_transform(movies['overview'])

tfidf_matrix.shape

(1203, 9610)

In [11]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim=linear_kernel(tfidf_matrix,tfidf_matrix)

# Construct a reverse map of indices and movie titles
indices= pd.Series(movies.index,index=movies['title'].drop_duplicates())

In [12]:
def get_recommendations(title,cosine_sim=cosine_sim):
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores=sorted(sim_scores,key=lambda x: x[1],reverse=True)
    sim_scores=sim_scores[1:11]
    
    movie_indices=[i[0] for i in sim_scores]
    
    return movies['title'].iloc[movie_indices]

In [13]:
get_recommendations('The Dark Knight')

1578                             13 Going on 30
687                                         300
339                             The Incredibles
2501                        Hachi: A Dog's Tale
276     Harry Potter and the Chamber of Secrets
3232                               Pulp Fiction
1663                Once Upon a Time in America
1344                       Hot Tub Time Machine
2483                               Philadelphia
1850                                   Scarface
Name: title, dtype: object

In [14]:
# Let us use some more features to fine tune our recommendations
# Features like genre, production companies and information about the crew
# can be used

In [25]:
data['production_companies'][0]

'[{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporation", "id": 306}, {"name": "Dune Entertainment", "id": 444}, {"name": "Lightstorm Entertainment", "id": 574}]'

In [33]:
# A utility function to remove a given pattern from a string
# The function might help us later
import re
def remove_pattern(input_txt,pattern):
    r=re.findall(pattern,input_txt)
    
    for i in r:
        input_txt=re.sub(i,' ',input_txt)
    return input_txt    

In [31]:
# Step 1 in data cleaning: Keep only the alphabetic characters
# We don't need the numbers as well
import numpy as np
for i in data['production_companies'].index:
    string=data['production_companies'][i]
    data['production_companies'][i]=''.join(e for e in string if e.isalpha())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [32]:
data['production_companies'][0]

'nameIngeniousFilmPartnersidnameTwentiethCenturyFoxFilmCorporationidnameDuneEntertainmentidnameLightstormEntertainmentid'

In [34]:
# Step 2: remove all unnecessary strings like 'name','id'
# 'Entertainment' etc. which will play no part in the importance
# of the feature
for i in data['production_companies'].index:
    data['production_companies'][i]=data['production_companies'][i].replace('name',' ')
    data['production_companies'][i]=data['production_companies'][i].replace('id',' ')
    data['production_companies'][i]=data['production_companies'][i].replace('Entertainment','')
    data['production_companies'][i]=data['production_companies'][i].replace('Corporation','')
    data['production_companies'][i]=data['production_companies'][i].replace('Partners','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [35]:
data['production_companies'][0]

' IngeniousFilm  TwentiethCenturyFoxFilm  Dune  Lightstorm '

In [36]:
# Let us do a similar treatment for 'Genre'
for i in data['genres'].index:
    string=data['genres'][i]
    data['genres'][i]=''.join(e for e in string if e.isalpha())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [37]:
for i in data['genres'].index:
    data['genres'][i]=data['genres'][i].replace('name',' ')
    data['genres'][i]=data['genres'][i].replace('id',' ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
data['genres']=data['genres'].fillna(' ')
data['production_companies']=data['production_companies'].fillna(' ')

In [47]:
# Our data is now ready to be fed into the TFIDF Vectorizer
movies=data[['overview','production_companies','genres']]
movies['overview']=movies['overview'].fillna(' ')
tfidf_matrix1=tfidf.fit_transform(movies['overview'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [48]:
tfidf_matrix1.shape

(4803, 20978)

In [49]:
tfidf_matrix2=tfidf.fit_transform(movies['production_companies'])
tfidf_matrix2.shape

(4803, 4949)

In [50]:
tfidf_matrix3=tfidf.fit_transform(movies['genres'])
tfidf_matrix3.shape

(4803, 20)

In [68]:
movies['title']=data['title']
movies.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


(4803, 4)

In [66]:
# linear kernel is used to calculate cosine similarity
# We can directly calculate cosine similarity from tfidf matrix
# Linear kernel multiplies two column vectors using (xT)y
# i.e linear kernel calculates dot product of two column vectors
# We combine the similarity scores of all three features by taking average
from sklearn.metrics.pairwise import linear_kernel
cos_sim=linear_kernel(tfidf_matrix1,tfidf_matrix1)
cos_sim1=linear_kernel(tfidf_matrix2,tfidf_matrix2)
cos_sim2=linear_kernel(tfidf_matrix3,tfidf_matrix3)
cos_sim=[x+y for x,y in zip(cos_sim,cos_sim1)]
cos_sim_final=[x+y for x,y in zip(cos_sim,cos_sim2)]
for i in np.arange(len(cos_sim_final)):
    cos_sim_final[i]/=3

In [70]:
# Creating indices such that we can get index from title
indices= pd.Series(movies.index,index=movies['title'])

In [71]:
def get_recommendations(title,cosine_sim=cos_sim):
    idx=indices[title] # getting index from title
    sim_scores=list(enumerate(cosine_sim[idx])) # getting a list of the similarity score of this movie with all other movies
    sim_scores=sorted(sim_scores,key=lambda x: x[1],reverse=True)# sorting in 'most similar first' way 
    sim_scores=sim_scores[1:11] # taking top 10 recommendations
    
    movie_indices=[i[0] for i in sim_scores]
    
    return movies['title'].iloc[movie_indices]

In [72]:
get_recommendations('The Dark Knight')

3                         The Dark Knight Rises
119                               Batman Begins
14                                 Man of Steel
96                                    Inception
3854    Batman: The Dark Knight Returns, Part 2
9            Batman v Superman: Dawn of Justice
72                                Suicide Squad
10                             Superman Returns
1892                                 The Losers
95                                 Interstellar
Name: title, dtype: object

In [73]:
# As wee can see, the recommendations are tuned much better
# than the previous versions of the same system

In [75]:
get_recommendations("Schindler's List")

508          The Lost World: Jurassic Park
675                          Jurassic Park
793     The Flintstones in Viva Rock Vegas
2967            E.T. the Extra-Terrestrial
953              Gremlins 2: The New Batch
2911                              Gremlins
469                    The Legend of Zorro
2346                 Young Sherlock Holmes
1191                        Small Soldiers
1152            Back to the Future Part II
Name: title, dtype: object