In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on = 'title')

# Demographic Filtering-
They offer generalized recommendations to every user, based on movie popularity and/or genre. The System recommends the same movies to users with similar demographic features. Since each user is different , this approach is considered to be too simple. The basic idea behind this system is that movies that are more popular and critically acclaimed will have a higher probability of being liked by the average audience.

In [4]:
movies.dropna(inplace = True)

I have used the TMDB Ratings to come up with our Top Movies Chart. I have used IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating (WR) =  (vv+m.R)+(mv+m.C) 
where,

v is the number of votes for the movie
m is the minimum votes required to be listed in the chart
R is the average rating of the movie
C is the mean vote across the whole report

In [5]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.813922356091031

In [15]:
m = vote_counts.quantile(0.40)
m

437.4000000000001

In [16]:
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [17]:
qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(896, 6)

In [18]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [27]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [28]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [29]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
96,Inception,2010,13752,8,167.58371,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",7.932612
65,The Dark Knight,2008,12002,8,187.322927,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name...",7.923132
95,Interstellar,2014,10867,8,724.247784,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 18, ""...",7.915414
662,Fight Club,1999,9413,8,146.757391,"[{""id"": 18, ""name"": ""Drama""}]",7.902929
262,The Lord of the Rings: The Fellowship of the Ring,2001,8705,8,138.049577,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",7.895411
329,The Lord of the Rings: The Return of the King,2003,8064,8,123.630332,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",7.887526
330,The Lord of the Rings: The Two Towers,2002,7487,8,106.914973,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",7.879336
2917,Star Wars,1977,6624,8,126.393695,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...",7.864589
77,Inside Out,2015,6560,8,128.655964,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",7.863351
2291,Back to the Future,1985,6079,8,76.603233,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 35, ""...",7.853264


In [33]:
import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [34]:
movies['genres'] = movies['genres'].apply(convert)

In [35]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = movies.drop('genres', axis=1).join(s)

  s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


In [36]:
def build_chart(genre, percentile=0.40):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [37]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
25,Titanic,1997,7562,7,100.025899,6.952274
81,Maleficent,2014,4496,7,110.620647,6.922028
2003,Her,2013,4097,7,53.682367,6.91502
2158,Eternal Sunshine of the Spotless Mind,2004,3652,7,56.481487,6.905551
1701,Aladdin,1992,3416,7,92.982009,6.89962
2553,The Theory of Everything,2014,3311,7,61.182331,6.896735
1263,Amélie,2001,3310,7,73.720244,6.896707
100,The Curious Case of Benjamin Button,2008,3292,7,60.269279,6.896195
1372,The Devil Wears Prada,2006,3088,7,83.893257,6.890021
1565,The Notebook,2004,3067,7,55.109138,6.889344


# Content Based Recommender
They suggest similar items based on a particular item. This system uses item metadata, such as genre, director, description, actors, etc. for movies, to make these recommendations. The general idea behind these recommender systems is that if a person liked a particular item, he or she will also like an item that is similar to it.

I have build content based recommender system based on:
Genres, keywords, title, overview, cast and crew

In [41]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [42]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Function to fetch top 3 cast of the movie

In [43]:
def convert2(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [44]:
movies['cast'] = movies['cast'].apply(convert2)

Function to fetch the director of the movie

In [45]:
def fetch_director(obj):
    L = []

    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [46]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [47]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [49]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [50]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [51]:
final= movies[['movie_id','title','tags']]

In [52]:
final

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4779,2292,Clerks,"[Convenience, and, video, store, clerks, Dante..."
4787,255266,Dry Spell,"[Sasha, tries, to, get, her, soon-to-be, ex, h..."
4797,157185,Tin Can Man,"[Recently, dumped, by, his, girlfirend, for, a..."
4802,14337,Primer,"[Friends/fledgling, entrepreneurs, invent, a, ..."


In [53]:
final['tags'] = final['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['tags'] = final['tags'].apply(lambda x: " ".join(x))


In [54]:
final.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [56]:
final['tags'] = final['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['tags'] = final['tags'].apply(lambda x:x.lower())


In [57]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [58]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return  " ".join(y)

In [60]:
final['tags'] = final['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['tags'] = final['tags'].apply(stem)


In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',max_features=5000)

tfidf_matrix = tfidf.fit_transform(final['tags'].fillna(''))

tfidf_matrix.shape

(1494, 5000)

# Cosine Similarity
I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows:

In [64]:
from sklearn.metrics.pairwise import cosine_similarity


In [81]:
similarity = cosine_similarity(tfidf_matrix)

In [82]:
def recommend(movie):
    movie_index = final[final['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True, key=lambda x:x[1])[1:11]
    
    for i in movies_list:
        print(final.iloc[i[0]].title)
        

In [85]:
recommend('Avatar')

Aliens vs Predator: Requiem
Battle: Los Angeles
Meet Dave
Jupiter Ascending
Beowulf
Apollo 18
Attack the Block
Aliens in the Attic
Edge of Tomorrow
Ender's Game


# Saving as Pickle file

In [None]:
pickle.dump(final.to_dict(),open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))