In [3]:
import pandas as pd

In [4]:
# Read the movie metadata CSV into a DataFrame.
movie_data = pd.read_csv(filepath_or_buffer="data/movies_metadata.csv")

In [12]:
# Preview the first row to see what each column holds.
movie_data.iloc[:1]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [10]:
# Print every column name as a plain Python list.
print(movie_data.columns.tolist())

['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']


In [18]:
# Number of non-missing values in each column.
movie_data.notna().sum()

adult                    45466
belongs_to_collection     4494
budget                   45466
genres                   45466
homepage                  7782
id                       45466
imdb_id                  45449
original_language        45455
original_title           45466
overview                 44512
popularity               45461
poster_path              45080
production_companies     45463
production_countries     45463
release_date             45379
revenue                  45460
runtime                  45203
spoken_languages         45460
status                   45379
tagline                  20412
title                    45460
video                    45460
vote_average             45460
vote_count               45460
dtype: int64

In [17]:
# Total number of rows in the dataset.
movie_data.shape[0]

45466

### Making a basic recommender system based on movie ratings

For this example, we'll use the IMDb rating formula:

$$WR = \frac{v*R+m*C}{v+m}$$

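where $v$ is the movie's number of votes, $m$ is the minimum number of votes required to be ranked, $R$ is the movie's average rating, and $C$ is the mean rating across all movies.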

In [25]:
# Vote count at each decile from the median (0.5) up to the maximum (1.0)
for i in range(5, 11):
    q = 0.1 * i
    votes_at_q = movie_data['vote_count'].quantile(q)
    print("Quantile {} :".format(round(q, 1)), round(votes_at_q, 2))

Quantile 0.5 : 10.0
Quantile 0.6 : 15.0
Quantile 0.7 : 25.0
Quantile 0.8 : 50.0
Quantile 0.9 : 160.0
Quantile 1.0 : 14075.0


In [29]:
# The vote count jumps sharply between the 0.9 and 1.0 quantiles,
# so step through that range in 0.01 increments.
for i in range(11):
    q = 0.9 + 0.01 * i
    quan_count = round(movie_data['vote_count'].quantile(q), 2)
    print("Quantile {} :".format(round(q, 2)), quan_count)
    # Number of movies whose vote count is strictly above this quantile value
    above_quantile = movie_data['vote_count'] > quan_count
    print("Count : ", int(above_quantile.sum()))

Quantile 0.9 : 160.0
Count :  4538
Quantile 0.91 : 192.0
Count :  4082
Quantile 0.92 : 228.0
Count :  3628
Quantile 0.93 : 281.0
Count :  3179
Quantile 0.94 : 346.0
Count :  2727
Quantile 0.95 : 434.0
Count :  2268
Quantile 0.96 : 576.64
Count :  1819
Quantile 0.97 : 821.23
Count :  1364
Quantile 0.98 : 1236.82
Count :  910
Quantile 0.99 : 2183.82
Count :  455
Quantile 1.0 : 14075.0
Count :  0


In [30]:
# Vote counts grow quickly inside the top 10%, but a large vote count does not imply a high rating.
# To avoid dropping well-rated movies with fewer votes, use the 85th-percentile vote count
# (i.e. the top 15%) as the minimum number of votes needed.
m = movie_data['vote_count'].quantile(q=0.85)
m

82.0

In [35]:
# Keep only the movies with strictly more than m votes.
# Filter first and copy afterwards: only the selected rows are duplicated
# (instead of copying the whole frame and then filtering it), and top_mdata
# is an independent frame that can be modified without touching movie_data.
# NOTE(review): the strict `>` leaves out movies with exactly m votes - confirm
# whether the "minimum votes required" threshold is meant to be inclusive.
top_mdata = movie_data.loc[movie_data['vote_count'] > m].copy()

In [38]:
# Fraction of all movies that pass the vote-count threshold.
top_mdata.shape[0] / movie_data.shape[0]

0.1492763823516474

In [40]:
# Mean of vote_average over the whole dataset, rounded to 3 decimal places.
C = round(movie_data['vote_average'].mean(), 3)
C

5.618

In [41]:
def weightRating(df, m, C):
    """Return the weighted rating WR = (v*R + m*C) / (v + m).

    Parameters
    ----------
    df : object indexable by 'vote_count' and 'vote_average'
        Source of v (number of votes) and R (average rating).
    m : number
        Minimum-vote weight applied to C.
    C : number
        Baseline rating that scores are pulled towards.

    Returns
    -------
    The result of the formula, evaluated with whatever arithmetic the
    looked-up values support.
    """
    votes, rating = df['vote_count'], df['vote_average']
    total_weight = votes + m
    return (votes * rating + m * C) / total_weight

In [44]:
# Weighted rating for every movie that passed the vote-count threshold.
wRating = weightRating(df=top_mdata, m=m, C=C)

In [45]:
# Store the weighted ratings in the 'wRating' column of top_mdata (modifies top_mdata in place).
top_mdata['wRating'] = wRating

In [52]:
# Check the first row to confirm the wRating column is present.
top_mdata.iloc[:1]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,wRating
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.668942


In [54]:
# Keep only the title and weighted rating for the ranking table.
# Select the two columns first and copy afterwards, so the full frame is not
# duplicated just to be narrowed down; rank is still independent of top_mdata.
rank = top_mdata[['original_title', 'wRating']].copy()

In [60]:
# Order the ranking from highest to lowest weighted rating.
rank = rank.sort_values(by='wRating', ascending=False)

In [64]:
# Top 100 movies according to the weighted rating
rank.iloc[:100]

Unnamed: 0,original_title,wRating
10309,Dilwale Dulhania Le Jayenge,8.715715
314,The Shawshank Redemption,8.472000
834,The Godfather,8.461296
40251,君の名は。,8.287478
12481,The Dark Knight,8.282194
...,...,...
3189,City Lights,7.797483
15530,Mr. Nobody,7.789797
39085,Planet Earth,7.788667
12280,There Will Be Blood,7.787478
