# Popularity-based Filtering

In [1]:
import pandas
# Load the three input tables, in order, from the relative data/ directory.
movies, credits, ratings = map(
    pandas.read_csv,
    ("data/movies.csv", "data/credits.csv", "data/ratings.csv"),
)

### Calculate the weighted rating and add it as a column to the `movies` DataFrame

$$WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

v = number of votes for a movie  
m = minimum number of votes required (set below to the 90th percentile of `vote_count`)  
R = average rating of the movie  
C = average rating across all movies  

In [2]:
# Vote-count threshold m: the 0.9 quantile (90th percentile) of vote_count.
m = movies.loc[:, "vote_count"].quantile(q=0.9)
m

1838.4000000000015

In [3]:
# Global mean C: the average of the vote_average column over all movies.
C = movies.loc[:, "vote_average"].mean()
C

6.092171559442016

In [4]:
def weighted_rating(df, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    Parameters
    ----------
    df : object indexable by the keys "vote_count" and "vote_average"
        (for example a DataFrame or one of its rows). "vote_count" is used
        as v and "vote_average" as R.
    m : vote-count weight given to the global mean. The default is the value
        of the notebook-level ``m`` at the time this cell was executed.
    C : global mean rating. The default is the value of the notebook-level
        ``C`` at the time this cell was executed.

    Returns
    -------
    The weighted rating, in whatever form the arithmetic on the looked-up
    values produces (scalar in, scalar out; column in, column out).
    """
    votes = df["vote_count"]
    # Shared denominator of both weights.
    total = votes + m
    return (votes / total) * df["vote_average"] + (m / total) * C

In [5]:
# weighted_rating only does element-wise arithmetic on the two columns it
# reads, so it can be evaluated on the whole frame in one vectorized call
# instead of being invoked once per row through DataFrame.apply(axis=1).
movies["weighted_rating"] = weighted_rating(movies)

### Print the top 10 movies with the highest weighted rating

In [6]:
# Keep only the two displayed columns, order by weighted rating (highest
# first), and show the first ten rows.
(
    movies[["title", "weighted_rating"]]
    .sort_values(by="weighted_rating", ascending=False)
    .head(10)
)

Unnamed: 0,title,weighted_rating
1881,The Shawshank Redemption,8.059258
662,Fight Club,7.939256
65,The Dark Knight,7.92002
3232,Pulp Fiction,7.904645
96,Inception,7.863239
3337,The Godfather,7.851236
95,Interstellar,7.809479
809,Forrest Gump,7.803188
329,The Lord of the Rings: The Return of the King,7.727243
1990,The Empire Strikes Back,7.697884
