**Goal: Take the number of votes into account to make 'ratings' more meaningful**

In [0]:
from pyspark.sql.functions import *

In [0]:
def load_dataframe(file_path, sep='\t', null_value=r'\N', header=True, infer_schema=True):
    r"""Read a delimited text file into a Spark DataFrame.

    With the default arguments this behaves exactly like the original
    hard-coded loader: tab-separated values, first line used as the header,
    column types inferred from the data, and the literal ``\N`` read as null.

    Uses the notebook's ambient ``spark`` session.

    Args:
        file_path: Path of the file to read.
        sep: Field separator.
        null_value: String that should be interpreted as a null value.
        header: Whether the first line holds the column names.
        infer_schema: Whether Spark should infer column types from the data.

    Returns:
        The DataFrame produced by ``spark.read.csv``.
    """
    return spark.read.csv(
        file_path,
        header=header,
        inferSchema=infer_schema,
        sep=sep,
        nullValue=null_value,
    )

In [0]:
# Title metadata (type, titles, years, runtime, genres), keyed by tconst.
basics = load_dataframe(file_path='/mnt/data_source/TSV/title.basics.tsv')

In [0]:
# Per-title rating summary (averageRating, numVotes), keyed by tconst.
ratings = load_dataframe(file_path='/mnt/data_source/TSV/title.ratings.tsv')

In [0]:
# Inner join on the shared title id: titles without a ratings row are dropped.
joined_df = basics.join(ratings, on='tconst', how='inner')

**'Highest' rated movies**

In [0]:
# Naive ranking: best average rating first, ties broken by vote count.
display(
    joined_df.sort(
        col('averageRating').desc(),
        col('numVotes').desc(),
    )
)

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
tt2301451,tvEpisode,Ozymandias,Ozymandias,0,2013.0,,48.0,"Crime,Drama,Thriller",10.0,108673
tt9906260,tvEpisode,Hero,Hero,0,2019.0,,24.0,"Action,Adventure,Animation",10.0,22916
tt8084176,tvEpisode,407 Proxy Authentication Required,407 Proxy Authentication Required,0,2019.0,,56.0,"Crime,Drama,Thriller",10.0,5022
tt6735740,movie,Love in Kilnerry,Love in Kilnerry,0,2019.0,,100.0,Comedy,10.0,2836
tt10914342,movie,Kirket,Kirket,0,2019.0,,132.0,"Drama,Sport",10.0,587
tt10944636,short,Poaching,Poaching,0,2019.0,,,"Music,Short",10.0,551
tt7343554,tvEpisode,You and I Bee-come One,You and I Bee-come One,0,2017.0,,,"Action,Adventure,Animation",10.0,383
tt4172678,tvEpisode,Who's Giggling Now?,Who's Giggling Now?,0,2014.0,,23.0,"Family,Fantasy,Horror",10.0,320
tt4857956,tvEpisode,Where Dragons Dare,Where Dragons Dare,0,2015.0,,23.0,"Adventure,Animation,Comedy",10.0,291
tt7621898,video,Fergie: Tension,Fergie: Tension,0,2017.0,,4.0,"Music,Short",10.0,220


**Relationship between rating and number of votes (total votes per rating value)**

In [0]:
# Total number of votes cast at each distinct rating value, highest rating first.
display(
    joined_df
    .groupBy('averageRating')
    .sum('numVotes')
    .sort(col('averageRating').desc())
)

averageRating,sum(numVotes)
10.0,164789
9.9,730661
9.8,430515
9.7,837937
9.6,843905
9.5,3054076
9.4,2782514
9.3,4101440
9.2,3683408
9.1,2929561


**Weighting ratings using a Bayesian estimate**

In [0]:
# https://stackoverflow.com/questions/1411199/what-is-a-better-way-to-sort-by-a-5-star-rating

# wr: weighted rating
# v: number of votes for the movie
# m: minimum number of votes required to be among the 500 most-voted titles
# R: average rating for the movie
# C: the mean rating across the whole dataset (computed below from the data; the quoted source formula cites 7.0)

# wr = (R * v + C * m) / (v + m);

In [0]:
# C: mean of averageRating over every joined title (each title counts once).
C = joined_df.agg({'averageRating': 'avg'}).first()[0]

In [0]:
# m: smallest vote count among the 500 most-voted titles.
m = (
    joined_df
    .sort(col('numVotes').desc())
    .limit(500)
    .sort(col('numVotes').asc())
    .first()['numVotes']
)

In [0]:
# Bayesian weighted rating per title: wr = (R * v + C * m) / (v + m),
# with R = averageRating and v = numVotes; C and m are the scalars C and m.
joined_df_rating = joined_df.withColumn(
    'wr',
    (joined_df['averageRating'] * joined_df['numVotes'] + C * m)
    / (joined_df['numVotes'] + m),
)

In [0]:
# Re-rank every title by the weighted rating, best first.
display(joined_df_rating.sort(col('wr').desc()))

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,wr
tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,,142.0,Drama,9.3,2160364,9.007665342217653
tt0903747,tvSeries,Breaking Bad,Breaking Bad,0,2008,2013.0,49.0,"Crime,Drama,Thriller",9.5,1280780,9.007046462745885
tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011,2019.0,57.0,"Action,Adventure,Drama",9.4,1606675,9.007037847324131
tt0068646,movie,The Godfather,The Godfather,0,1972,,175.0,"Crime,Drama",9.2,1482703,8.813116981186935
tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152.0,"Action,Crime,Drama",9.0,2135266,8.741357737484895
tt0110912,movie,Pulp Fiction,Pulp Fiction,0,1994,,154.0,"Crime,Drama",8.9,1695672,8.599254271119863
tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,,201.0,"Adventure,Drama,Fantasy",8.9,1534048,8.572717989515032
tt1375666,movie,Inception,Inception,0,2010,,148.0,"Action,Adventure,Sci-Fi",8.8,1893504,8.539993066098972
tt0071562,movie,The Godfather: Part II,The Godfather: Part II,0,1974,,202.0,"Crime,Drama",9.0,1032435,8.526913990121438
tt0137523,movie,Fight Club,Fight Club,0,1999,,139.0,Drama,8.8,1725939,8.518463482542671


**... makes more sense to me.**