In [54]:
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [55]:
# Directory holding the MovieLens 25M files. The default is the original
# machine-specific location; set ML25M_DATA_DIR to run the notebook elsewhere
# without editing this cell.
path_prefix = os.environ.get(
    'ML25M_DATA_DIR',
    '/home/joeyresuento/Projects/data_training/data_sets/ml-25m',
)

In [56]:
# Load the raw CSV files from the data directory into pandas DataFrames.
movies_csv_path = f'{path_prefix}/movies.csv'
ratings_csv_path = f'{path_prefix}/ratings.csv'

movies_df = pd.read_csv(movies_csv_path)
ratings_df = pd.read_csv(ratings_csv_path)

In [57]:
# Convert each DataFrame to an Arrow table and write it out as a Parquet file
# alongside the source CSVs.
movies_table = pa.Table.from_pandas(movies_df)
pq.write_table(movies_table, f'{path_prefix}/movies.parquet')

ratings_table = pa.Table.from_pandas(ratings_df)
pq.write_table(ratings_table, f'{path_prefix}/ratings.parquet')

In [58]:
# Read the Parquet files back in as Arrow tables; the rest of the notebook
# works from these rather than from the CSV-derived frames.
movies_parquet_path = f'{path_prefix}/movies.parquet'
ratings_parquet_path = f'{path_prefix}/ratings.parquet'

movies = pq.read_table(movies_parquet_path)
ratings = pq.read_table(ratings_parquet_path)

In [59]:
# Preview the movies table as a DataFrame (the converted frame is displayed, not stored).
movies.to_pandas()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [60]:
# Materialize the ratings Arrow table as a DataFrame and report its columns,
# dtypes and memory footprint.
ratings_df0 = ratings.to_pandas()
ratings_df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [61]:
# Peek at the first few rating rows.
ratings_df0.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [62]:
# Mean of every individual rating in the data set; the weighted-mean
# computation further down uses it as the value that is scaled by `m`.
overall_mean = ratings_df0['rating'].mean()

In [63]:
# Number of ratings per movie. The 'rating' column is selected before
# aggregating so that only that one column is counted, instead of counting
# every column of the ratings frame and discarding the rest.
rating_count = ratings_df0.groupby("movieId")['rating'].count()
rating_count.head()

movieId
1    57309
2    24228
3    11804
4     2523
5    11714
Name: rating, dtype: int64

In [64]:
# Per-movie mean rating as a one-column DataFrame indexed by movieId.
# Selecting [['rating']] before .mean() avoids averaging the other columns
# (userId, timestamp) only to drop them.
ratings_df1 = ratings_df0.groupby('movieId')[['rating']].mean()

In [65]:
# Weighted mean per movie, see https://en.wikipedia.org/wiki/IMDb#Rankings
#   weighted_mean = (R * v + overall_mean * m) / (v + m)
# where R is the movie's mean rating, v its number of ratings, and m a fixed
# weight given to the overall mean, so movies with few ratings are pulled
# toward the overall mean.
ratings_df1['count'] = rating_count

m = 25000
v, R = ratings_df1['count'], ratings_df1['rating']

ratings_df1['weighted_mean'] = (R * v + overall_mean * m) / (v + m)

In [66]:
# Check the per-movie summary frame: row count, null counts and dtypes.
ratings_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59047 entries, 1 to 209171
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rating         59047 non-null  float64
 1   count          59047 non-null  int64  
 2   weighted_mean  59047 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.8 MB


In [67]:
# Materialize the movies Arrow table as a DataFrame and report its columns,
# null counts and dtypes.
movies_df0 = movies.to_pandas()
movies_df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [71]:
# Keep only the movies whose movieId appears in the index of ratings_df1
# (i.e. movies that have ratings), then index the result by movieId so it
# lines up with ratings_df1.
# Series.isin performs a vectorized membership test; checking `x in <list>`
# row by row would scan the whole list for every movie.
movies_n_ratings_list = ratings_df1.index.to_list()
movies_df1 = (
    movies_df0[movies_df0['movieId'].isin(movies_n_ratings_list)]
    .set_index('movieId')
)

In [74]:
# Place the rating statistics and the movie metadata side by side, aligned on
# their movieId index, then order the rows from highest to lowest weighted mean.
movies_ratings_df = pd.concat([ratings_df1, movies_df1], axis=1)
movies_ratings_df = movies_ratings_df.sort_values(by='weighted_mean', ascending=False)

In [77]:
# Show the 20 movies with the highest weighted mean rating.
movies_ratings_df.head(20)

Unnamed: 0_level_0,rating,count,weighted_mean,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
318,4.413576,81482,4.207034,"Shawshank Redemption, The (1994)",Crime|Drama
858,4.324336,52498,4.069335,"Godfather, The (1972)",Crime|Drama
50,4.284353,55366,4.05089,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
527,4.247579,60411,4.03867,Schindler's List (1993),Drama|War
296,4.188912,79672,4.032457,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2959,4.228311,58773,4.021067,Fight Club (1999),Action|Crime|Drama|Thriller
593,4.151342,74127,3.99561,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
2571,4.154099,72674,3.995345,"Matrix, The (1999)",Action|Sci-Fi|Thriller
260,4.120189,68717,3.963778,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1196,4.144122,57361,3.958881,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
