In [1]:
import statistics
from collections import defaultdict

import pandas as pd
import pyarrow.parquet as pq

In [2]:
def calc_rating_avg_n_median(ratings_path, chunksize=10**6):
    """Compute the average, median and number of ratings for every movie.

    The Parquet file is decoded batch by batch and only the ``movie_id`` and
    ``rating`` columns are read. An exact median needs every individual
    rating, so the ratings themselves are still collected in memory per
    movie; batching only bounds the size of the decoded intermediate frames.

    Missing ratings (NaN/null) are dropped before anything is accumulated, so
    the three statistics always describe the same observations. A movie
    without any non-missing rating does not appear in the result.

    Args:
        ratings_path: Path of a Parquet file containing at least the columns
            ``movie_id`` and ``rating``.
        chunksize: Maximum number of rows decoded per batch.

    Returns:
        tuple: ``(avg_ratings, median_ratings, count_ratings)``, three dicts
        keyed by ``movie_id`` that share the same keys in the same order.
    """
    ratings_by_movie = defaultdict(list)

    reader = pq.ParquetFile(ratings_path)
    batches = reader.iter_batches(
        batch_size=chunksize,
        columns=["movie_id", "rating"],  # skip decoding columns we never use
    )
    for batch in batches:
        # Drop missing ratings up front: a NaN inside the collected list would
        # make the sort order (and therefore the median) undefined.
        chunk = batch.to_pandas().dropna(subset=["rating"])
        for movie_id, ratings in chunk.groupby("movie_id")["rating"]:
            # tolist() yields plain Python numbers, which keeps the stdlib
            # statistics below independent of the column dtype.
            ratings_by_movie[movie_id].extend(ratings.tolist())

    # Every rating is already held per movie, so all three statistics are
    # derived from the same list in a single pass.
    avg_ratings = {}
    median_ratings = {}
    count_ratings = {}
    for movie_id, ratings in ratings_by_movie.items():
        count_ratings[movie_id] = len(ratings)
        avg_ratings[movie_id] = sum(ratings) / len(ratings)
        median_ratings[movie_id] = statistics.median(ratings)

    return avg_ratings, median_ratings, count_ratings


def update_movie_titles(movie_titles_path, avg_ratings, median_ratings, view_counts):
    """Attach per-movie rating statistics to the movie titles table.

    The statistics are looked up by ``movie_id`` instead of being paired up
    by position, so the three mappings do not need to share the same key
    order. The keys of ``avg_ratings`` define which movies receive
    statistics; a movie absent from ``median_ratings`` or ``view_counts``
    gets a missing value in that column.

    Args:
        movie_titles_path: Path of the Parquet file holding the movie titles;
            it must contain a ``movie_id`` column.
        avg_ratings: Mapping ``movie_id -> average rating``.
        median_ratings: Mapping ``movie_id -> median rating``.
        view_counts: Mapping ``movie_id -> number of ratings``.

    Returns:
        pandas.DataFrame: Every row of the titles table (left join on
        ``movie_id``) with the columns ``avg_rating``, ``median_rating`` and
        ``view_count`` appended. Movies without statistics hold missing
        values in those columns.
    """
    df_movies = pd.read_parquet(movie_titles_path)

    movie_ids = list(avg_ratings)
    df_stats = pd.DataFrame({
        "movie_id": movie_ids,
        "avg_rating": [avg_ratings[movie_id] for movie_id in movie_ids],
        "median_rating": [median_ratings.get(movie_id) for movie_id in movie_ids],
        "view_count": [view_counts.get(movie_id) for movie_id in movie_ids],
    })

    # Left join keeps movies that have no ratings at all.
    return df_movies.merge(df_stats, on="movie_id", how="left")

# Derive the per-movie rating statistics from the full ratings file.
avg_ratings, median_ratings, view_counts = calc_rating_avg_n_median(
    "prod_data/all_ratings.parquet"
)

# Join those statistics onto the movie titles table.
updated_movies = update_movie_titles(
    movie_titles_path="prod_data/movie_titles.parquet",
    avg_ratings=avg_ratings,
    median_ratings=median_ratings,
    view_counts=view_counts,
)

Note: the code above is not yet organized as proper Jupyter notebook cells.

In [3]:
# Preview the first rows of the merged frame to check the added statistic columns.
updated_movies.head()

Unnamed: 0,movie_id,year,title,avg_rating,median_rating,view_count
0,1,2003,Dinosaur Planet,3.749543,4.0,547
1,2,2004,Isle of Man TT 2004 Review,3.558621,4.0,145
2,3,1997,Character,3.641153,4.0,2012
3,4,1994,Paula Abdul's Get Up & Dance,2.739437,3.0,142
4,5,2004,The Rise and Fall of ECW,3.919298,4.0,1140


In [4]:
# Count the distinct values in every column of the merged frame.
unique_counts = updated_movies.nunique()
unique_counts

movie_id         17770
year                94
title            17359
avg_rating       16253
median_rating        9
view_count        6275
dtype: int64

In [5]:
# Inspect the dtype of each column of the merged frame.
updated_movies.dtypes

movie_id                  int32
year                      int16
title            string[python]
avg_rating              float64
median_rating           float64
view_count                int64
dtype: object

In [6]:
# Restrict the range check to the numeric columns of the merged frame.
numeric_cols = updated_movies.select_dtypes(
    include=['int16', 'int32', 'int64', 'float64']
).columns

# Print the largest and smallest value found in each numeric column.
for col in numeric_cols:
    max_value, min_value = updated_movies[col].max(), updated_movies[col].min()
    print(f"Column '{col}': Max = {max_value}, Min = {min_value}")


Column 'movie_id': Max = 17770, Min = 1
Column 'year': Max = 2005, Min = 1896
Column 'avg_rating': Max = 4.723269925683507, Min = 1.2878787878787878
Column 'median_rating': Max = 5.0, Min = 1.0
Column 'view_count': Max = 232944, Min = 3


In [7]:
# Persist the enriched titles table as a Snappy-compressed Parquet file.
updated_movies.to_parquet(
    "prod_data/movie_titles_modified.parquet",
    engine="pyarrow",
    compression="snappy",
)
