In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from collections import defaultdict

def _ratings_years(date_col):
    """Return the calendar year of every entry of an Arrow ``date`` column.

    String columns are parsed as ``%Y-%m-%d``; date and timestamp columns are
    used as they are; anything else is cast to ``date32`` first.
    """
    col_type = date_col.type
    if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
        try:
            parsed = pc.strptime(date_col, format='%Y-%m-%d', unit='s')
        except pa.ArrowInvalid:
            # Best-effort fallback: let Arrow's own string -> date cast try
            # to parse values that do not match the expected format.
            parsed = date_col.cast(pa.date32())
    elif pa.types.is_date(col_type) or pa.types.is_timestamp(col_type):
        parsed = date_col
    else:
        parsed = date_col.cast(pa.date32())
    return pc.year(parsed)


def get_cold_start_recommendations(
    movies_path=r"parquet_creators_n_data/prod_data/movie_titles_modified.parquet",
    ratings_path=r"parquet_creators_n_data/prod_data/all_ratings.parquet",
    pool_size=20,
    n_recommendations=6,
):
    """Recommend popular, well-rated movies without using any user history.

    The ratings file is streamed twice in record batches so it never has to
    be held in memory as a whole: the first pass finds the most recent rating
    date, the second counts, per movie, the ratings made in that date's
    calendar year.  Each movie then gets
    ``popularity_score = 0.75 * last_year_views + 0.25 * view_count``; the
    ``pool_size`` most popular movies are kept and the ``n_recommendations``
    with the highest ``avg_rating`` among them are returned.

    Args:
        movies_path: Parquet file with one row per movie.  The columns
            ``movie_id``, ``title``, ``year``, ``avg_rating`` and
            ``view_count`` are read from it.
        ratings_path: Parquet file with one row per rating.  The columns
            ``date`` and ``movie_id`` are read from it.
        pool_size: Number of most popular movies to shortlist.
        n_recommendations: Number of shortlisted movies to return.

    Returns:
        A DataFrame with the columns ``movie_id``, ``title``, ``year``,
        ``avg_rating``, ``view_count`` and ``last_year_views``, ordered by
        descending ``avg_rating``.

    If the ratings file holds no usable date, ``last_year_views`` is 0 for
    every movie and the ranking rests on ``view_count`` alone.
    """
    movies = pd.read_parquet(movies_path)
    ratings_file = pq.ParquetFile(ratings_path)

    # Pass 1: newest rating date, compared in the column's raw representation.
    # NOTE(review): for a string column this is a lexicographic maximum,
    # which is only the latest date for ISO-style (YYYY-MM-DD) values.
    max_date_raw = None
    for batch in ratings_file.iter_batches(columns=['date']):
        batch_max = pc.max(batch.column('date')).as_py()
        if batch_max is None:
            # Empty or all-null batch.
            continue
        if max_date_raw is None or batch_max > max_date_raw:
            max_date_raw = batch_max

    max_date = pd.to_datetime(max_date_raw)
    if max_date is None or pd.isna(max_date):
        last_year = None
    else:
        last_year = max_date.year

    # Pass 2: per-movie rating counts restricted to that final year.
    last_year_counts = defaultdict(int)
    if last_year is not None:
        for batch in ratings_file.iter_batches(columns=['date', 'movie_id']):
            years = _ratings_years(batch.column('date'))
            table = pa.Table.from_arrays(
                [batch.column('movie_id'), years],
                names=['movie_id', 'year'],
            )

            filtered = table.filter(pc.equal(table['year'], last_year))
            if filtered.num_rows == 0:
                continue

            counts = filtered.group_by('movie_id').aggregate([('movie_id', 'count')])
            batch_ids = counts.column('movie_id').to_pylist()
            batch_counts = counts.column('movie_id_count').to_pylist()
            for movie_id, n_ratings in zip(batch_ids, batch_counts):
                # A null id cannot match any movie, so it is dropped here.
                if movie_id is not None:
                    last_year_counts[movie_id] += n_ratings

    # Attach the counts to the catalogue; movies without a rating in the
    # final year get 0.  Working on a copy leaves the loaded frame untouched.
    merged = movies.copy()
    merged['last_year_views'] = [
        last_year_counts.get(movie_id, 0) for movie_id in merged['movie_id']
    ]

    # Weighted view score as popularity: recent activity counts three times
    # as much as all-time views.
    merged['popularity_score'] = 0.75 * merged['last_year_views'] + 0.25 * merged['view_count']

    # Shortlist by popularity, then pick the best rated of the shortlist.
    shortlist = merged.nlargest(pool_size, 'popularity_score')
    recommendations = shortlist.nlargest(n_recommendations, 'avg_rating')

    return recommendations[['movie_id', 'title', 'year',
                            'avg_rating', 'view_count',
                            'last_year_views']]

# Build the cold-start list, then print a header line, a blank line, and the
# table rendered without its index column.
recs = get_cold_start_recommendations()
print("Cold Start Recommendations:\n", recs.to_string(index=False), sep="\n")

Cold Start Recommendations:

 movie_id                                                  title  year  avg_rating  view_count  last_year_views
    10947                                        The Incredibles  2004    4.308871      133457           132611
    16377                                         The Green Mile  1999    4.306941      181426           102670
    11283                                           Forrest Gump  1994    4.299910      181508           103281
     1905 Pirates of the Caribbean: The Curse of the Black Pearl  2003    4.153908      193941            96319
     2372                                   The Bourne Supremacy  2004    3.909958      137170           116118
     6287                                           Pretty Woman  1990    3.905047      193295           104916


-> Note: this is not actually a Jupyter notebook yet.