### Create columns: average rating, view count, last-year views, etc.

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from collections import defaultdict

def get_enriched_movie():
    """Return the movie table enriched with last-year views and a popularity score.

    Reads the movie metadata from ``movie_titles_modified.parquet`` and makes
    two passes over ``all_ratings.parquet``, one record batch at a time:

    1. find the largest value in the ``date`` column;
    2. count, per ``movie_id``, the ratings whose year equals the calendar
       year of that largest date.

    Note that "last year" therefore means the calendar year of the most
    recent rating, not a rolling 12-month window.

    Returns:
        pandas.DataFrame: every row of the movie table plus two columns:
        ``last_year_views`` (int; 0 for movies with no rating in that year)
        and ``popularity_score`` (0.75 * last_year_views + 0.25 * view_count).
    """
    movies = pd.read_parquet(r"parquet_creators_n_data/prod_data/movie_titles_modified.parquet")
    ratings_file = pq.ParquetFile(r"parquet_creators_n_data/prod_data/all_ratings.parquet")

    # Pass 1: most recent rating date across all batches.
    max_date_raw = None
    for batch in ratings_file.iter_batches(columns=['date']):
        batch_max = pc.max(batch.column('date')).as_py()
        if batch_max and (max_date_raw is None or batch_max > max_date_raw):
            max_date_raw = batch_max

    last_year = pd.to_datetime(max_date_raw).year

    # Pass 2: per-movie rating counts restricted to that year.
    last_year_counts = defaultdict(int)
    for batch in ratings_file.iter_batches(columns=['date', 'movie_id']):
        date_col = batch.column('date')
        try:
            # strptime(unit='s') already yields timestamp[s]; no extra cast needed.
            dates = pc.strptime(date_col, format='%Y-%m-%d', unit='s')
        except pa.ArrowException:
            # The column is not a '%Y-%m-%d' string (e.g. it is already a
            # date/timestamp type); fall back to a plain cast. Only Arrow
            # errors are caught so interrupts and real bugs still surface.
            dates = date_col.cast(pa.date32())

        table = pa.Table.from_arrays(
            [batch.column('movie_id'), pc.year(dates)],
            names=['movie_id', 'year'],
        )
        filtered = table.filter(pc.equal(table['year'], last_year))
        if filtered.num_rows == 0:
            continue

        counts = filtered.group_by('movie_id').aggregate([('movie_id', 'count')])
        # Fold this batch's counts into the running totals without building
        # a pandas row object for every movie.
        batch_totals = zip(
            counts['movie_id'].to_pylist(),
            counts['movie_id_count'].to_pylist(),
        )
        for movie_id, n_ratings in batch_totals:
            last_year_counts[movie_id] += n_ratings

    last_year_df = pd.DataFrame({
        'movie_id': list(last_year_counts.keys()),
        'last_year_views': list(last_year_counts.values()),
    })

    # Left join keeps movies without any rating in the last year; they get 0.
    merged = movies.merge(last_year_df, on='movie_id', how='left')
    merged['last_year_views'] = merged['last_year_views'].fillna(0).astype(int)

    # Weighted view score as popularity: recent views count three times as
    # much as all-time views.
    merged['popularity_score'] = 0.75 * merged['last_year_views'] + 0.25 * merged['view_count']

    # NOTE: an earlier variant multiplied popularity_score by avg_rating and
    # returned only the 6 best-rated of the 20 most popular titles
    # (columns movie_id, title, year, avg_rating, view_count, last_year_views).
    # The full enriched table is returned instead.
    return merged


In [2]:
# Build the enriched movie table, ordered from most to least popular.
movie_db = get_enriched_movie().sort_values('popularity_score', ascending=False)

In [3]:
# Views accumulated before the most recent year: total views minus last-year views.
movie_db["view_old"] = movie_db["view_count"].sub(movie_db["last_year_views"])

### COLD START RECOMMENDATION (TASK 1):

In [4]:
# Min-max scale the average rating and the total view count to [0, 1], then
# apply the blend weights. The weights are hand-picked magic numbers: 1.2 for
# rating and 0.8 for views biases the ranking toward higher-rated titles
# (optional, for "Pirates of the Caribbean" lovers).
for source_col, target_col, weight in (
    ('avg_rating', 'norm_rating', 1.2),
    ('view_count', 'norm_views', 0.8),
):
    column = movie_db[source_col]
    scaled = (column - column.min()) / (column.max() - column.min())
    movie_db[target_col] = scaled * weight

In [5]:
# The cold-start score is the sum of the two weighted, normalised signals;
# rank the table by it, best first.
movie_db['combined_score'] = movie_db['norm_rating'].add(movie_db['norm_views'])
movie_db.sort_values('combined_score', ascending=False, inplace=True)

In [6]:
# Show the seven best cold-start picks.
preview_columns = ["title", "avg_rating", "view_count", "last_year_views", "combined_score", "year"]
movie_db[preview_columns].head(7)

Unnamed: 0,title,avg_rating,view_count,last_year_views,combined_score,year
16376,The Green Mile,4.306941,181426,102670,1.677643,1999
11282,Forrest Gump,4.29991,181508,103281,1.675469,1994
1904,Pirates of the Caribbean: The Curse of the Bla...,4.153908,193941,96319,1.667169,2003
14549,The Shawshank Redemption: Special Edition,4.593384,139660,76968,1.634261,1994
11520,Lord of the Rings: The Two Towers,4.460676,151245,70404,1.627692,2002
2451,Lord of the Rings: The Fellowship of the Ring,4.434708,149866,67498,1.613885,2001
14239,Lord of the Rings: The Return of the King,4.545121,134284,70560,1.598939,2003


### USER RECOMMENDATION (TASK 2):

In [7]:
# Load the complete ratings table into memory for the per-user queries.
ratings_path = r"parquet_creators_n_data/prod_data/all_ratings.parquet"
all_ratings = pd.read_parquet(ratings_path)

In [8]:
def recommend_movies(df, target_user, min_rating=4, top_n=5):
    """Recommend unseen movies liked by users who share the target user's taste.

    A "like" is a rating of at least ``min_rating``. The function finds every
    user who liked at least one movie the target user liked, collects the
    movies those users liked that the target user has not rated at all, and
    ranks them by number of likes, then by mean rating among those likes.

    Args:
        df: Ratings table with ``user_id``, ``movie_id`` and ``rating`` columns.
        target_user: ``user_id`` to produce recommendations for.
        min_rating: Minimum rating that counts as a like.
        top_n: Maximum number of movies to return.

    Returns:
        pandas.DataFrame: a single ``movie_id`` column with up to ``top_n``
        rows, best first. If no candidate movie exists, a one-row frame with
        a ``message`` column is returned instead.

    Raises:
        ValueError: if the target user has no rating >= ``min_rating``.
    """
    # Filter the target user's rows once; used for both likes and watch history.
    user_rows = df[df['user_id'] == target_user]
    liked_movies = user_rows.loc[user_rows['rating'] >= min_rating, 'movie_id'].unique()
    if len(liked_movies) == 0:
        # Raise (not return) so callers cannot mistake the error for a result.
        raise ValueError("Target user has no high-rated movies.")

    is_liked = df['rating'] >= min_rating

    # Users who liked at least one of the same movies. This includes the
    # target user, whose rows are dropped below by the watched-movies filter.
    matching_users = df.loc[is_liked & df['movie_id'].isin(liked_movies), 'user_id'].unique()

    # Exclude everything the target user has already rated, liked or not.
    watched_movies = user_rows['movie_id'].unique()

    recommendations = df[
        is_liked
        & df['user_id'].isin(matching_users)
        & ~df['movie_id'].isin(watched_movies)
    ]
    if recommendations.empty:
        return pd.DataFrame({"message": ["No recommendation :("]})

    rec_agg = recommendations.groupby('movie_id').agg(
        mean_rating=('rating', 'mean'),
        num_ratings=('rating', 'count'),
    ).reset_index()

    # Most-liked first; mean rating breaks ties.
    top_recommendations = rec_agg.sort_values(
        by=['num_ratings', 'mean_rating'],
        ascending=[False, False],
    ).head(top_n)

    return top_recommendations[["movie_id"]]


In [9]:

# Replace with the user_id you want recommendations for.
target_user_id = 885013
rec_movies = recommend_movies(df=all_ratings, target_user=target_user_id)

In [10]:
# Sanity check: list the titles behind ten of the target user's top-rated movies.
target_user_ratings = all_ratings.loc[all_ratings['user_id'] == target_user_id]
temp = target_user_ratings.sort_values(by='rating', ascending=False).head(10)
checkino = pd.merge(movie_db, temp, on='movie_id')
checkino.loc[:, ["title", "rating"]]

Unnamed: 0,title,rating
0,Family Guy: Vol. 2: Season 3,5
1,The Simpsons: Season 5,5
2,Dr. Strangelove,5
3,Dodgeball: A True Underdog Story,5
4,Angel: Season 2,5
5,The Boondock Saints,5
6,South Park: Passion of the Jew,5
7,Black Adder II,5
8,Columbo: Season 3,5
9,Steamboy,5


In [11]:
# Attach the movie metadata to the recommended movie ids (inner join on movie_id).
recommendino = pd.merge(movie_db, rec_movies, on='movie_id')
recommendino


Unnamed: 0,movie_id,year,title,avg_rating,median_rating,view_count,last_year_views,popularity_score,view_old,norm_rating,norm_views,combined_score
0,4306,1999,The Sixth Sense,4.325245,4.0,151292,79593,97517.75,71699,1.060968,0.519579,1.580547
1,6287,1990,Pretty Woman,3.905047,4.0,193295,104916,127010.75,88379,0.91419,0.663832,1.578022
2,3962,2003,Finding Nemo (Widescreen),4.415523,5.0,140979,69711,87528.0,71268,1.092502,0.48416,1.576663
3,14313,2000,The Patriot,3.783854,4.0,200832,113646,135442.5,87186,0.871857,0.689716,1.561573
4,6037,2002,The Bourne Identity,3.973455,4.0,158601,85211,103558.5,73390,0.938086,0.54468,1.482766


### COMPARE MOVIES (TASK 3):

In [12]:
# Full ranked table, including movie_id so titles can be looked up by id.
ranked_columns = ["title", "avg_rating", "view_count", "last_year_views", "combined_score", "year", "movie_id"]
movie_db[ranked_columns]

Unnamed: 0,title,avg_rating,view_count,last_year_views,combined_score,year,movie_id
16376,The Green Mile,4.306941,181426,102670,1.677643,1999,16377
11282,Forrest Gump,4.299910,181508,103281,1.675469,1994,11283
1904,Pirates of the Caribbean: The Curse of the Bla...,4.153908,193941,96319,1.667169,2003,1905
14549,The Shawshank Redemption: Special Edition,4.593384,139660,76968,1.634261,1994,14550
11520,Lord of the Rings: The Two Towers,4.460676,151245,70404,1.627692,2002,11521
...,...,...,...,...,...,...,...
17576,Absolution,1.400000,125,125,0.039584,2003,17577
17428,Vampire Assassins,1.396761,247,247,0.038871,2005,17429
11767,Alone in a Haunted House,1.375610,205,98,0.031339,2004,11768
16486,Zodiac Killer,1.346021,289,289,0.021292,2004,16487


In [13]:
# Assuming a film's revenue on the platform runs parallel to how much it is
# watched, the title with more views in the last year is the better pick.
def compare(movie_1_id, movie_2_id, movies=None):
    """Order two movies by their ``last_year_views``.

    Args:
        movie_1_id: ``movie_id`` of the first movie.
        movie_2_id: ``movie_id`` of the second movie.
        movies: Table with ``movie_id`` and ``last_year_views`` columns.
            Defaults to the notebook-level ``movie_db``.

    Returns:
        tuple: ``(better, worse)`` as DataFrame slices of ``movies``. The
        first movie is "better" only when its ``last_year_views`` is strictly
        greater; on a tie the second movie is returned first.

    Raises:
        IndexError: if either id is not present in ``movies``.
    """
    if movies is None:
        movies = movie_db

    movie_1 = movies[movies["movie_id"] == movie_1_id]
    movie_2 = movies[movies["movie_id"] == movie_2_id]

    # Fail with a readable message instead of "index 0 is out of bounds".
    for movie_id, rows in ((movie_1_id, movie_1), (movie_2_id, movie_2)):
        if rows.empty:
            raise IndexError(f"movie_id {movie_id!r} not found in the movie table")

    if movie_1["last_year_views"].values[0] > movie_2["last_year_views"].values[0]:
        return movie_1, movie_2
    return movie_2, movie_1

# 1905 = Pirates of the Caribbean, 11521 = Lord of the Rings: The Two Towers.
good_movie, bad_movie = compare(movie_1_id=1905, movie_2_id=11521)

In [14]:
# Movie with the higher last-year view count.
good_movie.loc[:, ["year", "title", "last_year_views", "combined_score"]]

Unnamed: 0,year,title,last_year_views,combined_score
1904,2003,Pirates of the Caribbean: The Curse of the Bla...,96319,1.667169


In [15]:
# Movie with the lower (or equal) last-year view count.
bad_movie.loc[:, ["year", "title", "last_year_views", "combined_score"]]

Unnamed: 0,year,title,last_year_views,combined_score
11520,2002,Lord of the Rings: The Two Towers,70404,1.627692


Son 1 yıldaki izlenmesi daha yüksek olan "Pirates of the Caribbean" tercih edilmelidir :(