In [None]:
# import packages
import pandas as pd
from datetime import datetime

import ast

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Load the cleaned movie table and the raw ratings / links tables.
orginal_data_path = 'original_data/'

movies_cleaned = pd.read_csv('movies_cleaned.csv')
ratings = pd.read_csv(f'{orginal_data_path}ratings.csv')
links = pd.read_csv(f'{orginal_data_path}links.csv')
# Currently not loaded:
# keywords = pd.read_csv(f'{orginal_data_path}keywords.csv')
# credits = pd.read_csv(f'{orginal_data_path}credits.csv')

# 'timestamp' is parsed as a number of seconds (unit='s') into datetimes.
ratings['date_time'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [None]:
# Every column whose name starts with 'genre', minus the first two matches
# (NOTE(review): presumably those two are not per-genre columns -- confirm).
genre_columns = [name for name in movies_cleaned.columns if name.startswith('genre')][2:]

Top Weighted-Rating Movie Recommendations by Genre

In [None]:
# weighted_rating = (v*R + m*C) / (v + m)
# R = vote_average of the movie
# v = vote_count of the movie
# m = vote_count threshold a movie must exceed to qualify (a quantile of vote_count)
# C = mean vote_average across all movies

In [None]:
# A movie qualifies when its vote_count is strictly above the chosen quantile
# of vote_count and its vote_average is not missing.
print('All movie count:', len(movies_cleaned))
threshold = 0.75
vote_count_cutoff = movies_cleaned['vote_count'].quantile(threshold)
enough_votes = movies_cleaned['vote_count'].gt(vote_count_cutoff)
has_average = movies_cleaned['vote_average'].notna()
qualify_movies = movies_cleaned.loc[enough_votes & has_average].copy()
print('Threshold for voting_count:', vote_count_cutoff)
print('Qualified movie count:', len(qualify_movies))

In [None]:
# Inputs of the weighted-rating formula, both taken over all movies:
# C is the mean vote_average, m is the `threshold` quantile of vote_count.
C = movies_cleaned['vote_average'].mean()
m = movies_cleaned['vote_count'].quantile(q=threshold)

def weighted_rating(x, m, C):
    """Return the weighted rating (v*R + m*C) / (v + m) for ``x``.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by 'vote_count' (v) and 'vote_average' (R),
        e.g. a DataFrame row.
    m : number
        Vote-count weight given to the prior ``C``.
    C : number
        Prior rating the result is pulled towards when ``v`` is small.

    Returns
    -------
    The blend of ``R`` and ``C`` weighted by ``v`` and ``m`` respectively.
    """
    votes = x['vote_count']
    average = x['vote_average']
    numerator = votes * average + m * C
    denominator = votes + m
    return numerator / denominator

# weighted_rating only does arithmetic on x['vote_count'] and x['vote_average'],
# so it is called once on the whole frame (column-wise) instead of once per row
# through apply(axis=1). The values are the same, it avoids a Python-level loop,
# and it still yields a Series when qualify_movies has no rows.
qualify_movies['weighted_rating'] = weighted_rating(qualify_movies, m, C)

In [None]:
# top 20 rating movies
# Columns shown in the recommendation tables.
used_columns = ['imdb_id', 'original_title', 'vote_count', 'vote_average', 'weighted_rating']

# The 20 qualified movies with the highest weighted rating.
top_20_movies = (
    qualify_movies
    .sort_values(by='weighted_rating', ascending=False)
    .iloc[:20]
    .loc[:, used_columns]
    .copy()
)
top_20_movies

In [None]:
# top rating movies by genre
# Top 10 qualified movies per genre, ranked by weighted rating.
# The per-genre frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all accumulated rows on every pass, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
genre_frames = []

for col in genre_columns:
    mv_df = (
        qualify_movies[qualify_movies[col] == 1]
        .sort_values('weighted_rating', ascending=False)
        .head(10)[used_columns]
        # assign() returns a new frame, so nothing is written onto a slice.
        # maxsplit=1 keeps the full genre name even if it contains '_'.
        .assign(genre=col.split('_', 1)[1])
    )
    genre_frames.append(mv_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
top_10_genre_movies = (
    pd.concat(genre_frames, ignore_index=True) if genre_frames else pd.DataFrame()
)

top_10_genre_movies

Top-Rated Movie Recommendations by the Genres of the User's Favorite Movies

In [None]:
# Value matched against ratings['userId'] to select the user whose ratings are analysed.
user_id = 1

In [None]:
# Ratings of the selected user, as an independent copy of the matching rows.
user_rating = ratings.loc[ratings['userId'].eq(user_id)].copy()
# Look up each movieId's imdbId through the links table.
imdb_by_movie_id = dict(zip(links['movieId'], links['imdbId']))
user_rating['imdb_id'] = user_rating['movieId'].map(imdb_by_movie_id)

def format_tt_id(num):
    """Format a numeric id as 'tt' followed by the number zero-padded to 7 digits.

    Parameters
    ----------
    num : int, integral float, or missing value (None / NaN / pd.NA)
        The numeric id. Floats are accepted because a pandas column holding
        at least one missing value stores its integers as floats, and the
        'd' format code rejects floats.

    Returns
    -------
    str or None
        e.g. 114709 -> 'tt0114709'; numbers longer than 7 digits are not
        truncated. Missing input returns None instead of raising, so a single
        unmapped id does not abort formatting of a whole column.
    """
    if pd.isna(num):
        return None
    # int() converts integral floats (114709.0) so the 'd' format code applies.
    return f"tt{int(num):07d}"

# Turn the numeric ids into 'tt'-prefixed strings so they can be joined on
# movies_cleaned['imdb_id'].
user_rating['imdb_id'] = user_rating['imdb_id'].apply(format_tt_id)

# Highest ratings first.
user_rating = user_rating.sort_values(by='rating', ascending=False)

# Left join: every rating row is kept and gains the title and genre columns
# of its movie when a matching imdb_id exists.
movie_info = movies_cleaned[['imdb_id', 'original_title'] + genre_columns]
user_rating = user_rating.merge(movie_info, how='left', on='imdb_id')

In [None]:
# Rows rated at or above the 0.75 quantile of this user's own ratings.
rating_cutoff = user_rating['rating'].quantile(0.75)
high_user_rating = user_rating.loc[user_rating['rating'].ge(rating_cutoff)]

In [None]:
# Sum of every genre column over the user's highly rated movies
# (presumably 0/1 indicator columns, so each sum is a movie count -- confirm).
genre_sums = {col: high_user_rating[col].sum() for col in genre_columns}

top_n = 3
sorted_vals = pd.Series(genre_sums).sort_values(ascending=False)

# The cutoff is the total of the top_n-th genre. Every genre at or above it is
# kept, so ties with the top_n-th genre can yield more than top_n genres.
# The position is clamped (and the empty case handled) so that having fewer
# than top_n genre columns no longer raises IndexError.
if sorted_vals.empty:
    cutoff = None
    top3_genres = sorted_vals
else:
    cutoff = sorted_vals.iloc[min(top_n, len(sorted_vals)) - 1]
    top3_genres = sorted_vals[sorted_vals >= cutoff]

top3_genres

In [None]:
# Build a per-genre table from the selected genre totals.
test = top3_genres.to_frame(name='origin')
# Each total divided by the sum of the distinct totals.
distinct_total = top3_genres.unique().sum()
test['weighted_avg'] = top3_genres / distinct_total
# Number of genres that share the same total.
same_total_counts = test['origin'].value_counts()
test['origin_count'] = test['origin'].map(same_total_counts)
# Genres with an identical total split that total's share equally.
test['weighted_avg_split'] = test['weighted_avg'] / test['origin_count']
# Scale the shares by 10.
test['movie_counts'] = test['weighted_avg_split'] * 10

In [None]:
# Display the per-genre table (origin, weighted_avg, origin_count, weighted_avg_split, movie_counts).
test