In [None]:
# import packages
import pandas as pd
from datetime import datetime

import ast

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the cleaned movie/rating tables and the supporting lookup sheets.
cleaned_data_path = 'cleaned_data/'
supporting_workbook = cleaned_data_path + 'movie_metadata_supporting.xlsx'

# CSV tables: movie metadata plus the full and the small rating sets
movies_cleaned = pd.read_csv(cleaned_data_path + 'movies_cleaned.csv')
ratings_cleaned = pd.read_csv(cleaned_data_path + 'ratings_cleaned.csv')
ratings_small_cleaned = pd.read_csv(cleaned_data_path + 'ratings_small_cleaned.csv')

# lookup sheets stored in the same Excel workbook
genre_map = pd.read_excel(supporting_workbook, sheet_name='genres')
keyword_map = pd.read_excel(supporting_workbook, sheet_name='keywords')

In [None]:
# Turn the stringified lists read from the CSV into real Python lists.
# Values that are already lists are left untouched so the cell can be re-run
# without ast.literal_eval raising on an already-parsed column.
movies_cleaned['genre_list'] = movies_cleaned['genre_list'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

def col_list_map(original_data, mapping_dict):
    """Translate every item of ``original_data`` through ``mapping_dict``.

    Parameters
    ----------
    original_data : iterable
        Items to look up (used below with the genre ids of one movie).
    mapping_dict : dict
        Lookup table from item to the mapped value.

    Returns
    -------
    list or None
        The mapped values in input order, or ``None`` when the input is not
        iterable, an item is unhashable, or an item is missing from
        ``mapping_dict``.
    """
    try:
        return [mapping_dict[item] for item in original_data]
    except (KeyError, TypeError):
        # KeyError: unknown item; TypeError: non-iterable input (None/NaN) or
        # unhashable item. Anything else is a real error and must propagate.
        return None

# get dummy variables of genres
# Build the id -> name lookup once instead of once per row inside the lambda.
genre_id_to_name = dict(zip(genre_map['id'], genre_map['name']))
movies_cleaned['genre_name_list'] = movies_cleaned['genre_list'].apply(
    lambda x: col_list_map(x, genre_id_to_name)
)

# one 0/1 indicator column per genre name, prefixed with "genre_"
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(movies_cleaned['genre_name_list']),
    columns = [f"genre_{c}" for c in mlb.classes_],
    index = movies_cleaned.index
)

# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate genre_* columns.
movies_cleaned = pd.concat(
    [movies_cleaned.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1
)

In [None]:
# Names of the genre indicator columns, taken from the dummy frame that created
# them. Filtering movies_cleaned.columns by substring would drop any genre whose
# name happens to contain "name" or "list" and pick up unrelated "genre*" columns.
genre_columns = genre_dummies.columns.tolist()

In [None]:
# user id used as the worked example in the per-user recommendation cells below
test_user_id = 42

# Top Weighted Rating Recommendation

In [None]:
# weighted_rating = (v*R + m*C) / (v + m), where:
# R = vote_average of the movie
# v = vote_count of the movie
# m = vote_count threshold a movie must exceed to qualify
# C = mean vote_average across all movies

In [None]:
print('All movie count:', movies_cleaned.shape[0])

# quantile of vote_count a movie has to exceed to be considered
threshold = 0.75
vote_count_cutoff = movies_cleaned['vote_count'].quantile(threshold)

# keep movies with enough votes and a known average vote
has_enough_votes = movies_cleaned['vote_count'] > vote_count_cutoff
has_vote_average = movies_cleaned['vote_average'].notna()
qualify_movies = movies_cleaned[has_enough_votes & has_vote_average].copy()

print('Threshold for voting_count:', vote_count_cutoff)
print('Qualified movie count:', qualify_movies.shape[0])

In [None]:
# m: vote_count value at the chosen quantile; C: mean vote_average over all movies
m = movies_cleaned.loc[:, 'vote_count'].quantile(q=threshold)
C = movies_cleaned.loc[:, 'vote_average'].mean()

def weighted_rating(x, m, C):
    """Return the weighted rating ``(v*R + m*C) / (v + m)``.

    ``x`` is indexed by ``'vote_count'`` (v) and ``'vote_average'`` (R);
    ``m`` and ``C`` are the vote-count threshold and the overall mean rating
    that enter the formula.
    """
    votes, average = x['vote_count'], x['vote_average']
    numerator = votes*average + m*C
    denominator = votes + m
    return numerator / denominator

# weighted_rating only uses column lookups and arithmetic, so it can be called
# on the whole frame at once: same values as the row-wise apply, without the
# per-row Python overhead, and it also works when qualify_movies is empty.
qualify_movies['weighted_rating'] = weighted_rating(qualify_movies, m, C)

In [None]:
# the 20 qualified movies with the highest weighted rating
used_columns = ['imdb_id', 'title', 'vote_count', 'vote_average']

top_20_movies = (
    qualify_movies
    .sort_values('weighted_rating', ascending=False)
    .head(20)
    .loc[:, used_columns + ['weighted_rating']]
    .reset_index(drop=True)
)
top_20_movies

In [None]:
# top rating movies by genre
# Collect one frame per genre and concatenate once at the end; growing a
# DataFrame with concat inside the loop copies all previous rows every time.
genre_frames = []

for col in genre_columns:

    mv_df = (
        qualify_movies[qualify_movies[col] == 1]
        .sort_values('weighted_rating', ascending = False)
        .head(10)[used_columns + ['weighted_rating']]
        # strip only the leading "genre_" so names containing "_" stay intact
        .assign(genre = col.split('_', 1)[1])
    )

    genre_frames.append(mv_df)

# pd.concat raises on an empty list, so fall back to an empty frame
top_10_genre_movies = pd.concat(genre_frames, ignore_index = True) if genre_frames else pd.DataFrame()

top_10_genre_movies

# Recommendation by Favorite Genre

In [None]:
# ratings of the test user, best first, enriched with title and genre flags
user_rating = (
    ratings_cleaned.loc[ratings_cleaned['user_id'] == test_user_id]
    .sort_values('rating', ascending=False)
    .merge(
        movies_cleaned[['imdb_id', 'title'] + genre_columns],
        on='imdb_id',
        how='left',
    )
)

In [None]:
# find the genres of the favorite movies
# The movies carrying the user's highest rating do not depend on the genre,
# so select them once instead of re-filtering inside the loop.
top_rated_movies = user_rating[user_rating['rating'] == user_rating['rating'].max()]

# a genre is a favorite when at least one top-rated movie is flagged with it
fav_genre = [genre for genre in genre_columns if top_rated_movies[genre].sum() > 0]

fav_genre

In [None]:
# top 10 qualified movies in a favorite genre that the user has not rated yet
in_fav_genre = qualify_movies[fav_genre].any(axis=1)
already_rated = qualify_movies['imdb_id'].isin(user_rating['imdb_id'])

(
    qualify_movies[in_fav_genre & ~already_rated]
    .sort_values('weighted_rating', ascending=False)
    .head(10)[used_columns + ['weighted_rating']]
)

# Collaborative Filtering

In [None]:
# create the user similarity matrix and get the top 10 similar users

# users x movies matrix of ratings (NaN where a user did not rate a movie)
rating_matrix = ratings_small_cleaned.pivot_table(index='user_id', columns='imdb_id', values='rating')

# cosine similarity between users, with missing ratings treated as 0
user_sim_matrix = pd.DataFrame(cosine_similarity(rating_matrix.fillna(0)),
                               index = rating_matrix.index,
                               columns = rating_matrix.index)

# Remove the test user explicitly before ranking. Slicing [1:11] after sorting
# assumes the user sorts first, which fails when another user ties at the top
# similarity or when the user's row is all zeros (self-similarity 0).
sim_users = (
    user_sim_matrix[test_user_id]
    .drop(test_user_id)
    .sort_values(ascending = False)
    .head(10)
    .index
)

In [None]:
# similarity-weighted average rating of every movie over the similar users
neighbor_ratings = rating_matrix[rating_matrix.index.isin(sim_users)]
neighbor_sims = user_sim_matrix.loc[user_sim_matrix.index.isin(sim_users), test_user_id]

# rating * similarity for every (neighbor, movie) pair that has a rating
weighted_sum = neighbor_ratings.mul(neighbor_sims, axis=0)

# similarity of the neighbor wherever a rating exists, NaN elsewhere
total_sim = neighbor_ratings.where(neighbor_ratings.isna(), 1).mul(neighbor_sims, axis=0)

weighted_score = (weighted_sum.sum() / total_sim.sum()).round(4)

# weighted score next to the mean similarity of the neighbors who rated the movie
score_sim = pd.DataFrame({'weighted_score': weighted_score, 'avg_sim': total_sim.mean()})

In [None]:
# drop the movies the test user already rated, then rank the remainder
test_id_watched = ratings_small_cleaned.loc[ratings_small_cleaned['user_id'] == test_user_id, 'imdb_id']
score_sim = (
    score_sim.loc[~score_sim.index.isin(test_id_watched)]
    .sort_values(['weighted_score', 'avg_sim'], ascending=[False, False])
)

In [None]:
# ids of the 10 best-scoring movies
cf_recommend_10 = score_sim.head(10).index

# metadata of those movies, joined with their scores and ranked
recommended_movies = movies_cleaned.loc[movies_cleaned['imdb_id'].isin(cf_recommend_10), used_columns]
unwatched_scores = score_sim[~score_sim.index.isin(test_id_watched)]

cf_recommend_10_df = (
    recommended_movies
    .merge(unwatched_scores, how='left', left_on='imdb_id', right_index=True)
    .sort_values(['weighted_score', 'avg_sim', 'vote_average'], ascending=[False, False, False])
    .reset_index(drop=True)
)

cf_recommend_10_df