# Setup

In [None]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


FOLDERNAME = 'cs229_proj/'


assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/My Drive/cs229_proj


In [None]:
# Sanity check: print the current working directory and list its contents.
!pwd
!ls

/content/drive/My Drive/cs229_proj
 archive_data			     movies_embeddings_full.csv
 baseline_models_2.ipynb	     movies_full.csv
 baseline_models.ipynb		     movies_svd_full.csv
 boxd_scrape.ipynb		     movie_to_ebert_rating.json
 boxd_scrape_jerry.ipynb	     movie_to_jesse_rating.json
 collect_movie_posters.ipynb	     poster_dataset.py
'Copy of more_models.ipynb'	     __pycache__
 ebert_html			     recommendations.ipynb
 embeddings.json		    'related papers'
 html				     resnet_transfer_learning
 image_embeddings_playground.ipynb   rotten_tomatoes_movies.csv
 imagenet_classes.txt		     SampleMoviePosters
 jesse_html			    '*submission code*'
 letterboxd_posters		     tmdb_5000_credits.csv
 mlp.model			     tmdb_5000_movies.csv
 more_models.ipynb		     user_ratings_full.csv
 MovieGenre.csv			     user_to_input_and_holdout_dicts.json
 movieid_to_data.json		     user_to_rating.csv
 movie_industry.csv		     user_to_rating_dict.json


In [None]:
from IPython.display import Image
import json
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

import torch
from torch import nn

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer

# Load Files

In [None]:
# Locations of the two CSV exports on the mounted Drive.
movies_full_path = '/content/drive/My Drive/cs229_proj/movies_full.csv'
user_ratings_full_path = '/content/drive/My Drive/cs229_proj/user_ratings_full.csv'

# Read the movie table first, then the ratings table.
movies_full_df, ratings_full_df = (
    pd.read_csv(csv_path)
    for csv_path in (movies_full_path, user_ratings_full_path)
)

In [None]:
# Report (rows, columns) for the movie table, then for the ratings table.
for frame in (movies_full_df, ratings_full_df):
    print(frame.shape)

(2220, 55)
(392551, 3)


In [None]:
# Hard-coded list of column names; presumably copied from the output of
# print(movies_full_df.columns) -- confirm it still matches the CSV.
all_columns = [
    'movie_id', 'poster_path', 'title', 'year',
    # the ten columns named '1' through '10'
    *map(str, range(1, 11)),
    'tmdb_budget', 'imdb_budget', 'tmdb_revenue', 'imdb_revenue',
    'tmdb_vote_average', 'tmdb_vote_count',
    'imdb_vote_average', 'imdb_vote_count',
    'tmdb_popularity', 'tmdb_runtime', 'imdb_runtime',
    'main_genre', 'genres', 'director', 'writer', 'main_actor',
    'mpaa_rating', 'overview', 'tagline', 'keywords', 'release_date',
    'main_prod_company', 'production_companies',
    'country', 'production_countries',
    'original_language', 'spoken_languages',
    'rt_info', 'critics_consensus', 'actors',
    'tm_status', 'tm_rating', 'tm_count',
    'audience_status', 'audience_rating', 'audience_count',
    'tm_top_critics_count', 'tm_fresh_critics_count',
    'tm_rotten_critics_count',
    'ebert_rating', 'boxd_vote_average',
]
print(len(all_columns))

55


# Baseline for Recommendation Model

## Create Input and Hold-out Sets for Each User

In [None]:
def get_input_and_holdout(movieid_lst, user_rating_lst, frac=0.8, rng=None):
    """Randomly split one user's ratings into an input dict and a hold-out dict.

    Up to 1000 random hold-out samples are drawn; the first one whose mean
    rating is within 0.5 of the user's overall mean rating is accepted.

    Args:
        movieid_lst: movie ids rated by the user.
        user_rating_lst: ratings, aligned index-by-index with ``movieid_lst``.
        frac: fraction of the ratings kept as input; the hold-out set gets the
            remaining ``(1 - frac)`` share, rounded down.
        rng: optional object with a ``sample(population, k)`` method (e.g. a
            ``random.Random`` instance) for reproducible splits. Defaults to
            the global ``random`` module.

    Returns:
        ``(input_dict, holdout_dict)``, each mapping movie id -> rating, or
        ``(None, None)`` when the user has 20 or fewer ratings, when nothing
        would be held out, or when no acceptable sample was found.
    """
    n = len(movieid_lst)
    # Guard first so an empty rating list never reaches the mean computation.
    if n <= 20:
        return None, None

    # 1 - 0.8 is 0.19999999999999996 in floating point, so a bare int() turned
    # e.g. 25 ratings into 4 held-out movies instead of 5. The small epsilon
    # absorbs that representation error while still rounding down.
    n_holdout_goal = int((1 - frac) * n + 1e-9)
    if n_holdout_goal == 0:
        return None, None

    ratings = np.asarray(user_rating_lst)
    avg_rating = ratings.mean()
    sampler = random if rng is None else rng
    all_indices = range(n)

    for _ in range(1000):
        cur_indices = sampler.sample(all_indices, n_holdout_goal)
        if abs(ratings[cur_indices].mean() - avg_rating) < 0.5:
            # Set membership keeps the partition pass linear in n.
            holdout_indices = set(cur_indices)
            input_dict = {}
            holdout_dict = {}
            for j, (cur_movieid, cur_rating) in enumerate(zip(movieid_lst, user_rating_lst)):
                target = holdout_dict if j in holdout_indices else input_dict
                target[cur_movieid] = cur_rating
            return input_dict, holdout_dict
    return None, None

In [None]:
# Build an input/hold-out split of the ratings for every user.
#
# `users_set` was never defined in the cells above (it only existed as leftover
# kernel state), so the users are taken directly from ratings_full_df. Grouping
# once also avoids re-scanning the whole ratings table for each user.
# Rows with a missing user_id are skipped by groupby.
random.seed(229)  # get_input_and_holdout samples via `random`; seed it so the split is reproducible

count = 0
bruh_count = 0  # users for whom no split was produced
user_to_dicts = {}
for user_id, user_df in ratings_full_df.groupby('user_id', sort=False):
    count += 1
    user_movieid = user_df['movie_id'].to_list()
    user_ratings = user_df['rating_val'].to_list()
    input_dict, holdout_dict = get_input_and_holdout(user_movieid, user_ratings)
    if input_dict is None:
        bruh_count += 1
    else:
        user_to_dicts[user_id] = [input_dict, holdout_dict]
    # Progress: users processed so far, and how many of them got no split.
    if count % 100 == 0:
        print(count, bruh_count)

100 15
200 36
300 54
400 66
500 82
600 96
700 110
800 122
900 132
1000 151
1100 176
1200 194
1300 213
1400 227
1500 245
1600 261
1700 271
1800 291
1900 302
2000 319
2100 332
2200 348
2300 368
2400 390
2500 405
2600 419
2700 431
2800 440
2900 447
3000 466
3100 485
3200 495
3300 511
3400 529
3500 539
3600 551
3700 566
3800 580
3900 591
4000 602
4100 619
4200 633
4300 645
4400 661
4500 684
4600 698
4700 717
4800 730
4900 746
5000 763
5100 784
5200 805


In [None]:
# One-off export of user_to_dicts to JSON. Left commented out, presumably so a
# re-run does not overwrite the saved split file -- confirm before enabling.
# with open("user_to_input_and_holdout_dicts.json", "w") as outfile:
#     json.dump(user_to_dicts, outfile)

## Baseline Model

In [None]:
# Reload the saved per-user splits; this replaces any user_to_dicts built in memory.
user_to_dicts_file_path = '/content/drive/My Drive/cs229_proj/user_to_input_and_holdout_dicts.json'
user_to_dicts = {}  # stays empty if the file cannot be opened
with open(user_to_dicts_file_path, 'r') as splits_file:
    user_to_dicts = json.loads(splits_file.read())

In [None]:
# Share of users that have a stored input/hold-out split.
# NOTE(review): 5227 is presumably the total number of users in the ratings
# data -- confirm, and consider deriving it from ratings_full_df instead.
total_users = 5227
print(len(user_to_dicts) / total_users)

0.845035393150947


In [None]:
# Raw columns, in file order.
movieid_lst = movies_full_df['movie_id'].to_list()
boxd_avg_lst = movies_full_df['boxd_vote_average'].to_list()

# Rank movies from highest to lowest boxd_vote_average; ties are broken by
# larger movie_id first, matching the previous reversed tuple sort.
# Sorting (average, id) tuples with Python's sorted() silently produces an
# undefined order if any average is NaN, and the zip(*...) unpacking raised on
# an empty frame. sort_values handles both: NaN averages are placed last and an
# empty frame yields two empty lists.
movies_ranked_df = movies_full_df.sort_values(
    ['boxd_vote_average', 'movie_id'], ascending=False, na_position='last'
)
boxd_avg_lst_sorted = movies_ranked_df['boxd_vote_average'].to_list()
movieid_lst_sorted = movies_ranked_df['movie_id'].to_list()

In [None]:
def get_recommendations_score(input_dict, holdout_dict, movieid_lst, top_k=100):
    """Score a ranked movie list against one user's hold-out ratings.

    Walks ``movieid_lst`` in order, skipping movies present in ``input_dict``,
    and collects the first ``top_k`` remaining movies as recommendations.

    Args:
        input_dict: movie id -> rating for the movies used as input.
        holdout_dict: movie id -> rating for the held-out movies.
        movieid_lst: candidate movie ids, best first.
        top_k: maximum number of recommendations (previously fixed at 100).

    Returns:
        ``(score, recommendations, n_hits)`` where ``score`` is the mean
        hold-out rating of the recommended hold-out movies minus the mean of
        all hold-out ratings, ``recommendations`` is the list of recommended
        ids and ``n_hits`` is how many of them are in ``holdout_dict``.
        ``(None, None, None)`` when no recommended movie is in ``holdout_dict``.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # NOTE(review): dicts loaded from JSON have string keys; if movieid_lst
    # holds non-string ids the membership tests below never match -- confirm
    # the id types agree.
    recommendations = []
    holdout_ratings_chosen = []
    for movie_id in movieid_lst:
        if movie_id in input_dict:
            continue
        if movie_id in holdout_dict:
            holdout_ratings_chosen.append(holdout_dict[movie_id])
        recommendations.append(movie_id)
        if len(recommendations) >= top_k:
            break

    if not holdout_ratings_chosen:
        return None, None, None

    holdout_avg = np.mean(np.array(list(holdout_dict.values())))
    chosen_avg = np.mean(np.array(holdout_ratings_chosen))
    return chosen_avg - holdout_avg, recommendations, len(holdout_ratings_chosen)

In [None]:
# Score the ranked list for every user, in the order the users appear in
# user_to_dicts.
all_results = [
    get_recommendations_score(user_input, user_holdout, movieid_lst_sorted)
    for user_input, user_holdout in user_to_dicts.values()
]

# Keep only the users for whom a score was produced (score is None otherwise).
scored_results = [result for result in all_results if result[0] is not None]

scores = np.array([score for score, _, _ in scored_results])
holdout_counts = [n_hits for _, _, n_hits in scored_results]
recc_counts = [len(reccs) for _, reccs, _ in scored_results]

print("Percentage of users that had a hold-out movie in the top 100:", len(scores) / len(user_to_dicts))

print(np.mean(scores)) # Mean of scores for users with at least one hold-out movie
print(np.mean(holdout_counts))  # mean number of hold-out movies among the recommendations
print(np.mean(recc_counts))  # mean number of recommendations per scored user