In [1]:
import pandas as pd
import numpy as np

from utils.utils import get_recs_for_all_users, get_users_faves, sort_probs, get_helper_vals, calc_personalization, mapk, mean_precision

In [2]:
# Topic count handed to sort_probs further down.
NUM_TOPICS = 10
# Movie count. Not referenced again in the visible cells (get_helper_vals
# later reports the same figure) -- TODO confirm it is still needed.
NUM_MOVIES = 1056

In [3]:
# Load the per-document topic assignments exported from Julia: one row per
# doc_id with its most probable topic and that topic's probability.
julia_out = pd.read_csv('cache/julia_out.csv')

In [4]:
# Preview the raw Julia output (topic numbers are still 1-indexed here).
julia_out

Unnamed: 0,doc_id,highest_prob_topic,highest_prob
0,1,10,0.295301
1,2,4,0.272665
2,3,5,0.249069
3,4,1,0.523753
4,5,2,0.227720
...,...,...,...
1051,1052,7,0.304045
1052,1053,5,0.398105
1053,1054,4,0.317217
1054,1055,1,0.220504


In [5]:
# Reshape the Julia output into the movie -> topic table used downstream.
# Built as a single chain starting from `julia_out`, so re-running the cell
# always yields the same frame and `julia_out` itself is left untouched.
movie_topics_probs = (
    julia_out
    .rename(columns={'highest_prob_topic': 'Topic', 'highest_prob': 'Probability'})
    .drop(columns='doc_id')
    # Julia numbers topics from 1; shift to 0-based ids with a vectorized
    # subtraction rather than a per-element Python lambda.
    .assign(Topic=lambda df: df['Topic'] - 1)
)

In [6]:
# Check the renamed columns and the 0-based topic ids.
movie_topics_probs

Unnamed: 0,Topic,Probability
0,9,0.295301
1,3,0.272665
2,4,0.249069
3,0,0.523753
4,1,0.227720
...,...,...
1051,6,0.304045
1052,4,0.398105
1053,3,0.317217
1054,0,0.220504


In [7]:
# Movie ratings data; the columns used in this cell are `title` and `movieId`.
movies_ratings = pd.read_csv('./ml_netflix.csv')
# One row per distinct title (the first occurrence is kept), re-indexed from 0
# so that row position can later be matched against the topic table.
unique_movies = (
    movies_ratings[['title', 'movieId']]
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

In [8]:
# Attach a title/movieId to every topic row by row position: row i of the
# topic table is paired with row i of `unique_movies` (inner join on index).
# NOTE(review): this assumes the Julia documents are in the same order as
# `unique_movies` -- confirm against the export step.
# Only Topic/Probability are taken from the left side, so re-running this cell
# once `movie_topics_probs` already carries title/movieId does not produce
# title_x/title_y columns and break the merge below.
movie_topics_probs = movie_topics_probs[['Topic', 'Probability']].merge(
    unique_movies, left_index=True, right_index=True, validate='one_to_one'
)
# Left join keeps every rating row; ratings whose movie has no topic row get
# NaN in Topic/Probability. `validate` guards against duplicated
# (title, movieId) keys silently multiplying rating rows.
movies_ratings_probs = movies_ratings.merge(
    movie_topics_probs, how='left', on=['title', 'movieId'], validate='many_to_one'
)

In [9]:
# Inspect the topic table now that each row carries its title and movieId.
movie_topics_probs

Unnamed: 0,Topic,Probability,title,movieId
0,9,0.295301,Grown Ups,0
1,3,0.272665,Dark Skies,1
2,4,0.249069,Jaws,2
3,0,0.523753,Jaws 2,3
4,1,0.227720,Jaws: The Revenge,4
...,...,...,...,...
1051,6,0.304045,Young Adult,1051
1052,4,0.398105,"Yours, Mine and Ours",1052
1053,3,0.317217,Zodiac,1053
1054,0,0.220504,Zombieland,1054


In [10]:
# Discard every row that has a missing value in any column (this includes
# ratings left without a Topic/Probability by the left merge). The name is
# rebound to the filtered frame instead of mutating the frame in place.
movies_ratings_probs = movies_ratings_probs.dropna()

In [11]:
# Cast Topic to a 32-bit integer column. Presumably needed because the left
# merge turned Topic into floats when it introduced NaN -- TODO confirm.
movies_ratings_probs = movies_ratings_probs.astype({'Topic': 'int32'})

In [12]:
# Find all 4- and 5-star ratings. Judging by the variable names, the helper
# returns the 5-star frame first and the 4-star frame second -- TODO confirm
# in utils.utils.
user_5_df, user_4_df = get_users_faves(movies_ratings_probs)

In [13]:
# Split the topic table by topic. The displayed result is a list of frames,
# one per topic, each ordered by descending Probability.
# int() is a no-op here because NUM_TOPICS is already an int literal.
per_topic = sort_probs(int(NUM_TOPICS), movie_topics_probs)
per_topic

[    Topic  Probability                    title  movieId
 0       0     0.523753                   Jaws 2        3
 1       0     0.496216                  My Girl       59
 2       0     0.487854        Maps to the Stars      407
 3       0     0.467596              Blue Streak      584
 4       0     0.448317               The Square      524
 ..    ...          ...                      ...      ...
 95      0     0.187782            Step Brothers      885
 96      0     0.185598           Mansfield Park      770
 97      0     0.180065  Tom Segura: Disgraceful      440
 98      0     0.178359    Employee of the Month      645
 99      0     0.165741            Into the Wild      235
 
 [100 rows x 4 columns],
     Topic  Probability                           title  movieId
 0       1     0.541840                      Free Willy      671
 1       1     0.513821      The Twilight Saga: Eclipse       78
 2       1     0.479971  Jay and Silent Bob Strike Back      728
 3       1     0.

In [14]:
# Derive the movie and user counts from the merged ratings frame (see
# utils.utils.get_helper_vals for how each count is defined).
num_movies, num_users = get_helper_vals(movies_ratings_probs)

In [15]:
# Show the movie count returned by get_helper_vals.
num_movies

1056

In [16]:
# Show the user count returned by get_helper_vals.
num_users

608

In [17]:
# Build one recommendation list per user from the per-topic rankings and the
# users' 4/5-star favourites.
# NOTE(review): in the output every list starts with a number equal to its
# own position (0, 1, 2, ...), presumably the user index rather than a movie
# id -- confirm that the metric helpers below do not score it as a movie.
recommendations = get_recs_for_all_users(num_users, per_topic, user_5_df, user_4_df)
recommendations

[[0, 282, 837, 172, 651],
 [1, 320, 358, 588, 645, 594, 747, 739, 421, 30, 859, 1045, 32, 636, 382, 562],
 [2, 910, 530, 727, 17, 771, 46, 493, 770, 426],
 [3,
  20,
  530,
  214,
  321,
  279,
  870,
  200,
  135,
  86,
  245,
  834,
  83,
  30,
  639,
  713,
  299,
  37,
  437],
 [4, 733, 175, 784, 660, 668, 576, 898, 859, 903, 752, 299, 128, 37],
 [5, 10, 785, 827, 945],
 [6, 321, 820],
 [7, 321, 828, 452, 791],
 [8, 94, 880, 483, 86, 316, 679, 1045, 32, 927, 730, 15],
 [9, 248, 734],
 [10, 214, 781, 245, 36, 889, 61],
 [11,
  246,
  624,
  91,
  644,
  733,
  200,
  944,
  193,
  135,
  483,
  486,
  945,
  83,
  30,
  1045,
  32,
  752,
  713,
  1027,
  299,
  128,
  37,
  40,
  339],
 [12, 170],
 [13, 327, 870, 784, 668, 698, 1045, 37],
 [14, 944, 1014, 483, 1045, 986],
 [15, 279, 197, 483, 535, 86, 784, 1045, 324],
 [16, 327, 622, 72, 1038, 245, 958, 203, 413, 40],
 [17, 20, 296, 248, 279, 784, 668, 889, 860, 612, 251],
 [18, 327, 832, 668, 442],
 [19, 1001, 579, 119, 345, 282, 

In [18]:
# Personalization score of the recommendation lists. The helper prints the
# value and also returns it, so it appears twice in the cell output.
calc_personalization(recommendations, num_movies, num_users)

Personalization: 0.9475353541452666


0.9475353541452666

In [19]:
# MAP@K of the recommendations against the users' 4/5-star favourites. The
# helper prints the value and also returns it.
mapk(recommendations, num_users, user_5_df, user_4_df)

MAP@K: 0.00030209144697672803


0.00030209144697672803

In [20]:
# Mean precision (the helper's own label stresses this is not mAP) of the
# recommendations against the users' 4/5-star favourites. Printed and returned.
mean_precision(recommendations, num_users, user_5_df, user_4_df)

MEAN PRECISION (NOT mAP): 0.005107659247341662


0.005107659247341662