### Import Packages

In [255]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [256]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [257]:
# 檔案路徑
data_csv_path = "./Data/netflix_drop/netflix_data_25M_drop.csv"
movies_path = "./Data/netflix_drop/netflix_movie.csv"
# 檢查是否是最新的check point
model_path = "./recommender_models/netflix_drop_25M.ckpt" 
data = pd.read_csv(data_csv_path)
data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
data.sort_values(by="date", inplace=True)

In [258]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [259]:
random.sample(list(grp_by_train.groups), k=10) # Test

[592858, 368068, 630887, 656511, 590694, 427504, 262952, 228321, 16083, 315725]

### Model Loading

In [260]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [261]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [262]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [263]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
            "Discovering the Real World of Harry Potter",
            "Harry Potter and the Chamber of Secrets",
            "Harry Potter and the Prisoner of Azkaban"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Taboo',
  "Pooh's Heffalump Movie",
  'Doctor Who: Lost in Time: The Patrick Troughton Years',
  "The Avengers '63",
  'Merci Docteur Rey',
  'Dancing at Lughnasa',
  'Meat Loaf: Hits Out of Hell',
  'Cruel Intentions 3',
  'Freeway',
  'The Rutles',
  'Three Coins in the Fountain',
  'Blood: The Last Vampire',
  'Fat Girl',
  'Coming Through',
  'Doctor Who: The Mind Robber',
  'The Bravados',
  'In the Time of the Butterflies',
  'Gone Dark',
  'Fame',
  'Shine',
  'Backlash',
  'Pressure',
  'Undesirable',
  'Freejack',
  'Practical Magic',
  'Charlie',
  "God's Little Acre",
  'Framed',
  'Drawing Blood'],
 [5.0,
  4.979372,
  4.9453244,
  4.8748364,
  4.8657513,
  4.840513,
  4.822454,
  4.8016105,
  4.789255,
  4.7774515,
  4.763953,
  4.7594995,
  4.75428,
  4.7518263,
  4.7284074,
  4.722892,
  4.7124248,
  4.7099824,
  4.7090144,
  4.70881,
  4.708641,
  4.707289,
  4.7065783,
  4.697815,
  4.6975937,
  4.692459,
  4.6886234,
  4.6880264,
  4.686418,
  4.6764503],
 [3.33834

#### Senario 2:  Action/Adventure

In [264]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Mr. Jones',
  'The Black Scorpion',
  'The Secret Garden',
  'The Man from Laramie',
  'Samurai Shodown',
  'Virtual Sexuality',
  'Dumbo',
  "In God's Hands",
  'Making the Misfits',
  'Fifteen Minutes',
  'The Outer Limits: The Original Series: Season 1',
  'Zone of the Enders: Idolo',
  'Mary Reilly',
  'The Gumball Rally',
  'Woodrow Wilson: American Experience',
  'Mail Call: The Best of Season 2',
  'Lies',
  'The Heroic Trio',
  'Pink Flamingos',
  'The Curse of the Bambino',
  'Basic Training',
  'Hardball',
  'Frazetta',
  'Witness for the Prosecution',
  'Pride FC: Pride Fighting Legacy: Vol. 2',
  'The Big Clock',
  'The Fog of War',
  'Poirot: Lord Edgware Dies',
  'Spider-Man: The New Animated Series: Season 1'],
 [5.0,
  4.9919395,
  4.833957,
  4.817832,
  4.8147645,
  4.8067164,
  4.777156,
  4.772595,
  4.7408276,
  4.738285,
  4.7366643,
  4.682145,
  4.660408,
  4.657145,
  4.6536317,
  4.6457357,
  4.6412263,
  4.6358356,
  4.6255383,
  4.6236935,
  4.6215096,
  

#### Senario 3: Comedy

In [265]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [266]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

0.0 1
0.5 2
0.3333333333333333 3
0.5 4
0.6 5
0.6666666666666666 6
0.5714285714285714 7
0.5 8
0.5555555555555556 9
0.5 10


In [267]:
top_movie

['Sherlock Holmes: Dressed to Kill',
 'Dragon Ball: Red Ribbon Army Saga',
 'Trainspotting',
 'Iria: Zeiram the Animation',
 'Stealing Home',
 'A Man Called Sledge',
 'The Dream Is Alive: IMAX',
 'Europa Europa',
 'Reno 911: Season 1',
 'Brother to Brother',
 'The Paleface',
 'American Pimp',
 "Shintaro Katsu's Zatoichi: The Blind Swordsman",
 "Winnie the Pooh: ABC's",
 "Miss Spider's Sunny Patch Kids",
 "Breakin' All the Rules",
 'Megalodon',
 "Monty Python's Flying Circus",
 'Godannar',
 'Sammy Hagar: The Long Road to Cabo',
 'Top Fighter',
 'Wag the Dog',
 'Nightfall',
 'Gidget',
 'Defying Gravity',
 'Hi',
 'Dumbo',
 'Three Coins in the Fountain',
 'Door to Door',
 'Star Trek: The Next Generation: Season 4']

In [268]:
len(final_eval[final_eval.answer >= 3.5])

128

#### Test all userId

In [269]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    if len(user_input) > 10:
        if len(user_input) >= 511:
            user_input = user_input.sample(n = 511) # Random choose 511 rows
        answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
        user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
        top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
        final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
        final_eval['answer'] = answer
        final_eval['eval'] = evaluation
        final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
        recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
        for idx, elm in final_eval.iterrows():
            if elm['answer'] >= 3.5: relevant_recommend += 1
            recommend_item += 1
            precision_score[recommend_item] += relevant_recommend / recommend_item
            if relevant_item>0:
                recall_score[recommend_item] += relevant_recommend / relevant_item
            if recommend_item == 10: break
        print(userId, 'is done!')

510180 is done!
122223 is done!
204439 is done!
404067 is done!
261295 is done!
355883 is done!
640999 is done!
153083 is done!
228192 is done!
369086 is done!
530721 is done!
371560 is done!
28966 is done!
422071 is done!
1086 is done!
472452 is done!
471064 is done!
556780 is done!
211864 is done!
505751 is done!
64765 is done!
263571 is done!
593765 is done!
589368 is done!
486399 is done!
13891 is done!
591582 is done!
155728 is done!
230300 is done!
633086 is done!
371596 is done!
10268 is done!
423167 is done!
314751 is done!
436424 is done!
605806 is done!
481816 is done!
174788 is done!
535765 is done!
293922 is done!
326912 is done!
117768 is done!
290371 is done!
108871 is done!
170451 is done!
305283 is done!
248730 is done!
316729 is done!
141370 is done!
209780 is done!
100197 is done!
311144 is done!
629203 is done!
394194 is done!
307098 is done!
614672 is done!
265351 is done!
488182 is done!
105774 is done!
69801 is done!
532653 is done!
70513 is done!
298689 is done!


#### Precision Score

In [270]:
print(len(data.userId.unique()))
for i in range(1,11):
    print("k = ", i, precision_score[i] / len(data.userId.unique())) # total value / total number of userId

115086
k =  1 0.6099786246806735
k =  2 0.6037962914689884
k =  3 0.6022568629835454
k =  4 0.6015740402829188
k =  5 0.6016352988201726
k =  6 0.6010707934356601
k =  7 0.6006067512245042
k =  8 0.6004357610830162
k =  9 0.6006513003804222
k =  10 0.6004596562570919


#### Recall Score

In [271]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.017696252230459404
k =  2 0.03508446959190851
k =  3 0.052548655049305476
k =  4 0.07005535061045827
k =  5 0.08752246852555841
k =  6 0.10475778242736898
k =  7 0.12222239603541651
k =  8 0.13968025194071543
k =  9 0.15708007261604784
k =  10 0.17429959360657424


#### F1 Score

In [272]:
for i in range(1,11):
    prec_score = precision_score[i] / len(data.userId.unique())
    reca_score = recall_score[i] / len(data.userId.unique())
    print("k = ", i, (prec_score*reca_score/(prec_score+reca_score))*2) # total value / total number of userId

k =  1 0.034394671491897885
k =  2 0.06631557535892568
k =  3 0.09666316874993539
k =  4 0.12549623611352712
k =  5 0.1528143742982357
k =  6 0.1784196490633388
k =  7 0.20311188747154865
k =  8 0.22663749170794303
k =  9 0.2490337690719316
k =  10 0.27017392585160177


In [273]:
count / (len(data.userId.unique())*30) #old

2.8963847325767977e-06

In [274]:
len(data.userId.unique())

115086