In [1]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


In [3]:
# 檔案路徑
data_csv_path = "../Data/netflix_prize_dataset/netflix_data_25M.csv"
movies_path = "../Data/netflix_prize_dataset/netflix_movie.csv"
model_path = "../recommender_models/recommender-v4.ckpt"

In [4]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [5]:
data.sort_values(by="timestamp", inplace=True)

In [6]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [7]:
random.sample(list(grp_by_train.groups), k=10)

[11867, 1897, 5836, 15916, 14346, 24929, 550, 266, 26679, 7951]

In [8]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [9]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [10]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred)

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]]


### Senario 1: Adventure/Fantasy 

In [11]:
# list_movies=["The Jungle Book",
#             "The Many Adventures of Winnie the Pooh",
#             "The Rescuers",
#             "Cenerentola", 
#             "Peter pan"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Senario 2:  Action/Adventure

In [12]:
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['The Mouse on the Moon',
  'Peacemaker',
  'Fudoh: The New Generation',
  'Nelly and Monsieur Arnaud',
  'Time Code',
  "The Odyssey of Life: The Photographer's Secrets: Nova",
  'P.D. James: The Black Tower',
  "Violet's Visit",
  'A Decade Under the Influence',
  'Passion of Mind',
  'The Broken Hearts Club',
  'Iron Chef USA',
  'Fushigi Yugi: The Mysterious Play: Eikoden',
  'If I Die Before I Wake',
  "Fat Albert's Greatest Hits: The Ultimate Collection",
  'The Golden Bowl',
  'Lupin the 3rd: Farewell to Nostradamus',
  'El Dorado',
  'Rugrats All Grown Up: All Grown Up ... and Loving It!',
  'Korn: Deuce',
  'Embrace the Darkness 3',
  'Gilgamesh',
  "That's the Way I Like It",
  'Rakht',
  'Free Tibet',
  "The Hitchhiker's Guide to the Galaxy",
  'Moon Warriors',
  'A Cry in the Wild',
  'Cover Up',
  'Sherlock Holmes'],
 [1.0,
  0.99275494,
  0.9842185,
  0.9839769,
  0.97496986,
  0.9729098,
  0.9698894,
  0.96561843,
  0.9607576,
  0.9602094,
  0.95900613,
  0.9557531,
  0

### Senario 3: Comedy

In [13]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

In [14]:
random_userId = random.choice(list(set(data.userId)))

In [15]:
user_input = list(data[(data.userId == random_userId) & (data.rating == 5)]['movieId'])
user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
count = sum(map(lambda x: x >= 0.95, test))
user_input

['The Twilight Zone: Vol. 37',
 'Digging to China',
 'Only You',
 'Tycoon: A New Russian',
 "Devil's Playground",
 'The Indian in the Cupboard']

In [16]:
count = 0
for userId in data.userId.unique():
    user_input = list(data[(data.userId == userId) & (data.rating == 5)]['movieId'])
    user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
    if len(user_input) == 0:
        print(userId)
        continue
    elif len(user_input) >= 512:
        user_input = user_input[:511]
    top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
    count += sum(map(lambda x: x >= 0.95, test))
    

7548
997
15509
11580
6979
6694
15699
17001
4522
14531
14974
10996
27155
2753
6190
9018
8640
10459
1512
3174
14691
17268
16736
2894
12670
23670
11212
18094
2781
19320
24982
3941
18297
17934
5112
14296
13676
13725
24768
19398
5031
21592
22864
2913
12491
7049
13894
19132
7786
21084
9580
25028
9270
3263
7869
5966
14194
19032
19592
3060
15230
3010
11050
5127
9005
19018
6340
27281
6018
6984
1726
15867
8832
3611
3501
3290
23463
8185
25313
10422
8439
17707
9872
1520
12702
12915
8464
14998
27169
10896
23811
21174
3937
16443
25771
9186
20181
20203
17082
413
692
8249
7699
3784
19422
3024
5516
20522
17190
3741
13945
6900
4658
2242
25171
18230
9128
21201
1933
13190
8270
12173
22334
2276
26506
23405
7009


In [17]:
count

70319

In [18]:
count / (len(data.userId.unique())*30)

0.49892862210869876

In [19]:
len(data.userId.unique())

4698

In [20]:
# for movie in user_input:
# movies[(movies.movieId==14621)]['title']