In [41]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


In [42]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


In [43]:
# 檔案路徑
data_csv_path = "../Data/disney_review_dataset/disney_data.csv"
movies_path = "../Data/disney_review_dataset/disney_movie.csv"
model_path = "../recommender_models/recommender-v5.ckpt"

In [44]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [45]:
data.sort_values(by="timestamp", inplace=True)

In [46]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [47]:
random.sample(list(grp_by_train.groups), k=10)

[512, 405, 4673, 42, 821, 1564, 2358, 700, 3803, 746]

In [48]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [49]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [50]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred)

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]]


### Senario 1: Adventure/Fantasy 

In [51]:
list_movies=["The Jungle Book",
            "The Many Adventures of Winnie the Pooh",
            "The Rescuers",
            "Cenerentola", 
            "Peter pan"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['The Lion King',
  'Frozen',
  'Aladdin',
  'Brave',
  'Mary Poppins',
  'The Little Mermaid',
  'Spirited Away',
  'Tangled',
  'La Spada nella Roccia',
  'Lilo & Stitch',
  'The Aristocats',
  'Hercules',
  'Alice nel paese delle meraviglie',
  "The Emperor's New Groove",
  'Tron',
  'Oliver & Company'],
 [1.0,
  0.9710052,
  0.8980285,
  0.88993794,
  0.87414,
  0.8503206,
  0.82987887,
  0.8160051,
  0.7623366,
  0.7595384,
  0.7440562,
  0.72997916,
  0.6719828,
  0.6716829,
  0.6304037,
  0.62849647])

### Senario 2:  Action/Adventure

In [52]:
# list_movies = ["Spider-Man: The Return of the Green Goblin",
#                "Spider-Man",
#                "Spider-Man: The Venom Saga",
#                "Spider-Man 2: Bonus Material",
#                "X-Men: Evolution: Season 2"
# ]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Senario 3: Comedy

In [53]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

In [54]:
random_userId = random.choice(list(set(data.userId)))

In [55]:
user_input = list(data[(data.userId == random_userId) & (data.rating == 5)]['movieId'])
user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
count = sum(map(lambda x: x >= 0.95, test))
user_input

['Lilo & Stitch', 'Tron']

In [56]:
count = 0
for userId in data.userId.unique():
    user_input = list(data[(data.userId == userId) & (data.rating == 5)]['movieId'])
    user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
    if len(user_input) == 0:
        print(userId)
        continue
    elif len(user_input) >= 512:
        user_input = user_input[:511]
    top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
    count += sum(map(lambda x: x >= 0.95, test))
    

303
885
645
1485
1561
2227
684
2597
830
394
2834
1090
1182
836
301
1594
1564
491
407
127
785
2579
2467
2487
247
264
451
474
495
319
236
1554
1565
2305
234
3881
4065
222
45
571
4420
4161
1100
727
1595
1050
4449
1582
1065
359
1562
1054
4003
504
2346
511
3954
4103
265
4178
1331
268
3702
1338
220
347
56
203
87
2853
262
489
209
567
388
1082
389
219
1512
162
228
643
434
414
235
217
705
762
4469
786
227
771
253
1350
1335
503
957
361
2620
1358
777
409
32
778
1603
2488
1319
772
199
734
37
4778
4844
1337
1307
125
1366
4676
271
4837
2840
4843
4729
128
413
259
4140
4810
2622
4833
4824
4842
4175
5121
4172
1286
5196
5170
5183
5201
5087
5050
5148
512
507
5186
5199
100
358
62
502
912
4399
4832
787
189
839
122
4929
4113
4960
665
4724
700
2379
880
1352
2861
5195
286
2616
5192
270
4979
756
826
5101
42
1070
2858
4571
4704
911
1019
229
1064
363
4430
879
41
2614
5006
57
877
305
2485
126
2322
348
993
4743
1621
67
1087
460
5091
24
1573
5488
115
730
2868
4122
244
1414
4755
698
4746
404
2352
482
837
1038
77
760

In [57]:
count

1041

In [58]:
count / (len(data.userId.unique())*30)

0.04242053789731051

In [59]:
len(data.userId.unique())

818

In [60]:
# for movie in user_input:
# movies[(movies.movieId==14621)]['title']