In [85]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


In [86]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


In [87]:
# 檔案路徑
data_csv_path = "../Data/original_dataset/disney/disney_data.csv"
movies_path = "../Data/original_dataset/disney/disney_movie.csv"
model_path = "../recommender_models/recommender-v6.ckpt"

In [88]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [89]:
data.sort_values(by="timestamp", inplace=True)

In [90]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [91]:
random.sample(list(grp_by_train.groups), k=10)

[253, 641, 3057, 1785, 2, 4765, 5031, 4801, 203, 1160]

In [92]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [93]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [94]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred)

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]]


### Senario 1: Adventure/Fantasy 

In [95]:
list_movies=["The Jungle Book",
            "The Many Adventures of Winnie the Pooh",
            "The Rescuers",
            "Cenerentola", 
            "Peter pan"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Spirited Away',
  'Frozen',
  'The Lion King',
  'Lilo & Stitch',
  'Brave',
  'Tangled',
  'Aladdin',
  'Tron',
  'Mary Poppins',
  "The Emperor's New Groove",
  'The Little Mermaid',
  'Hercules',
  'Alice nel paese delle meraviglie',
  'Oliver & Company',
  'The Aristocats',
  'La Spada nella Roccia'],
 [1.0,
  0.96831423,
  0.92683405,
  0.8578233,
  0.8269952,
  0.8205167,
  0.7618849,
  0.7566088,
  0.7405396,
  0.72100204,
  0.720714,
  0.7038174,
  0.67914766,
  0.60724,
  0.6055815,
  0.5533501])

### Senario 2:  Action/Adventure

In [96]:
# list_movies = ["Spider-Man: The Return of the Green Goblin",
#                "Spider-Man",
#                "Spider-Man: The Venom Saga",
#                "Spider-Man 2: Bonus Material",
#                "X-Men: Evolution: Season 2"
# ]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Senario 3: Comedy

In [97]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

In [98]:
random_userId = random.choice(list(set(data.userId)))

In [99]:
user_input = list(data[(data.userId == random_userId) & (data.rating == 5)]['movieId'])
user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
count = sum(map(lambda x: x >= 0.95, test))
user_input

['The Rescuers']

In [100]:
count = 0
for userId in data.userId.unique():
    user_input = list(data[(data.userId == userId) & (data.rating == 5)]['movieId'])
    user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
    if len(user_input) == 0:
        print(userId)
        continue
    elif len(user_input) >= 512:
        user_input = user_input[:511]
    top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
    count += sum(map(lambda x: x >= 0.95, test))
    

1334
2075
1250
1252
1389
944
688
2557
2073
1449
303
646
2547
1532
1000
960
1244
2581
899
885
645
2545
822
473
617
1216
441
1074
181
1169
2314
23
2418
2016
2546
565
2129
2615
182
1485
2468
992
2196
1147
1204
659
1563
1145
2356
2577
1561
2575
2619
2553
2556
2361
1484
2608
2227
2012
1223
1144
1208
204
2609
684
2298
2387
1306
2597
2337
1904
830
149
1243
424
394
1040
2270
193
806
1143
47
1285
2684
2739
2834
2657
2572
1555
2819
2849
2688
2742
2827
2279
708
1090
1182
8
2522
435
836
301
1594
2067
897
150
194
630
452
2015
423
148
2304
589
1193
2574
2580
2364
2343
1181
1564
564
2423
491
1609
942
2750
2682
407
2864
2789
2689
2824
2772
2703
2787
1233
2813
1251
539
127
1158
2690
2765
1222
2805
2729
2691
785
2767
2579
2471
1901
6
207
2467
1210
1133
1167
275
549
1837
2776
468
2679
2658
998
2487
2687
1602
2349
247
166
264
588
451
2563
2756
160
2435
474
1274
2650
2862
2738
495
2811
2585
2825
169
2701
2814
371
863
319
1263
2806
2799
2583
2781
2831
2758
2702
1310
1982
236
1554
857
2716
2843
3358
2719
265

In [101]:
count

6550

In [102]:
count / (len(data.userId.unique())*30)

0.03572792232586047

In [103]:
len(data.userId.unique())

6111

In [104]:
# for movie in user_input:
# movies[(movies.movieId==14621)]['title']