In [21]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


In [22]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


In [23]:
# 檔案路徑
data_csv_path = "../Data/netflix_disney_data/disney_netflix_data.csv"
movies_path = "../Data/netflix_disney_data/disney_netflix_movie.csv"
model_path = "../recommender_models/recommender-v3.ckpt"

In [24]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [25]:
data.sort_values(by="timestamp", inplace=True)

In [26]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [27]:
random.sample(list(grp_by_train.groups), k=10)

[10200, 7953, 24612, 24679, 16770, 27316, 26101, 21368, 1768, 22321]

In [28]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [29]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [30]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred)

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]]


### Senario 1: Adventure/Fantasy 

In [31]:
list_movies=["The Jungle Book",
            "The Many Adventures of Winnie the Pooh",
            "The Rescuers",
            "Cenerentola", 
            "Peter pan"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['The Lion King',
  'Brave',
  'Alice nel paese delle meraviglie',
  "That's the Way I Like It",
  'A Decade Under the Influence',
  'Fudoh: The New Generation',
  'Lilo & Stitch',
  'Aladdin',
  'Iron Chef USA',
  'Passion of Mind',
  "The Hitchhiker's Guide to the Galaxy",
  "Fat Albert's Greatest Hits: The Ultimate Collection",
  'Muscle Beach Party / Ski Party: Double Feature',
  'Sakura Wars',
  'El Dorado',
  'Rakht',
  'Bringing Down the House',
  "Violet's Visit",
  'Peacemaker',
  'Hour of the Gun',
  'Strangeland'],
 [1.0,
  0.9780774,
  0.9263424,
  0.87282556,
  0.8717331,
  0.8478127,
  0.83371997,
  0.8198264,
  0.81590134,
  0.812328,
  0.8109041,
  0.80554277,
  0.77557266,
  0.7628181,
  0.7540333,
  0.7531365,
  0.74983454,
  0.74476,
  0.7423789,
  0.73696965,
  0.7362009,
  0.7310387,
  0.72501147,
  0.72376364,
  0.72338134,
  0.72260255,
  0.722317,
  0.7222721,
  0.72129035,
  0.72127366])

### Senario 2:  Action/Adventure

In [32]:
# list_movies = ["Spider-Man: The Return of the Green Goblin",
#                "Spider-Man",
#                "Spider-Man: The Venom Saga",
#                "Spider-Man 2: Bonus Material",
#                "X-Men: Evolution: Season 2"
# ]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Senario 3: Comedy

In [33]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

In [34]:
random_userId = random.choice(list(set(data.userId)))

In [35]:
user_input = list(data[(data.userId == random_userId) & (data.rating == 5)]['movieId'])
user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
count = sum(map(lambda x: x >= 0.95, test))
user_input

['Liberty! The American Revolution',
 'Scooby-Doo Goes Hollywood',
 'Inu-Yasha: The Movie 2: The Castle Beyond the Looking Glass',
 'Le Mans',
 'The Will Smith Music Video Collection',
 'Reborn from Hell',
 'Our Mutual Friend',
 'Man on Fire',
 'The Guess Who: Running Back Thru Canada',
 'ER: Season 1',
 'The Falcon and the Snowman',
 'Dragon Family',
 'Operation Dumbo Drop',
 'Scooby-Doo and the Reluctant Werewolf',
 'Kickboxer',
 'Frankie and Hazel',
 'Read-Along: Beauty and the Beast',
 'The Jamie Kennedy Experiment: Season 2',
 'The Amityville Horror',
 "Horatio's Drive: America's First Road Trip",
 'Rockets Redglare',
 'The Late Show',
 'Blue Juice',
 'Restoration',
 'White Fang',
 'Street Smart',
 'The Real Bruce Lee',
 'Ginger Snaps',
 'Simply Irresistible',
 'Bollywood and Vine',
 'Deadline Auto Theft / Gone in 60 Seconds 2',
 'Mystery Science Theater 3000: Beginning of the End',
 'Assault of the Killer Bimbos',
 'Manic',
 'Mermaid Forest',
 'Men With Brooms',
 'Scorpio',
 'Sup

In [36]:
count = 0
for userId in data.userId.unique():
    user_input = list(data[(data.userId == userId) & (data.rating == 5)]['movieId'])
    user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
    if len(user_input) == 0:
        print(userId)
        continue
    elif len(user_input) >= 512:
        user_input = user_input[:511]
    top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
    count += sum(map(lambda x: x >= 0.95, test))
    

303
885
645
2227
761
2597
830
394
7548
2834
1182
973
836
1594
1491
1564
232
407
2860
127
2579
997
1075
264
451
474
2830
817
260
2833
319
2854
2389
236
1554
297
11580
991
1565
6979
2055
234
6694
4078
3881
15699
3932
4018
17001
4005
4065
4342
4522
222
4134
2763
45
571
4420
4161
1416
1100
727
1595
1347
1050
4449
14531
4001
1065
359
800
3443
1562
1552
1054
4003
14974
10996
27155
2395
2753
6190
9018
8640
10459
504
2346
1512
511
3174
3954
14691
4168
4047
17268
4178
2383
3960
16736
2894
3702
1338
220
12670
23670
11212
18094
2781
19320
24982
3941
18297
17934
5112
14296
13676
19398
13725
24768
5031
21592
22864
2913
12491
7049
13894
19132
7786
21084
9580
25028
9270
7869
3263
5966
14194
1423
19032
19592
3060
15230
3010
347
11050
9005
56
19018
6340
27281
1726
15867
6984
6018
8832
3611
3501
25313
3290
8185
23463
10422
8439
17707
25084
763
9872
12702
203
12915
14998
27169
1343
23811
10896
323
21174
3937
16443
25771
9186
20181
20203
2791
2853
17082
413
692
8249
7699
3784
262
19422
3024
5516
20522
171

In [37]:
count

76695

In [38]:
count / (len(data.userId.unique())*30)

0.47892469089546646

In [39]:
len(data.userId.unique())

5338

In [40]:
# for movie in user_input:
# movies[(movies.movieId==14621)]['title']