In [5]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


In [6]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


In [7]:
# 檔案路徑
data_csv_path = "../Data/netflix_disney_data_drop/disney_netflix_data.csv"
movies_path = "../Data/netflix_disney_data_drop/disney_netflix_movie.csv"
model_path = "../recommender_models/recommender-v3.ckpt"

In [8]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [9]:
data.sort_values(by="timestamp", inplace=True)

In [10]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [11]:
random.sample(list(grp_by_train.groups), k=10)

[7156, 362, 3195, 21197, 4755, 5597, 11472, 14422, 20937, 15710]

In [12]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [13]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [14]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred)

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]]


### Senario 1: Adventure/Fantasy 

In [15]:
list_movies=["The Jungle Book",
            "The Many Adventures of Winnie the Pooh",
            "The Rescuers",
            "Cenerentola", 
            "Peter pan"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['The Lion King',
  'Brave',
  'Alice nel paese delle meraviglie',
  'Blood of Beasts',
  'Alive',
  'Miss Congeniality',
  'Lilo & Stitch',
  'Aladdin',
  'Pretty Woman',
  'Building the Alaska Highway: American Experience',
  'What Women Want',
  '8 Seconds',
  'Mondo Cane',
  "Mr. Bill's Classics",
  'Carnivale: Season 1',
  'The Nightmare Room: Scareful What You Wish For',
  'Project Grizzly',
  'Mulholland Falls',
  'Erin Brockovich',
  'Calendar Girls',
  'The Aristocats'],
 [1.0,
  0.9776399,
  0.9239454,
  0.8733405,
  0.87032396,
  0.8470483,
  0.8380406,
  0.8251927,
  0.8194395,
  0.81777275,
  0.812615,
  0.80621266,
  0.7777672,
  0.76344454,
  0.7612582,
  0.7577692,
  0.75224173,
  0.7491829,
  0.7486822,
  0.74061954,
  0.7404131,
  0.7388033,
  0.7315656,
  0.73098475,
  0.7296771,
  0.72678375,
  0.72636575,
  0.7257877,
  0.7248146,
  0.72469])

### Senario 2:  Action/Adventure

In [16]:
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['The Lion King',
  'Brave',
  'Blood of Beasts',
  'Alive',
  'Miss Congeniality',
  'Alice nel paese delle meraviglie',
  'Pretty Woman',
  'Building the Alaska Highway: American Experience',
  'The Nightmare Room: Scareful What You Wish For',
  'Cenerentola',
  'Carnivale: Season 1',
  '8 Seconds',
  'What Women Want',
  'Lilo & Stitch',
  'Mulholland Falls',
  'Outlaw Star: Vol. 3',
  'Sister Act',
  'Mondo Cane',
  'Project Grizzly',
  '8 Mile',
  'Beethoven: Symphony No. 9: Karajan',
  "Mr. Bill's Classics",
  'Metropolitan Opera: Puccini: La Boheme',
  'Aladdin'],
 [1.0,
  0.97936916,
  0.9047073,
  0.8807442,
  0.87377906,
  0.86950004,
  0.8687341,
  0.8596745,
  0.857658,
  0.8325094,
  0.8161482,
  0.81602925,
  0.80971724,
  0.8058413,
  0.80333745,
  0.80298126,
  0.80268896,
  0.80026317,
  0.7966689,
  0.7928927,
  0.7887156,
  0.78636754,
  0.7850207,
  0.7849662,
  0.7810287,
  0.7785607,
  0.77748674,
  0.7770872,
  0.7760158,
  0.77549934])

### Senario 3: Comedy

In [17]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

In [18]:
random_userId = random.choice(list(set(data.userId)))

In [19]:
user_input = list(data[(data.userId == random_userId) & (data.rating == 5)]['movieId'])
user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
count = sum(map(lambda x: x >= 0.95, test))
user_input

['Toto: 25th Anniversary: Live in Amsterdam',
 'The Eighteenth Angel',
 'Emma (Miniseries)',
 'The Cannonball Run II',
 'Close to Leo',
 'MTV Unplugged: Mana',
 'The Thin Blue Lie',
 'Airwolf: Season 1',
 "What's Happening!!: Season 2",
 'Sherlock Holmes and the Voice of Terror',
 'Circuit',
 'The French Connection II']

In [20]:
count = 0
for userId in data.userId.unique():
    user_input = list(data[(data.userId == userId) & (data.rating == 5)]['movieId'])
    user_input = [idx_to_movie[a] for a in user_input if a in idx_to_movie]
    if len(user_input) == 0:
        print(userId)
        continue
    elif len(user_input) >= 512:
        user_input = user_input[:511]
    top_movie, test = predict(user_input, model, movie_to_idx, idx_to_movie)
    count += sum(map(lambda x: x >= 0.95, test))
    

303
80
885
645
1574
2227
761
1077
2597
830
394
7548
2834
1182
973
836
1594
1509
1564
1055
407
2860
127
2579
997
1075
264
451
474
2830
817
260
2833
319
2854
2389
236
2318
1554
297
11580
1565
6979
2055
2460
234
6694
4078
3881
15699
3932
4018
17001
4005
4065
2148
4342
4522
222
4134
2763
45
571
4420
4161
1416
1100
727
1595
1347
1050
2454
4449
14531
4001
1065
359
800
3443
1562
1054
4003
14974
10996
27155
2395
2753
9018
8640
10459
504
1597
2346
1512
511
3174
3954
14691
4168
4047
17268
4178
2383
3960
16736
2894
3702
1338
220
12670
23670
11212
18094
2781
19320
24982
18297
17934
5112
14296
13676
19398
13725
24768
5031
21592
22864
12491
7049
19132
7786
21084
9580
25028
9270
7869
3263
5966
14194
1423
19032
19592
3060
15230
3010
347
11050
9005
56
19018
6340
27281
1726
15867
6984
6018
8832
3611
3501
25313
3290
8185
23463
10422
8439
17707
763
9872
12702
203
1578
12915
14998
27169
1343
2377
10896
21174
3937
16443
25771
9186
21738
20181
20203
2791
2853
17082
413
692
8249
7699
3784
262
19422
3024
5516


In [21]:
count

77364

In [22]:
count / (len(data.userId.unique())*30)

0.48310228550018736

In [23]:
len(data.userId.unique())

5338

In [24]:
# for movie in user_input:
# movies[(movies.movieId==14621)]['title']