### Import Packages

In [47]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [48]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [49]:
# 檔案路徑
data_csv_path = "./Data/disney_netflix/disney_netflix_data.csv"
movies_path = "./Data/disney_netflix/disney_netflix_movie.csv"
# 檢查是否是最新的check point
model_path = "./recommender_models/disney_netflix.ckpt" 
data = pd.read_csv(data_csv_path)
data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
data.sort_values(by="date", inplace=True)

In [50]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [51]:
random.sample(list(grp_by_train.groups), k=10) # Test

[220987, 64176, 540844, 498128, 36487, 87558, 173128, 623220, 59152, 627618]

### Model Loading

In [52]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [53]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
# movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
movie_to_idx = {}
for a, b in zip(movies.title.tolist(), movies.movieId.tolist()):
    if a in movie_to_idx:
        movie_to_idx[f'{a}_{b}'] = b
    else:
        movie_to_idx[a] = b
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [54]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [55]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
            "Discovering the Real World of Harry Potter",
            "Harry Potter and the Chamber of Secrets",
            "Harry Potter and the Prisoner of Azkaban"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Easy Riders',
  'Star Trek: Deep Space Nine: Season 6',
  'The Skulls III',
  'Spider-Man 2',
  'Face the Evil',
  'Hero_12690',
  'The Journey of Natty Gann',
  'Troy',
  "Thieves' Highway",
  'Kati Patang',
  "ECW: Heatwave '98",
  'The Collectors',
  'Memphis Belle',
  'David Copperfield: Illusion',
  'Tremors 3: Back to Perfection',
  'North Shore',
  'Seabiscuit: The Lost Documentary',
  'Monster-in-Law: Bonus Material',
  "Without You I'm Nothing",
  'The Firm Body Sculpting: Cardio Sculpt',
  "Heck's Way Home",
  'Kasoor',
  'House of Flying Daggers',
  "Cadfael: Monk's Hood",
  'Stir of Echoes',
  'Flight of the Innocent',
  'Legend of 1900',
  "Kiki's Delivery Service",
  'The Twilight Zone: Vol. 8',
  'The Knights Templar'],
 [4.995612,
  4.9203835,
  4.901217,
  4.9003778,
  4.899376,
  4.8736897,
  4.868419,
  4.8435683,
  4.8298893,
  4.8250055,
  4.8233895,
  4.8193016,
  4.813305,
  4.8065615,
  4.805253,
  4.7948794,
  4.786014,
  4.783117,
  4.7800174,
  4.7705135,


#### Senario 2:  Action/Adventure

In [56]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Aladdin',
  'Brave',
  'North Shore',
  'Tremors 3: Back to Perfection',
  'Godzilla vs. Destroyah / Godzilla vs. Space Godzilla (Double Feature)',
  'Lilo & Stitch',
  'Suburban Commando',
  "The Emperor's New Groove_17786",
  'Tangled_17789',
  'Men in Black',
  'Poor White Trash',
  'Twister_12470',
  'Heartwood',
  'La Dolce Vita',
  'The Little Mermaid_17780',
  'DC 9/11: Time of Crisis',
  'U2: Rattle and Hum',
  'Troy',
  'Memphis Belle',
  'Space Station: IMAX',
  'Top Cat: The Complete Series',
  'Dorothy L. Sayers Mysteries: Have His Carcase',
  'Millennium Actress',
  'Anna Karenina_15870',
  'Jurassic Park',
  'Forrest Gump',
  'Mulholland Falls',
  'Mrs. Winterbourne',
  'No Quarter: Page & Plant Unledded',
  'Peter pan'],
 [5.0,
  4.8567266,
  4.8395243,
  4.8107247,
  4.771454,
  4.724417,
  4.681031,
  4.664945,
  4.6480184,
  4.64154,
  4.6388106,
  4.6046085,
  4.5944595,
  4.583391,
  4.5824413,
  4.5814576,
  4.5784183,
  4.575936,
  4.571599,
  4.5631976,
  4.55

#### Senario 3: Comedy

In [57]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [58]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

1.0 1
0.5 2
0.6666666666666666 3
0.5 4
0.6 5
0.5 6
0.42857142857142855 7
0.375 8
0.3333333333333333 9
0.4 10


In [59]:
top_movie

['Suburban Commando',
 'To Wong Foo',
 'La Strada: Special Edition',
 'Watching You',
 'Sword of the Valiant',
 'Steamboy',
 'Guncrazy',
 'X-Men: Evolution: Season 2',
 'Song of the Thin Man',
 'Ghostbusters 2',
 'Orphans of the Storm',
 'Summer of the Monkeys',
 'Poor White Trash',
 'Operation Condor 2: Armour of the Gods',
 'Naach',
 'The Undefeated',
 'The People That Time Forgot',
 'Laguna Beach: Season 1',
 'The Deep End',
 'Ginger and Cinnamon',
 'The Journey of Natty Gann',
 'Bewitched: Season 2',
 'Chill Factor',
 'Friday Night Lights',
 'The Three Stooges: Three Stooges in History',
 'Homicidal',
 'The Crocodile Hunter: Croc Files',
 'Foolish Wives / The Man You Loved to Hate: Double Feature',
 'Little House on the Prairie: Season 2',
 'Another Thin Man']

In [60]:
len(final_eval[final_eval.answer >= 3.5])
len(data.userId.unique())
data.userId.unique()[:20]
data[data.userId == 371560]

Unnamed: 0,movieId,userId,rating,date,movieId_mapped
14066958,14869,371560,4,1999-12-30,14867
14066861,16182,371560,4,1999-12-30,16180
14066845,16068,371560,4,1999-12-30,16066
14066822,7165,371560,2,1999-12-30,7165
14066808,7208,371560,3,1999-12-30,7208
...,...,...,...,...,...
14066722,4865,371560,3,2005-10-18,4866
14066818,16360,371560,3,2005-10-18,16358
14066817,5522,371560,4,2005-11-14,5523
14066887,8524,371560,4,2005-11-16,8524


#### Test all userId

In [61]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    print(len(user_input))
    if len(user_input) >= 511:
        user_input = user_input.sample(n = 511) # Random choose 511 rows
    answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
    user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
    top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
    final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
    final_eval['answer'] = answer
    final_eval['eval'] = evaluation
    final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
    recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
    for idx, elm in final_eval.iterrows():
        if elm['answer'] >= 3.5: relevant_recommend += 1
        recommend_item += 1
        precision_score[recommend_item] += relevant_recommend / recommend_item
        if relevant_item>0:
            recall_score[recommend_item] += relevant_recommend / relevant_item
        if recommend_item == 10: break
    print(userId, 'is done!')

63
466 is done!
2
303 is done!
3
64 is done!
136
314 is done!
954
510180 is done!
361
122223 is done!
865
204439 is done!
140
261295 is done!
508
404067 is done!
563
355883 is done!
320
153083 is done!
40
640999 is done!
891
422071 is done!
317
371560 is done!
975
530721 is done!
143
369086 is done!
395
28966 is done!
210
228192 is done!
1590
472452 is done!
486
556780 is done!
1370
471064 is done!
720
211864 is done!
575
505751 is done!
483
1086 is done!
5
80 is done!
6
885 is done!
2
645 is done!
2
2564 is done!
15
101 is done!
2
649 is done!
2
1264 is done!
124
1485 is done!
2
1574 is done!
42
1561 is done!
2
2227 is done!
2
761 is done!
628
684 is done!
2
1077 is done!
2
2597 is done!
88
593765 is done!
1088
263571 is done!
1056
486399 is done!
1920
64765 is done!
386
589368 is done!
1471
423167 is done!
1050
535765 is done!
470
481816 is done!
1092
436424 is done!
766
633086 is done!
406
155728 is done!
96
591582 is done!
1131
13891 is done!
451
230300 is done!
244
314751 is done!

#### Precision Score

In [62]:
for i in range(1,11):
    print("k = ", i, precision_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.6382886813481127
k =  2 0.6345477636569414
k =  3 0.6280681140763333
k =  4 0.6246123009667637
k =  5 0.6216852272631183
k =  6 0.6193061879213764
k =  7 0.6174945601798906
k =  8 0.6163388683939972
k =  9 0.6154495964860976
k =  10 0.6145239185465088


#### Recall Score

In [63]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.019776300922584065
k =  2 0.039284888208615566
k =  3 0.05461735611005504
k =  4 0.0709573679822658
k =  5 0.08759166686403909
k =  6 0.10416511661985124
k =  7 0.12084666198944972
k =  8 0.13758060325553453
k =  9 0.154254228531029
k =  10 0.17109506182712292


#### F1 Score

In [64]:
for i in range(1,11):
    prec_score = precision_score[i] / len(data.userId.unique())
    reca_score = recall_score[i] / len(data.userId.unique())
    print("k = ", i, (prec_score*recall_score/(prec_score+recall_score))*2) # total value / total number of userId

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
count / (len(data.userId.unique())*30) #old

2.879844258022526e-06

In [None]:
len(data.userId.unique())

115747