### Import Packages

In [94]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [95]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [96]:
# 檔案路徑
data_csv_path = "./netflix_prize_dataset/netflix_data_25M_drop.csv"
movies_path = "./netflix_prize_dataset/netflix_movie2.csv"
# 檢查是否是最新的check point
model_path = "./recommender_models/recommender.ckpt" 
data = pd.read_csv(data_csv_path)
data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
data.sort_values(by="date", inplace=True)

In [97]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [98]:
random.sample(list(grp_by_train.groups), k=10) # Test

[594552, 479046, 97025, 534940, 410626, 341961, 234923, 102512, 572591, 639555]

### Model Loading

In [99]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies),
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

RuntimeError: Error(s) in loading state_dict for Recommender:
	size mismatch for item_embeddings.weight: copying a param with shape torch.Size([17770, 128]) from checkpoint, the shape in current model is torch.Size([17769, 128]).
	size mismatch for linear_out.weight: copying a param with shape torch.Size([17770, 128]) from checkpoint, the shape in current model is torch.Size([17769, 128]).
	size mismatch for linear_out.bias: copying a param with shape torch.Size([17770]) from checkpoint, the shape in current model is torch.Size([17769]).

In [None]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [None]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [14]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
            "Discovering the Real World of Harry Potter",
            "Harry Potter and the Chamber of Secrets",
            "Harry Potter and the Prisoner of Azkaban"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

['Shrek 2',
 'Finding Nemo (Widescreen)',
 'Shrek (Full-screen)',
 'Spider-Man 2',
 'Shark Tale',
 'The Incredibles',
 "Harry Potter and the Sorcerer's Stone",
 'The Bourne Supremacy',
 'SpongeBob SquarePants: The Movie',
 'Garfield: The Movie',
 'Ice Age',
 'Collateral',
 'Dodgeball: A True Underdog Story',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'The Terminal',
 'The Day After Tomorrow',
 'Troy',
 'Elf',
 'The Manchurian Candidate',
 'The Princess Diaries 2: Royal Engagement',
 'Napoleon Dynamite',
 'The Notebook',
 'Lord of the Rings: The Return of the King',
 'Daddy Day Care',
 'Anchorman: The Legend of Ron Burgundy',
 'Aladdin: Platinum Edition',
 'The Chronicles of Riddick',
 "A Bug's Life"]

#### Senario 2:  Action/Adventure

In [15]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

['Daredevil vs. Spiderman',
 'Scooby-Doo Meets Batman',
 'Pokemon: Mewtwo Returns',
 'Scooby-Doo',
 'Ice Age',
 'SpongeBob SquarePants: Halloween',
 'SpongeBob SquarePants: Nautical Nonsense / Sponge Buddies',
 'Jimmy Neutron: Boy Genius',
 'Scooby-Doo and the Legend of the Vampire',
 'Stuart Little 2',
 'Scooby-Doo and the Reluctant Werewolf',
 'SpongeBob SquarePants: Tales From the Deep',
 'Scooby-Doo and the Ghoul School',
 'SpongeBob SquarePants: Sea Stories',
 'Scooby-Doo and the Alien Invaders',
 'Schoolhouse Rock!: Special 30th Anniversary Edition',
 'Justice League',
 "Scooby-Doo's Spookiest Tales",
 'Harry Potter and the Chamber of Secrets',
 'Pokemon 3: The Movie',
 'Justice League: Justice on Trial',
 'Lord of the Rings: The Fellowship of the Ring',
 'Spirit: Stallion of the Cimarron',
 'Scooby-Doo and the Cyber Chase',
 'Scooby-Doo Meets the Harlem Globetrotters',
 'Transformers: The Movie',
 'Snow Dogs',
 'Scooby-Doo Goes Hollywood',
 'My Big Fat Greek Wedding']

#### Senario 3: Comedy

In [14]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

['Moana (2016)',
 'Guardians of the Galaxy 2 (2017)',
 'Nightmare Before Christmas, The (1993)',
 'Inside Out (2015)',
 'Up (2009)',
 'Breakfast Club, The (1985)',
 'Toy Story (1995)',
 "Ferris Bueller's Day Off (1986)",
 'Coco (2017)',
 'Finding Dory (2016)',
 "Bug's Life, A (1998)",
 'Pitch Perfect (2012)',
 'Clueless (1995)',
 'Untitled Spider-Man Reboot (2017)',
 'Deadpool (2016)',
 'Big Hero 6 (2014)',
 'Toy Story 2 (1999)',
 'Thor: Ragnarok (2017)',
 'Groundhog Day (1993)',
 'Forrest Gump (1994)',
 'Easy A (2010)',
 'Back to the Future Part II (1989)',
 'Scott Pilgrim vs. the World (2010)',
 'Austin Powers: International Man of Mystery (1997)',
 'Wonder Woman (2017)',
 'How to Train Your Dragon (2010)',
 '21 Jump Street (2012)',
 'Beauty and the Beast (1991)',
 'Monsters, Inc. (2001)']

### Evaluation metrics

#### Test Random userId

In [None]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

1.0 1
1.0 2
0.6666666666666666 3
0.5 4
0.4 5
0.5 6
0.5714285714285714 7
0.625 8
0.5555555555555556 9
0.5 10


In [72]:
top_movie

['Godzilla vs. Destroyah / Godzilla vs. Space Godzilla (Double Feature)',
 'Heartwood',
 'La Dolce Vita',
 'Suburbia',
 'Tommy Boy',
 'Dorothy L. Sayers Mysteries: Have His Carcase',
 'Blue',
 'Deterrence',
 'Jurassic Park',
 'Master and Commander: The Far Side of the World',
 'Alabama Love Story',
 'North Shore',
 'Seems Like Old Times',
 'Essence of Echoes',
 'Hidalgo',
 'Fearless Hyena 1 / Fearless Hyena 2',
 'Carnivale: Season 1',
 'The Onion Field',
 'The Man Who Shot Liberty Valance',
 "W.C. Fields: You Can't Cheat an Honest Man",
 'Blast',
 'Pokemon Heroes: The Movie',
 "The General's Daughter",
 'The Mean Season',
 'Mulholland Falls',
 'Big Fish',
 'Hote Hote Pyaar Ho Gaya',
 'Cries and Whispers',
 'The Broadway Melody of 1929',
 'Frasier: Season 5']

In [89]:
len(final_eval[final_eval.answer >= 3.5])

129

#### Test all userId

In [101]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    if len(user_input) >= 511:
        user_input = user_input.sample(n = 511) # Random choose 511 rows
    answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
    user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
    top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
    final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
    final_eval['answer'] = answer
    final_eval['eval'] = evaluation
    final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
    recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
    for idx, elm in final_eval.iterrows():
        if elm['answer'] >= 3.5: relevant_recommend += 1
        recommend_item += 1
        precision_score[recommend_item] += relevant_recommend / recommend_item
        if relevant_item>0:
            recall_score[recommend_item] += relevant_recommend / relevant_item
        if recommend_item == 10: break
    print(userId, 'is done!')

510180 is done!
122223 is done!
204439 is done!
404067 is done!
261295 is done!
355883 is done!
640999 is done!
153083 is done!
228192 is done!
369086 is done!


KeyboardInterrupt: 

#### Precision Score

In [1]:
for i in range(1,11):
    print("k = ", i, precision_score[i] / 119857) # total value / total number of userId

NameError: name 'precision_score' is not defined

#### Recall Score

In [105]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / 119857) # total value / total number of userId

0.0038190464815749912
0.009139063970878266
0.01456258723925421
0.02122620889451914
0.03074889552713169
0.033818580911227414
0.04280284570536817
0.047339235701926385
0.05583088965862872
0.06502651499753216


#### F1 Score

In [None]:
for i in range(1,11):
    prec_score = precision_score[i] / 119857
    reca_score = recall_score[i] / 119857
    print("k = ", i, (prec_score*recall_score/(prec_score+recall_score))*2) # total value / total number of userId

In [None]:
count / (len(data.userId.unique())*30) #old

0.37716361998047676

In [155]:
len(data.userId.unique())

119857