### Import Packages

In [1]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

  from .autonotebook import tqdm as notebook_tqdm


### Check GPU is available

In [2]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [3]:
# 檔案路徑
# data_csv_path = "./netflix_prize_dataset/netflix_data_25M_drop.csv"
# movies_path = "./netflix_prize_dataset/netflix_movie2.csv"
data_csv_path = './disney_review_dataset/disney_data.csv'
movies_path = './disney_review_dataset/disney_movie.csv'
# 檢查是否是最新的check point
model_path = "./recommender_models/recommender-v5.ckpt" 
data = pd.read_csv(data_csv_path)
# data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
# data.sort_values(by="date", inplace=True) # Netflix movie
data.sort_values(by="date", inplace=True)

In [4]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [5]:
random.sample(list(grp_by_train.groups), k=10) # Test

[4087, 3250, 2207, 795, 2862, 2432, 5147, 1952, 1081, 2955]

### Model Loading

In [6]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [7]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
# movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
movie_to_idx = {}
for a, b in zip(movies.movie.tolist(), movies.movieId.tolist()):
    if a in movie_to_idx:
        movie_to_idx[f'{a}_{b}'] = b
    else:
        movie_to_idx[a] = b
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [8]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [9]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
# list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
#             "Discovering the Real World of Harry Potter",
#             "Harry Potter and the Chamber of Secrets",
#             "Harry Potter and the Prisoner of Azkaban"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

#### Senario 2:  Action/Adventure

In [10]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
# list_movies = ["Spider-Man: The Return of the Green Goblin",
#                "Spider-Man",
#                "Spider-Man: The Venom Saga",
#                "Spider-Man 2: Bonus Material",
#                "X-Men: Evolution: Season 2"
# ]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

#### Senario 3: Comedy

In [11]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [12]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

1.0 1


In [13]:
top_movie

['Spirited Away',
 'Frozen',
 'Brave',
 'Tangled',
 'Lilo & Stitch',
 'Mary Poppins',
 "The Emperor's New Groove",
 'Tron',
 'Aladdin',
 'The Little Mermaid',
 'Hercules',
 'The Aristocats',
 'Cenerentola',
 'The Jungle Book',
 'Oliver & Company',
 'The Rescuers',
 'La Spada nella Roccia',
 'The Many Adventures of Winnie the Pooh']

In [14]:
len(final_eval[final_eval.answer >= 3.5])
len(data.userId.unique())
data.userId.unique()[:20]
data[data.userId == 371560]

Unnamed: 0,rating,userId,movieId,date,movieId_mapped


#### Test all userId

In [15]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    print(len(user_input))
    if len(user_input) >= 511:
        user_input = user_input.sample(n = 511) # Random choose 511 rows
    answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
    user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
    top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
    final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
    final_eval['answer'] = answer
    final_eval['eval'] = evaluation
    final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
    recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
    for idx, elm in final_eval.iterrows():
        if elm['answer'] >= 3.5: relevant_recommend += 1
        recommend_item += 1
        precision_score[recommend_item] += relevant_recommend / recommend_item
        if relevant_item>0:
            recall_score[recommend_item] += relevant_recommend / relevant_item
        if recommend_item == 10: break
    print(userId, 'is done!')

1
1532 is done!
1
1000 is done!
1
2547 is done!
1
960 is done!
3
314 is done!
1
1250 is done!
1
1252 is done!
1
1334 is done!
1
2075 is done!
1
1906 is done!
1
2017 is done!
1
2074 is done!
1
1905 is done!
2
466 is done!
1
1389 is done!
1
591 is done!
1
944 is done!
1
688 is done!
2
303 is done!
1
646 is done!
3
64 is done!
1
2557 is done!
1
988 is done!
1
2073 is done!
1
2523 is done!
1
1449 is done!
1
552 is done!
1
1244 is done!
1
2014 is done!
1
464 is done!
1
2315 is done!
1
2016 is done!
1
2615 is done!
1
604 is done!
1
2546 is done!
1
565 is done!
1
943 is done!
1
2013 is done!
1
2129 is done!
1
718 is done!
1
2533 is done!
1
1230 is done!
1
598 is done!
1
182 is done!
2
1485 is done!
1
376 is done!
1
2371 is done!
2
649 is done!
1
1147 is done!
1
1032 is done!
1
1204 is done!
1
659 is done!
1
2468 is done!
1
2260 is done!
2
1574 is done!
1
1146 is done!
1
992 is done!
1
803 is done!
1
902 is done!
1
2196 is done!
1
553 is done!
1
1563 is done!
1
1145 is done!
1
2356 is done!
1


#### Precision Score

In [16]:
for i in range(1,11):
    print("k = ", i, precision_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.7319587628865979
k =  2 0.1002290950744559
k =  3 0.04729176894125349
k =  4 0.02953690067092129
k =  5 0.022713140238913427
k =  6 0.01745486281568755
k =  7 0.014119737241975825
k =  8 0.01102520045819015
k =  9 0.009163802978235965
k =  10 0.007609229258713791


#### Recall Score

In [17]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.674645866164536
k =  2 0.08654203180322913
k =  3 0.04001572321802474
k =  4 0.02369486642025554
k =  5 0.019480473496255056
k =  6 0.01534529341605781
k =  7 0.012731167531695991
k =  8 0.010327462783135774
k =  9 0.008810294224253622
k =  10 0.007819119926024543


#### F1 Score

In [18]:
for i in range(1,11):
    prec_score = precision_score[i] / len(data.userId.unique())
    reca_score = recall_score[i] / len(data.userId.unique())
    print("k = ", i, (prec_score*reca_score/(prec_score+reca_score))*2) # total value / total number of userId

k =  1 0.7021346914199578
k =  2 0.0928840520325499
k =  3 0.043350559949461805
k =  4 0.026295310267159634
k =  5 0.020972971370408287
k =  6 0.01633223875833409
k =  7 0.013389548087584935
k =  8 0.010664931687603369
k =  9 0.008983572253094053
k =  10 0.007712746892888923


In [19]:
len(data.userId.unique())
#data.userId.max()

6111